#include #include #include #include #define MAX_LEN_TUPLE 12 #define MAX_LEN_WORD 12 #define MAX_DIST 0 #define NUM_STRAND 1 #define BIAS 1 #define MAX_EVALUE 1.0 #define NUM_NUCLEOTIDE 4 #define min(a,b) ((a < b) ? a : b) #define max(a,b) ((a > b) ? a : b) #define nucleotide(c) ((c=='a')?0:((c=='c')?1:((c=='g')?2:((c=='t')?3:-1)))) typedef struct dist_node_type dist_node; struct dist_node_type { int sequence_num, num_dist; dist_node *next [NUM_NUCLEOTIDE]; }; typedef struct motif_node_type motif_node; struct motif_node_type { int len_tuple, dist; long double evalue; char *str; motif_node *next; }; char nucleotide_char [] = { /* 0 */ 'a', /* 1 */ 'c', /* 2 */ 'g', /* 3 */ 't' }; char *fmakeword(FILE *f, char stop, int *cl) { int wsize; char *word; int ll; wsize = 102400; ll=0; word = (char *) malloc(sizeof(char) * (wsize + 2)); while(1) { word[ll] = (char)fgetc(f); if(ll==wsize) { word[ll+1] = '\0'; wsize+=102400; word = (char *)realloc(word,sizeof(char)*(wsize+2)); } if (cl != NULL) --(*cl); if((word[ll] == stop) || (feof(f)) || ((cl != NULL) && (!(*cl)))) { if((word[ll] != stop) && (!feof(f)) && ((cl == NULL) || (!(*cl)))) ll++; word[ll] = '\0'; return word; } ++ll; } } char complement (char ch) { if (ch == 'a') return 't'; if (ch == 'A') return 'T'; if (ch == 'c') return 'g'; if (ch == 'C') return 'G'; if (ch == 'g') return 'c'; if (ch == 'G') return 'C'; if (ch == 't') return 'a'; if (ch == 'T') return 'A'; return ch; } char **read_fasta (char *str, char ***sequence_name, int *num_sequence) { int i, n, start, end, len_sequence; char **sequence; n = strlen (str); (*num_sequence) = 0; i = 0; while (i < n) { while ((i < n) && (str [i] != '>')) i++; while ((i < n) && (str [i] != '\n')) i++; while ( (i < n) && (str [i] != '>') && (!islower (str [i])) && (!isupper (str [i])) ) i++; if ((i < n) && (str [i] != '>')) (*num_sequence)++; } sequence = (char **) malloc ((*num_sequence) * sizeof (char *)); if (errno == ENOMEM) { printf ("Error: Not enough memory for sequence\n"); exit (-1); } (*sequence_name) = (char **) malloc ((*num_sequence) * sizeof (char *)); if (errno == ENOMEM) { printf ("Error: Not enough memory for sequence_name\n"); exit (-1); } (*num_sequence) = 0; i = 0; while ((i < n) && (str [i] != '>')) i++; while (i < n) { start = i + 1; while ((i < n) && (str [i] != '\n')) i++; end = i - 1; while ( (i < n) && (str [i] != '>') && (!islower (str [i])) && (!isupper (str [i])) ) i++; if ((i < n) && (str [i] != '>')) { (*sequence_name) [*num_sequence] = (char *) malloc ((end - start + 2) * sizeof (char)); if (errno == ENOMEM) { printf ("Error: Not enough memory for sequence_name\n"); exit (-1); } strncpy ((*sequence_name) [*num_sequence], &str [start], end - start + 1); (*sequence_name) [*num_sequence][end - start + 1] = '\0'; start = i; len_sequence = 0; while ((i < n) && (str [i] != '>')) { if ((islower (str [i])) || (isupper (str [i]))) len_sequence++; i++; } sequence [*num_sequence] = (char *) malloc ((len_sequence + 1) * sizeof (char)); if (errno == ENOMEM) { printf ("Error: Not enough memory for sequence\n"); exit (-1); } i = start; len_sequence = 0; while ((i < n) && (str [i] != '>')) { if ((islower (str [i])) || (isupper (str [i]))) { sequence [*num_sequence][len_sequence] = str [i]; len_sequence++; } i++; } sequence [*num_sequence][len_sequence] = '\0'; (*num_sequence)++; } } return sequence; } void compute_motif0 (long double max_evalue, int len_tuple, int len_word, int len_word2, long double choose, long double ***prob, int *pos, dist_node *dist, char *str, int num_at, motif_node **motif) { int i; motif_node *current_motif; if (len_word == len_word2) { if (choose * prob [num_at][0][dist -> num_dist] < max_evalue) { current_motif = (motif_node *) malloc (sizeof (motif_node)); if (errno == ENOMEM) { printf ("Error: Not enough memory for current_motif\n"); exit (-1); } current_motif -> len_tuple = len_tuple; current_motif -> dist = 0; current_motif -> evalue = choose * prob [num_at][0][dist -> num_dist]; current_motif -> str = (char *) malloc (len_tuple * sizeof (char)); if (errno == ENOMEM) { printf ("Error: Not enough memory for str\n"); exit (-1); } strncpy (current_motif -> str, str, len_tuple); current_motif -> next = (*motif); (*motif) = current_motif; } } else { for (i = 0; i < NUM_NUCLEOTIDE; i++) { if (dist -> next [i] != NULL) { str [pos [len_word2]] = nucleotide_char [i]; if ((nucleotide_char [i] == 'a') || (nucleotide_char [i] == 't')) compute_motif0 (max_evalue, len_tuple, len_word, len_word2 + 1, choose, prob, pos, dist -> next [i], str, num_at + 1, motif); else compute_motif0 (max_evalue, len_tuple, len_word, len_word2 + 1, choose, prob, pos, dist -> next [i], str, num_at, motif); } } } free (dist); } void compute_motif1 (int max_dist, long double max_evalue, int num_sequence, int len_tuple, int len_word, int len_word2, long double choose, long double ***prob, int *pos, int index, int ***num_dist, int num_at, int *index2, motif_node **motif) { int i, total_dist, dist, index3; long double pvalue; motif_node *current_motif; if (len_word == len_word2) { pvalue = 1.0; total_dist = 0; for (i = 0; i <= min (max_dist, len_word - 1); i++) { if (total_dist == num_sequence) break; total_dist += num_dist [i][(*index2) / index][(*index2) % index]; if (pvalue > prob [num_at][i][total_dist]) { dist = i; pvalue = prob [num_at][i][total_dist]; } } if (choose * pvalue < max_evalue) { current_motif = (motif_node *) malloc (sizeof (motif_node)); if (errno == ENOMEM) { printf ("Error: Not enough memory for current_motif\n"); exit (-1); } current_motif -> len_tuple = len_tuple; current_motif -> dist = dist; current_motif -> evalue = choose * pvalue; current_motif -> str = (char *) malloc (len_tuple * sizeof (char)); if (errno == ENOMEM) { printf ("Error: Not enough memory for str\n"); exit (-1); } for (i = 0; i < len_tuple; i++) current_motif -> str [i] = ' '; index3 = (*index2); for (i = 0; i < len_word; i++) { current_motif -> str [pos [len_word - i - 1]] = nucleotide_char [index3 % NUM_NUCLEOTIDE]; index3 /= NUM_NUCLEOTIDE; } current_motif -> next = (*motif); (*motif) = current_motif; } (*index2)++; } else { for (i = 0; i < NUM_NUCLEOTIDE; i++) { if ((nucleotide_char [i] == 'a') || (nucleotide_char [i] == 't')) compute_motif1 (max_dist, max_evalue, num_sequence, len_tuple, len_word, len_word2 + 1, choose, prob, pos, index, num_dist, num_at + 1, index2, motif); else compute_motif1 (max_dist, max_evalue, num_sequence, len_tuple, len_word, len_word2 + 1, choose, prob, pos, index, num_dist, num_at, index2, motif); } } } void compute_motif2 (int max_dist, int num_strand, long double max_evalue, int num_sequence, char **sequence, int len_tuple, int len_word, int len_word2, long double choose, long double ***prob, int *pos, int pos2, motif_node **motif) { int i, j, k, l, p, q, len_sequence, found, index, index2, index3, index4, **dist2, ***num_dist; char ch, *str; dist_node *dist, *current_dist; if (len_word <= len_word2 + 1) { if (max_dist == 0) { dist = (dist_node *) malloc (sizeof (dist_node)); if (errno == ENOMEM) { printf ("Error: Not enough memory for dist\n"); exit (-1); } for (i = 0; i < NUM_NUCLEOTIDE; i++) dist -> next [i] = NULL; for (i = 0; i < num_sequence; i++) { len_sequence = strlen (sequence [i]); for (j = 0; j < len_sequence - len_tuple + 1; j++) { for (k = 0; k < num_strand; k++) { found = 0; for (l = 0; l < len_word; l++) { if (k == 0) ch = sequence [i][j + pos [l]]; else ch = complement (sequence [i][j + len_tuple - pos [l] - 1]); if (nucleotide (tolower (ch)) < 0) { found = 1; break; } } if (!found) { current_dist = dist; for (l = 0; l < len_word; l++) { if (k == 0) ch = sequence [i][j + pos [l]]; else ch = complement (sequence [i][j + len_tuple - pos [l] - 1]); if (current_dist -> next [nucleotide (tolower (ch))] == NULL) { current_dist -> next [nucleotide (tolower (ch))] = (dist_node *) malloc (sizeof (dist_node)); if (errno == ENOMEM) { printf ("Error: Not enough memory for next\n"); exit (-1); } if (l < len_word - 1) { for (p = 0; p < NUM_NUCLEOTIDE; p++) current_dist -> next [nucleotide (tolower (ch))] -> next [p] = NULL; } else { current_dist -> next [nucleotide (tolower (ch))] -> sequence_num = -1; current_dist -> next [nucleotide (tolower (ch))] -> num_dist = 0; } } current_dist = current_dist -> next [nucleotide (tolower (ch))]; } if (i > current_dist -> sequence_num) { current_dist -> sequence_num = i; current_dist -> num_dist++; } } } } } str = (char *) malloc (len_tuple * sizeof (char)); if (errno == ENOMEM) { printf ("Error: Not enough memory for str\n"); exit (-1); } for (i = 0; i < len_tuple; i++) str [i] = ' '; compute_motif0 (max_evalue, len_tuple, len_word, 0, choose, prob, pos, dist, str, 0, motif); free (str); } else { index = 1; for (i = 0; i < len_word - 10; i++) index *= NUM_NUCLEOTIDE; index2 = 1; for (i = 0; i < min (len_word, 10); i++) index2 *= NUM_NUCLEOTIDE; dist2 = (int **) malloc (index * sizeof (int *)); if (errno == ENOMEM) { printf ("Error: Not enough memory for dist2\n"); exit (-1); } for (i = 0; i < index; i++) { dist2 [i] = (int *) malloc (index2 * sizeof (int)); if (errno == ENOMEM) { printf ("Error: Not enough memory for dist2\n"); exit (-1); } } num_dist = (int ***) malloc ((min (max_dist, len_word - 1) + 1) * sizeof (int **)); if (errno == ENOMEM) { printf ("Error: Not enough memory for num_dist\n"); exit (-1); } for (i = 0; i <= min (max_dist, len_word - 1); i++) { num_dist [i] = (int **) malloc (index * sizeof (int *)); if (errno == ENOMEM) { printf ("Error: Not enough memory for num_dist\n"); exit (-1); } for (j = 0; j < index; j++) { num_dist [i][j] = (int *) malloc (index2 * sizeof (int)); if (errno == ENOMEM) { printf ("Error: Not enough memory for num_dist\n"); exit (-1); } for (k = 0; k < index2; k++) num_dist [i][j][k] = 0; } } for (i = 0; i < num_sequence; i++) { for (j = 0; j < index; j++) { for (k = 0; k < index2; k++) dist2 [j][k] = min (max_dist, len_word - 1) + 1; } len_sequence = strlen (sequence [i]); for (j = 0; j < len_sequence - len_tuple + 1; j++) { for (k = 0; k < num_strand; k++) { index3 = 0; for (l = 0; l < len_word; l++) { if (k == 0) ch = sequence [i][j + pos [l]]; else ch = complement (sequence [i][j + len_tuple - pos [l] - 1]); if (nucleotide (tolower (ch)) < 0) { index3 = -1; break; } index3 = NUM_NUCLEOTIDE * index3 + nucleotide (tolower (ch)); } if (index3 >= 0) dist2 [index3 / index2][index3 % index2] = 0; } } for (j = 0; j < min (max_dist, len_word - 1); j++) { for (k = 0; k < index; k++) { for (l = 0; l < index2; l++) { if (dist2 [k][l] == j) { index3 = 1; for (p = 0; p < len_word; p++) { for (q = 0; q < NUM_NUCLEOTIDE; q++) { index4 = ((k * index2 + l) & (~((NUM_NUCLEOTIDE - 1) * index3))) + q * index3; dist2 [index4 / index2][index4 % index2] = min (j + 1, dist2 [index4 / index2][index4 % index2]); } index3 *= NUM_NUCLEOTIDE; } } } } } for (j = 0; j < index; j++) { for (k = 0; k < index2; k++) { if (dist2 [j][k] <= min (max_dist, len_word - 1)) num_dist [dist2 [j][k]][j][k]++; } } } for (i = 0; i < index; i++) free (dist2 [i]); free (dist2); index3 = 0; compute_motif1 (max_dist, max_evalue, num_sequence, len_tuple, len_word, 0, choose, prob, pos, index2, num_dist, 0, &index3, motif); for (i = 0; i <= min (max_dist, len_word - 1); i++) { for (j = 0; j < index; j++) free (num_dist [i][j]); free (num_dist [i]); } free (num_dist); } } else { for (i = pos2; i <= len_tuple - len_word + len_word2; i++) { pos [len_word2] = i; compute_motif2 (max_dist, num_strand, max_evalue, num_sequence, sequence, len_tuple, len_word, len_word2 + 1, choose, prob, pos, i + 1, motif); } } } void compute_motif (int max_dist, int num_strand, int bias, long double max_evalue, int num_sequence, char **sequence, int len_tuple, int len_word, motif_node **motif) { int i, j, k, num_tuple, total_at, total_cg, len_sequence, *pos; long double **choose, *choose2, choose3, *prob, *prob2, *prob3, *prob4, *prob5, *prob6, ***prob7, prob8, prob9; num_tuple = 0; for (i = 0; i < num_sequence; i++) num_tuple += max (0, (int) (strlen (sequence [i])) - len_tuple + 1) * num_strand; if (bias) { total_at = 0; total_cg = 0; for (i = 0; i < num_sequence; i++) { len_sequence = strlen (sequence [i]); for (j = 0; j < len_sequence; j++) { if (nucleotide (tolower (sequence [i][j])) >= 0) { if ( (tolower (sequence [i][j]) == 'a') || (tolower (sequence [i][j]) == 't') ) total_at++; else total_cg++; } } } } else { total_at = 1; total_cg = 1; } choose = (long double **) malloc ((len_word + 1) * sizeof (long double *)); if (errno == ENOMEM) { printf ("Error: Not enough memory for choose\n"); exit (-1); } for (i = 0; i <= len_word; i++) { choose [i] = (long double *) malloc ((min (max_dist, i) + 1) * sizeof (long double)); if (errno == ENOMEM) { printf ("Error: Not enough memory for choose\n"); exit (-1); } choose [i][0] = 1.0; for (j = 0; j < min (max_dist, i); j++) choose [i][j + 1] = 1.0 * (i - j) / (j + 1) * choose [i][j]; } choose2 = (long double *) malloc ((num_sequence + 1) * sizeof (long double)); if (errno == ENOMEM) { printf ("Error: Not enough memory for choose2\n"); exit (-1); } choose2 [0] = 1.0; for (i = 0; i < num_sequence; i++) choose2 [i + 1] = 1.0 * (num_sequence - i) / (i + 1) * choose2 [i]; choose3 = 1.0; for (i = 0; i < len_word; i++) { if (i < 2) choose3 *= 1.0 * NUM_NUCLEOTIDE; else choose3 *= 1.0 * (len_tuple - i) / (i - 1) * NUM_NUCLEOTIDE; } prob = (long double *) malloc ((min (max_dist, len_word - 1) + 1) * sizeof (long double)); if (errno == ENOMEM) { printf ("Error: Not enough memory for prob\n"); exit (-1); } prob [0] = 1.0; for (i = 0; i < min (max_dist, len_word - 1); i++) prob [i + 1] = (1.0 - 0.5 * total_at / (total_at + total_cg)) * prob [i]; prob2 = (long double *) malloc ((len_word + 1) * sizeof (long double)); if (errno == ENOMEM) { printf ("Error: Not enough memory for prob2\n"); exit (-1); } prob2 [0] = 1.0; for (i = 0; i < len_word; i++) prob2 [i + 1] = 0.5 * total_at / (total_at + total_cg) * prob2 [i]; prob3 = (long double *) malloc ((min (max_dist, len_word - 1) + 1) * sizeof (long double)); if (errno == ENOMEM) { printf ("Error: Not enough memory for prob3\n"); exit (-1); } prob3 [0] = 1.0; for (i = 0; i < min (max_dist, len_word - 1); i++) prob3 [i + 1] = (1.0 - 0.5 * total_cg / (total_at + total_cg)) * prob3 [i]; prob4 = (long double *) malloc ((len_word + 1) * sizeof (long double)); if (errno == ENOMEM) { printf ("Error: Not enough memory for prob4\n"); exit (-1); } prob4 [0] = 1.0; for (i = 0; i < len_word; i++) prob4 [i + 1] = 0.5 * total_cg / (total_at + total_cg) * prob4 [i]; prob5 = (long double *) malloc ((num_sequence + 1) * sizeof (long double)); if (errno == ENOMEM) { printf ("Error: Not enough memory for prob5\n"); exit (-1); } prob6 = (long double *) malloc ((num_sequence + 1) * sizeof (long double)); if (errno == ENOMEM) { printf ("Error: Not enough memory for prob6\n"); exit (-1); } prob7 = (long double ***) malloc ((len_word + 1) * sizeof (long double **)); if (errno == ENOMEM) { printf ("Error: Not enough memory for prob7\n"); exit (-1); } for (i = 0; i <= len_word; i++) { prob7 [i] = (long double **) malloc ((min (max_dist, len_word - 1) + 1) * sizeof (long double *)); if (errno == ENOMEM) { printf ("Error: Not enough memory for prob7\n"); exit (-1); } prob8 = 0.0; for (j = 0; j <= min (max_dist, len_word - 1); j++) { prob7 [i][j] = (long double *) malloc ((num_sequence + 2) * sizeof (long double)); if (errno == ENOMEM) { printf ("Error: Not enough memory for prob7\n"); exit (-1); } for (k = max (0, i + j - len_word); k <= min (i, j); k++) prob8 += choose [i][k] * choose [len_word - i][j - k] * prob [k] * prob2 [i - k] * prob3 [j - k] * prob4 [len_word - i - j + k]; if (num_sequence > 0) { prob9 = 1.0; for (k = 0; k < num_tuple / num_sequence; k++) prob9 *= 1.0 - prob8; } prob5 [0] = 1.0; for (k = 0; k < num_sequence; k++) prob5 [k + 1] = (1.0 - prob9) * prob5 [k]; prob6 [0] = 1.0; for (k = 0; k < num_sequence; k++) prob6 [k + 1] = prob9 * prob6 [k]; prob7 [i][j][num_sequence + 1] = 0.0; for (k = num_sequence; k >= 0; k--) prob7 [i][j][k] = choose2 [k] * prob5 [k] * prob6 [num_sequence - k] + prob7 [i][j][k + 1]; } } for (i = 0; i <= len_word; i++) free (choose [i]); free (choose); free (choose2); free (prob); free (prob2); free (prob3); free (prob4); free (prob5); free (prob6); pos = (int *) malloc (len_word * sizeof (int)); if (errno == ENOMEM) { printf ("Error: Not enough memory for pos\n"); exit (-1); } pos [0] = 0; pos [len_word - 1] = len_tuple - 1; compute_motif2 (max_dist, num_strand, max_evalue, num_sequence, sequence, len_tuple, len_word, 1, choose3, prob7, pos, 1, motif); for (i = 0; i <= len_word; i++) { for (j = 0; j <= min (max_dist, len_word - 1); j++) free (prob7 [i][j]); free (prob7 [i]); } free (prob7); free (pos); } int motif_compare (const void *motif, const void *motif2) { if ((*((motif_node **) motif)) -> evalue < (*((motif_node **) motif2)) -> evalue) return -1; if ((*((motif_node **) motif)) -> evalue > (*((motif_node **) motif2)) -> evalue) return 1; return 0; } void print_motif (int max_len_tuple, int num_strand, int num_sequence, char **sequence, char **sequence_name, motif_node *motif) { int i, j, k, l, m, num_motif, **freq, **paint, len_sequence, total_freq, found, dist, max_dist; char ch, *str; motif_node *current_motif, **motif2; num_motif = 0; current_motif = motif; while (current_motif != NULL) { num_motif++; current_motif = current_motif -> next; } motif2 = (motif_node **) malloc (num_motif * sizeof (motif_node *)); if (errno == ENOMEM) { printf ("Error: Not enough memory for motif2\n"); exit (-1); } num_motif = 0; current_motif = motif; while (current_motif != NULL) { motif2 [num_motif] = current_motif; num_motif++; current_motif = current_motif -> next; } qsort (motif2, num_motif, sizeof (motif_node *), motif_compare); freq = (int **) malloc (max_len_tuple * sizeof (int *)); if (errno == ENOMEM) { printf ("Error: Not enough memory for freq\n"); exit (-1); } for (i = 0; i < max_len_tuple; i++) { freq [i] = (int *) malloc (NUM_NUCLEOTIDE * sizeof (int)); if (errno == ENOMEM) { printf ("Error: Not enough memory for freq\n"); exit (-1); } } str = (char *) malloc (max_len_tuple * sizeof (char)); if (errno == ENOMEM) { printf ("Error: Not enough memory for str\n"); exit (-1); } paint = (int **) malloc (num_sequence * sizeof (int *)); if (errno == ENOMEM) { printf ("Error: Not enough memory for paint\n"); exit (-1); } for (i = 0; i < num_sequence; i++) { len_sequence = strlen (sequence [i]); paint [i] = (int *) malloc (len_sequence * sizeof (int)); if (errno == ENOMEM) { printf ("Error: Not enough memory for paint\n"); exit (-1); } for (j = 0; j < len_sequence; j++) paint [i][j] = 0; } for (i = 0; i < num_motif; i++) { for (j = 0; j < max_len_tuple; j++) { for (k = 0; k < NUM_NUCLEOTIDE; k++) freq [j][k] = 0; } total_freq = 0; found = 0; for (j = 0; j < num_sequence; j++) { len_sequence = strlen (sequence [j]); for (k = 0; k < len_sequence - motif2 [i] -> len_tuple + 1; k++) { for (l = 0; l < num_strand; l++) { dist = 0; for (m = 0; m < motif2 [i] -> len_tuple; m++) { if (nucleotide (motif2 [i] -> str [m]) >= 0) { if (l == 0) ch = sequence [j][k + m]; else ch = complement (sequence [j][k + motif2 [i] -> len_tuple - m - 1]); if (nucleotide (tolower (ch)) < 0) { dist = motif2 [i] -> dist + 1; break; } if (tolower (ch) != motif2 [i] -> str [m]) { dist++; if (dist > motif2 [i] -> dist) break; } } } if (dist <= motif2 [i] -> dist) { for (m = 0; m < motif2 [i] -> len_tuple; m++) { if (paint [j][k + m]) { found = 1; break; } if (l == 0) ch = sequence [j][k + m]; else ch = complement (sequence [j][k + motif2 [i] -> len_tuple - m - 1]); freq [m][nucleotide (tolower (ch))]++; } total_freq++; if (found) break; } } if (found) break; } if (found) break; } if (!found) { for (j = 0; j < max_len_tuple; j++) { str [j] = ' '; for (k = 0; k < NUM_NUCLEOTIDE; k++) { if (freq [j][k] > total_freq / 2) { str [j] = nucleotide_char [k]; break; } } } max_dist = 0; for (j = 0; j < num_sequence; j++) { len_sequence = strlen (sequence [j]); for (k = 0; k < len_sequence - motif2 [i] -> len_tuple + 1; k++) { for (l = 0; l < num_strand; l++) { dist = 0; for (m = 0; m < motif2 [i] -> len_tuple; m++) { if (nucleotide (motif2 [i] -> str [m]) >= 0) { if (l == 0) ch = sequence [j][k + m]; else ch = complement (sequence [j][k + motif2 [i] -> len_tuple - m - 1]); if (nucleotide (tolower (ch)) < 0) { dist = motif2 [i] -> dist + 1; break; } if (tolower (ch) != motif2 [i] -> str [m]) { dist++; if (dist > motif2 [i] -> dist) break; } } } if (dist <= motif2 [i] -> dist) { dist = 0; for (m = 0; m < motif2 [i] -> len_tuple; m++) { if (nucleotide (str [m]) >= 0) { if (l == 0) ch = sequence [j][k + m]; else ch = complement (sequence [j][k + motif2 [i] -> len_tuple - m - 1]); if (tolower (ch) != str [m]) dist++; } } max_dist = max (dist, max_dist); } } } } for (j = 0; j < num_sequence; j++) { len_sequence = strlen (sequence [j]); for (k = 0; k < len_sequence - motif2 [i] -> len_tuple + 1; k++) { for (l = 0; l < num_strand; l++) { dist = 0; for (m = 0; m < motif2 [i] -> len_tuple; m++) { if (nucleotide (str [m]) >= 0) { if (l == 0) ch = sequence [j][k + m]; else ch = complement (sequence [j][k + motif2 [i] -> len_tuple - m - 1]); if (nucleotide (tolower (ch)) < 0) { dist = max_dist + 1; break; } if (tolower (ch) != str [m]) { dist++; if (dist > max_dist) break; } } } if (dist <= max_dist) { for (m = 0; m < motif2 [i] -> len_tuple; m++) { if (paint [j][k + m]) { found = 1; break; } } if (found) break; } } if (found) break; } if (found) break; } } if (!found) { for (j = 0; j < num_sequence; j++) { len_sequence = strlen (sequence [j]); for (k = 0; k < len_sequence - motif2 [i] -> len_tuple + 1; k++) { for (l = 0; l < num_strand; l++) { dist = 0; for (m = 0; m < motif2 [i] -> len_tuple; m++) { if (nucleotide (str [m]) >= 0) { if (l == 0) ch = sequence [j][k + m]; else ch = complement (sequence [j][k + motif2 [i] -> len_tuple - m - 1]); if (nucleotide (tolower (ch)) < 0) { dist = max_dist + 1; break; } if (tolower (ch) != str [m]) { dist++; if (dist > max_dist) break; } } } if (dist <= max_dist) { dist = 0; for (m = 0; m < motif2 [i] -> len_tuple; m++) { if (l == 0) ch = sequence [j][k + m]; else ch = complement (sequence [j][k + motif2 [i] -> len_tuple - m - 1]); printf ("%c", ch); if ( (nucleotide (motif2 [i] -> str [m]) >= 0) && (tolower (ch) != motif2 [i] -> str [m]) ) dist++; paint [j][k + m] = 1; } if (sequence_name [j][0] != ' ') printf (" "); printf ("%s/", sequence_name [j]); if (l > 0) printf ("-"); printf ("%d", k + 1); if (dist > motif2 [i] -> dist) printf ("+"); printf ("\n"); } } } } for (j = 0; j < motif2 [i] -> len_tuple; j++) { if (nucleotide (motif2 [i] -> str [j]) >= 0) printf ("%c", motif2 [i] -> str [j]); else printf ("-"); } printf (" %d %.2Le\n", motif2 [i] -> dist, motif2 [i] -> evalue); } } free (motif2); for (i = 0; i < max_len_tuple; i++) free (freq [i]); free (freq); free (str); for (i = 0; i < num_sequence; i++) free (paint [i]); free (paint); } main (int argc, char **argv) { int i, j, max_len_tuple, max_len_word, max_dist, num_strand, bias, num_sequence; long double max_evalue; char *str, **sequence, **sequence_name; motif_node *motif, *current_motif; if (argc > 7) { printf ("Usage: motifenumerator [max_len_tuple] [max_len_word] [max_dist] [num_strand] [bias] [max_evalue]\n"); exit (-1); } if (argc <= 1) max_len_tuple = MAX_LEN_TUPLE; else { max_len_tuple = atoi (argv [1]); if (max_len_tuple <= 0) max_len_tuple = MAX_LEN_TUPLE; } if (argc <= 2) max_len_word = MAX_LEN_WORD; else { max_len_word = atoi (argv [2]); if (max_len_word < 0) max_len_word = MAX_LEN_WORD; } if (argc <= 3) max_dist = MAX_DIST; else { max_dist = atoi (argv [3]); if (max_dist < 0) max_dist = MAX_DIST; } if (argc <= 4) num_strand = NUM_STRAND; else { num_strand = atoi (argv [4]); if ((num_strand < 1) || (num_strand > 2)) num_strand = NUM_STRAND; } if (argc <= 5) bias = BIAS; else { bias = atoi (argv [5]); if ((bias < 0) || (bias > 1)) bias = BIAS; } if (argc <= 6) max_evalue = MAX_EVALUE; else { max_evalue = atof (argv [6]); if ((max_evalue < 0.0) || (max_evalue > 1.0)) max_evalue = MAX_EVALUE; } str = fmakeword (stdin, EOF, NULL); sequence = read_fasta (str, &sequence_name, &num_sequence); free (str); motif = NULL; for (i = 0; i < ((max_len_word > 0) ? max_len_word : max_len_tuple); i++) { for (j = i; j < (((i > 0) && (max_len_word > 0)) ? max_len_tuple : i + 1); j++) compute_motif (max_dist, num_strand, bias, max_evalue, num_sequence, sequence, j + 1, i + 1, &motif); } print_motif (max_len_tuple, num_strand, num_sequence, sequence, sequence_name, motif); for (i = 0; i < num_sequence; i++) free (sequence [i]); free (sequence); for (i = 0; i < num_sequence; i++) free (sequence_name [i]); free (sequence_name); while (motif != NULL) { current_motif = motif; motif = motif -> next; free (current_motif -> str); free (current_motif); } }