#!/usr/bin/perl
#
# embl2ace - convert sequence entries in EMBL flat-file format to the
# MalariaDB .ace file format.
#
# The script reads EMBL entries from the files named on the command line
# (or standard input), collects the fields of each entry in the package
# variables declared below, and writes one group of .ace objects per entry
# when the terminating "//" line is reached.
#
# Output goes to standard output unless "-o <file>" is given.

use strict;
use warnings;

my $USAGE = <<'END_USAGE';
usage: embl2ace [-D] [-o <output file>] <EMBL file> ...
  if -D is presented, the ace file created will have
  "-D <class> : id" line proceeded to each object "<class> : id",
  as the "-D" flag of ACeDB;
  use - for standard input;
  the default output is on the screen;
  options must be given in the order shown above
END_USAGE

# True when -D was given: emit a "-D Sequence" line before each entry.
our $VAR_D = 0;

# EMBL feature keys that are written as feature tags of the main sequence.
# NOTE(review): EMBL feature tables spell these keys in lower case
# ("intron", "exon"); the upper-case INTRON/EXON entries look like they can
# never match an input line - confirm intent.
our @key_names = (
    "TATA_signal",     "allele_seq",      "conflict",      "mat_peptide",
    "misc_binding",    "misc_signal",     "misc_feature",  "misc_recomb",
    "modified_base",   "mutation",        "old_sequence",  "polyA_site",
    "polyA_signal",    "prim_binding",    "prim_transcript", "promoter",
    "repeat_region",   "repeat_unit",     "satellite",     "sig_peptide",
    "variation",       "enhancer",        "protein_bind",  "stem_loop",
    "primer_bind",     "transit_peptide", "misc_structure", "precursor_RNA",
    "terminator",      "INTRON",          "EXON",
);

# EMBL feature keys that become Subsequence objects of their own.
our @RNA_keys = ("CDS", "mRNA", "tRNA", "rRNA", "snRNA", "scRNA", "misc_RNA");

# State of the entry currently being parsed.  It is reset by
# variables_initialisation() on every ID line and read by
# output_entry_details().
our ($VAR_id, $VAR_cDNA, $VAR_length, $VAR_int_length, $VAR_keywords,
     $VAR_entry_date, $VAR_modification_date, $VAR_description,
     $VAR_db_remark, $VAR_temp_accessions, $VAR_sequence, $VAR_gene,
     $VAR_ref_num, $VAR_source_index);
our (@VAR_accession, @VAR_author, @VAR_title, @VAR_journal, @VAR_medline,
     @VAR_source_info);
our (%VAR_RNA, %VAR_EMBL_feature);

# EMBL month abbreviation -> two-digit month number.
my %MONTH_NUMBER = (
    JAN => "01", FEB => "02", MAR => "03", APR => "04",
    MAY => "05", JUN => "06", JUL => "07", AUG => "08",
    SEP => "09", OCT => "10", NOV => "11", DEC => "12",
);

# Run the converter only when executed as a program, so that the helper
# subroutines can be loaded (e.g. by a test) without consuming input.
main() unless caller;

#-------------------------------------------------------------------
# SUBROUTINE: MAIN
#
# Parses the command line, then scans the input line by line and
# dispatches on the two-letter EMBL line code.  Multi-line fields are
# gathered by reading ahead; the first line that does not belong to the
# field is left in $_ and re-dispatched with "redo SCAN".
#-------------------------------------------------------------------
sub main {
    @ARGV or die $USAGE;

    if (@ARGV && $ARGV[0] =~ /^-D$/) {
        $VAR_D = 1;
        shift @ARGV;
    }
    else {
        $VAR_D = 0;
    }

    my ($out, $out_name);
    if (@ARGV && $ARGV[0] =~ /^-o$/) {
        shift @ARGV;
        $out_name = shift @ARGV;
        defined $out_name
            or die "embl2ace: -o requires an output file name\n$USAGE";
        open($out, '>', $out_name)
            or die "embl2ace: cannot open '$out_name' for writing: $!\n";
        select($out);    # every plain print below now goes to the file
    }

    # Make sure all state is defined even if data precedes the first ID line.
    variables_initialisation();

  SCAN:
    while (<>) {
        next SCAN if /^\s*$/;           # skip blank lines
        next SCAN if /^XX/;             # skip XX lines
        next SCAN if /^KW\s+\.\s*$/;    # skip empty KW line
        next SCAN if /^OC\s+\.\s*$/;    # skip empty OC line

        # "//" terminates an entry: write out what has been collected.
        if (m{^//}) {
            output_entry_details() if $VAR_id;
            next SCAN;
        }

        # Extract sequence name, DNA/RNA, and sequence length
        # from: ID   <id>  standard; XNA; EST; <length> BP.
        if (/^ID/) {
            variables_initialisation();
            my @id_fields = split /;/, $_;
            my @id_words  = split ' ', $id_fields[0];
            $VAR_id = (defined $id_words[1] ? $id_words[1] : '') . ' (EMBL)';
            $VAR_cDNA = "cDNA"
                if defined $id_fields[1] && $id_fields[1] =~ /RNA/;
            # Take the length straight from the "<n> BP" match instead of
            # relying on it being the fourth ';'-separated field.
            $VAR_length     = $1 if /([0-9]+)\s+BP/;
            $VAR_int_length = strtoint($VAR_length);
            next SCAN;
        }

        # Extract accession number(s); they may span several AC lines.
        if (/^AC\s+(.+)/) {
            chomp;
            s/^AC\s+//;
            $VAR_temp_accessions = $_;
            for my $more (read_continuation_lines('AC')) {
                $VAR_temp_accessions .= " $more";
            }
            $VAR_temp_accessions =~ s/;//g;    # drop every ';' separator
            @VAR_accession = split ' ', $VAR_temp_accessions;
            last SCAN unless defined $_;       # input ended inside the field
            redo SCAN;
        }

        # Extract keywords.
        if (/^KW\s+(.+)/) {
            $VAR_keywords = $VAR_keywords ? "$VAR_keywords $1" : $1;
            $VAR_keywords =~ s/\.\s*$//g;      # remove trailing "." if any
            next SCAN;
        }

        # Extract creation / modification dates.
        if (/^DT\s+(\S+)/) {
            my $ace_date = EMBLdate2ACEdate($1);
            if (/Created/i) {
                $VAR_entry_date = $ace_date;
            }
            else {
                $VAR_modification_date = $ace_date;
            }
            next SCAN;
        }

        # Extract the description (sequence title).
        if (/^DE\s+(.+)/) {
            chomp;
            s/^DE\s+//;
            $VAR_description = $_;
            for my $more (read_continuation_lines('DE')) {
                $VAR_description .= " $more";
            }
            last SCAN unless defined $_;
            redo SCAN;
        }

        # Organism lines are ignored; the organism is taken from FT source.
        next SCAN if /^OS\s+(.+)/;
        next SCAN if /^OC\s+(.+)/;

        # Extract database remark.
        if (/^CC\s+(.+)/) {
            chomp;
            s/^CC\s+//;
            $VAR_db_remark = $_;
            for my $more (read_continuation_lines('CC')) {
                $VAR_db_remark .= " $more";
            }
            last SCAN unless defined $_;
            redo SCAN;
        }

        # Extract EMBL features.
        if (/^FT\s{3}/) {

            # The source feature supplies organism, clone and strain names.
            if (/^FT\s{3}source/) {
                chomp;
                (my $source = $_) =~ s/^FT\s{3}source\s*//;
                for my $more (read_feature_continuation()) {
                    $source .= " $more";
                }
                $VAR_source_info[$VAR_source_index] = $source;
                $VAR_source_index += 1;
                last SCAN unless defined $_;
                redo SCAN;
            }

            # Sub sequences first, then ordinary features.  Both are stored
            # as one string per key in which the lines of a feature are
            # separated by " and whole features are terminated by "".
            for my $table ([ \@RNA_keys,  \%VAR_RNA ],
                           [ \@key_names, \%VAR_EMBL_feature ]) {
                my ($keys, $store) = @{$table};
                for my $key (@{$keys}) {
                    next unless /^FT\s{3}$key/;
                    chomp;
                    s/^FT\s{3}$key\s*//;
                    s/"/'/g;    # " is reserved as the internal separator
                    $store->{$key} .= $_;
                    for my $more (read_feature_continuation()) {
                        $store->{$key} .= "\" $more";
                    }
                    $store->{$key} .= '""';
                    last SCAN unless defined $_;
                    redo SCAN;
                }
            }
        }

        # Extract the DNA sequence: drop blanks, position numbers and '*'.
        if (/^SQ\s+Sequence\s+(.+)/) {
            $VAR_sequence = "";
            while (<>) {
                last unless /^\s\s\s\s\s/;
                s/[\s0-9*]//g;
                $VAR_sequence .= "\n$_";
            }
            last SCAN unless defined $_;
            redo SCAN;
        }

        # Extract the references.
        if (/^RN\s+/) {
            $VAR_ref_num += 1;
            $VAR_author[$VAR_ref_num]  = "";
            $VAR_title[$VAR_ref_num]   = "";
            $VAR_journal[$VAR_ref_num] = "";
            $VAR_medline[$VAR_ref_num] = "";

          REFERENCE_BLOCK:
            while (<>) {
                next REFERENCE_BLOCK if /^XX\s+/;
                next REFERENCE_BLOCK if /^RP\s+/;
                next REFERENCE_BLOCK if /^RC\s+/;

                if (/^RX\s+MEDLINE;\s*([0-9]*)[^0-9]/) {    # medline account
                    $VAR_medline[$VAR_ref_num] = $1;
                    next REFERENCE_BLOCK;
                }

                if (/^RA\s+(.+)/) {                         # authors
                    s/^RA\s+//;
                    s/,\s*\n$/,/;    # keep the ',' that joins two RA lines
                    s/;\s*\n$//;     # drop the ';' that ends the list
                    s/\s*\n$//;
                    $VAR_author[$VAR_ref_num] .= " $_";
                    next REFERENCE_BLOCK;
                }

                if (/^RT\s+(.+)/) {                         # title
                    chomp;
                    s/^RT\s+//;
                    my $title = $_;
                    for my $more (read_continuation_lines('RT')) {
                        $title .= " $more";
                    }
                    $title =~ s/\"//g;         # remove "
                    $title =~ s/\s{2,}/ /g;    # single space
                    $title =~ s/;\s*$//g;      # remove last ";"
                    # "WashU-Merck EST Project" = "The WashU-Merck EST Project"
                    $title =~ s/^The //;
                    $VAR_title[$VAR_ref_num] = $title;
                    last REFERENCE_BLOCK unless defined $_;
                    redo REFERENCE_BLOCK;
                }

                if (/^RL\s+(.+)/) {                         # journal
                    chomp;
                    s/^RL\s+//;
                    $VAR_journal[$VAR_ref_num] = $_;
                    for my $more (read_continuation_lines('RL')) {
                        $VAR_journal[$VAR_ref_num] .= " $more";
                    }
                    last REFERENCE_BLOCK unless defined $_;
                    redo REFERENCE_BLOCK;
                }

                last REFERENCE_BLOCK;    # first line outside the reference
            }
            last SCAN unless defined $_;
            redo SCAN;
        }
    }

    if (defined $out) {
        select(STDOUT);
        close($out)
            or die "embl2ace: error while closing '$out_name': $!\n";
    }
    return;
}

#-------------------------------------------------------------------
# SUBROUTINE: READ_CONTINUATION_LINES
#
# Reads following input lines for as long as they carry the given
# two-letter line code and some text.  Returns the text of those lines
# with the line code, the blanks after it and the newline removed.
# On return $_ holds the first line that did not match (undef at end
# of input) so that the caller can re-dispatch it.
#-------------------------------------------------------------------
sub read_continuation_lines {
    my ($line_code) = @_;
    my @texts;
    while (<>) {
        last unless /^$line_code\s+.+/;
        chomp;
        (my $text = $_) =~ s/^$line_code\s+//;
        push @texts, $text;
    }
    return @texts;
}

#-------------------------------------------------------------------
# SUBROUTINE: READ_FEATURE_CONTINUATION
#
# Reads the qualifier lines that follow a feature key line (FT lines
# whose key column is blank).  Returns their text with the "FT" prefix
# removed and every " turned into ', because " is used internally as
# the separator between the lines of one feature.
# On return $_ holds the first line that did not match (undef at end
# of input).
#-------------------------------------------------------------------
sub read_feature_continuation {
    my @texts;
    while (<>) {
        last unless /^FT\s{3}\s+/;
        chomp;
        (my $text = $_) =~ s/^FT\s*//;
        $text =~ s/"/'/g;
        push @texts, $text;
    }
    return @texts;
}

#-------------------------------------------------------------------
# SUBROUTINE: VARIABLES_INITIALISATION
#
# Before processing an EMBL entry, initialise variables.
#-------------------------------------------------------------------
sub variables_initialisation {
    $VAR_entry_date        = "";    # Sequence entry date.
    $VAR_modification_date = "";    # Sequence modification date.
    $VAR_id                = "";    # Sequence name.
    $VAR_cDNA              = "";    # ie. RNA, not DNA.
    $VAR_length            = "";    # Sequence length (char).
    $VAR_int_length        = 0;     # Sequence length (int).
    $VAR_keywords          = "";    # Sequence keywords.
    $VAR_ref_num           = 0;     # Reference enumerator.
    $VAR_temp_accessions   = "";    # Accession numbers separated by \s.
    @VAR_author            = ();    # Authors separated by ',' / reference.
    @VAR_journal           = ();    # Journal / reference.
    @VAR_title             = ();    # Title of paper / reference
                                    # (= "" if paper unpublished).
    @VAR_medline           = ();    # Medline ID / reference.
    @VAR_accession         = ();    # Accession numbers.
    $VAR_description       = "";    # Sequence title.
    $VAR_db_remark         = "";    # Sequence DB_remark.
    $VAR_source_index      = 0;     # Index for EMBL FT source.
    @VAR_source_info       = ();    # Source information / source.
    $VAR_gene              = "";    # Sequence locus.
    $VAR_sequence          = "";    # DNA; reset so that an entry without an
                                    # SQ block does not inherit the previous one.
    for my $key (@RNA_keys) {
        $VAR_RNA{$key} = "";             # Sequence subsequences.
    }
    for my $key (@key_names) {
        $VAR_EMBL_feature{$key} = "";    # Sequence EMBL_features.
    }
    return;
}

#-------------------------------------------------------------------
# SUBROUTINE: LOCATION_SPAN
#
# Returns (first, last) - the first and the last number that occur in
# an EMBL location string - or the empty list when the location holds
# fewer than two numbers.
#-------------------------------------------------------------------
sub location_span {
    my ($location) = @_;
    my @numbers = $location =~ /([0-9]+)/g;
    return @numbers >= 2 ? ($numbers[0], $numbers[-1]) : ();
}

#-------------------------------------------------------------------
# SUBROUTINE: COLLECT_QUOTED_QUALIFIER
#
# Gathers a qualifier value that may run over several feature lines.
# $lines is a reference to the lines of one feature and $index the line
# on which the qualifier starts.  When that line does not hold both the
# opening and the closing ', following lines are appended up to and
# including the next line that contains a ' (or up to the last line, so
# an unterminated value cannot loop forever).
# Returns (value, index of the first line after the qualifier).
#-------------------------------------------------------------------
sub collect_quoted_qualifier {
    my ($lines, $index) = @_;
    my $value = $lines->[$index];
    if (($value =~ tr/'//) < 2) {
        while (++$index <= $#{$lines}) {
            $value .= $lines->[$index];
            last if $lines->[$index] =~ /'/;
        }
    }
    return ($value, $index + 1);
}

#-------------------------------------------------------------------
# SUBROUTINE: PRINT_SOURCE_QUALIFIERS
#
# For every FT source feature of the entry, prints one line
#   <Tag> "<value>"
# for each of the given tags whose lower-case qualifier (/clone=,
# /strain=, /organism=) is present in that source feature.
#-------------------------------------------------------------------
sub print_source_qualifiers {
    my (@tags) = @_;
    for my $source_trait (@VAR_source_info) {
        for my $tag (@tags) {
            my $qualifier = lc $tag;
            print "$tag \"$1\"\n"
                if $source_trait =~ /\/$qualifier='([^']*)'/i;
        }
    }
    return;
}

#-------------------------------------------------------------------
# SUBROUTINE: OUTPUT_ENTRY_DETAILS
# Note: Output detail information on current sequence entry.
#       Variables referred are global variables.
#
# Writes, in this order: the main Sequence object, its DNA object,
# one Sequence object per sub sequence (plus Expr_pattern / Protein
# objects for translated CDS features) and one Paper object per
# published reference.
#-------------------------------------------------------------------
sub output_entry_details {
    (my $id_num = $VAR_id) =~ s/ \(EMBL\)$//;
    my $primary_accession = defined $VAR_accession[0] ? $VAR_accession[0] : '';

    # First, print out main sequence object
    print "\n-D Sequence : \"$VAR_id\"\n\n" if $VAR_D;
    print "Sequence : \"$VAR_id\"\n";
    print "DNA \"$VAR_id\" $VAR_length\n";
    print "Title \"$VAR_description\"\n";
    print "Entry_date $VAR_entry_date\n";
    print "Modification_date $VAR_modification_date\n";
    print "Database \"EMBL\" \"$VAR_id\" \"$primary_accession\"\n";
    print "DB_remark \"$VAR_db_remark\"\n";

    # Clone, strain, and/or organism name from Feature source.
    print_source_qualifiers('Clone', 'Strain', 'Organism');

    if ($VAR_keywords) {
        $VAR_keywords =~ s/\"/'/g;
        for my $keyword (split /;\s*/, $VAR_keywords) {
            print "Keyword \"$keyword\"\n";
        }
    }

    # Establish reference to Subsequence: name, from-base, to-base.
    for my $key (@RNA_keys) {
        my $i = 1;
        for my $subsequence (split /""/, $VAR_RNA{$key}) {
            my @parts    = split /"/, $subsequence;
            my $location = @parts ? $parts[0] : '';
            # Same rule as for features below: the span runs from the first
            # to the last number of the location.  The positions are left
            # empty when the location has no span, instead of repeating the
            # values of the previous sub sequence.
            my ($first_val, $second_val) = location_span($location);
            $first_val  = '' unless defined $first_val;
            $second_val = '' unless defined $second_val;
            print "Subsequence \"$id_num:$key.$i (EMBL)\" $first_val $second_val\n";
            $i++;
        }
    }

    # Extract EMBL features.
    for my $key (@key_names) {
        for my $feature (split /""/, $VAR_EMBL_feature{$key}) {
            my @info     = split /"/, $feature;
            my $location = @info ? $info[0] : '';

            # Establish reference to sequence bases; fall back to the whole
            # sequence when no usable span is found.
            my @span = $location =~ /REPLACE/ ? () : location_span($location);
            my $tag  = @span ? "$key $span[0] $span[1] "
                             : "$key 1 $VAR_length ";
            print $tag;

            # Sort the qualifier lines into gene, note, product and the rest.
            my ($note, $product, $text) = ("", "", "");
            my $i = 1;
            while ($i <= $#info) {
                if ($info[$i] =~ /^\s*\/gene='([^']*)'/i) {
                    $VAR_gene = $1;
                    $i++;
                }
                elsif ($info[$i] =~ /^\s*\/note/i) {
                    ($note, $i) = collect_quoted_qualifier(\@info, $i);
                }
                elsif ($info[$i] =~ /^\s*\/product/i) {
                    ($product, $i) = collect_quoted_qualifier(\@info, $i);
                }
                else {
                    $text .= $info[$i];
                    $i++;
                }
            }

            $text =~ s/'//g;
            $text =~ s/^\s//g;
            print " \"$text\" Location \"$location\"\n";
            if ($note ne "") {
                $note =~ s/'//g;
                $note =~ s/^\s\/note=//i;
                print "$tag \"$text\" Note \"$note\"\n";
            }
            if ($product ne "") {
                $product =~ s/'//g;
                $product =~ s/^\s\/product=//i;
                print "$tag \"$text\" Product \"$product\"\n";
            }
            if ($VAR_gene ne "") {
                print "Locus \"$VAR_gene\"\n";
                $VAR_gene = "";
            }
        }
    }

    print "$VAR_cDNA\n" if $VAR_cDNA;    # Sequence cDNA if RNA.

    # If a reference has no title it is placed in the Sequence Origin
    # tags; otherwise we refer to a published article.
    for my $i (1 .. $VAR_ref_num) {
        my $is_unpublished = $VAR_journal[$i] =~ /^UNPUBLISHED/i;
        if ($VAR_title[$i] =~ /^\s*$/) {
            $VAR_author[$i] =~ s/^\s*//;
            for my $author (split /, /, $VAR_author[$i]) {
                print "From_Author \"$author\"\n";
            }
            if (!$is_unpublished) {
                (my $laboratory = $VAR_journal[$i]) =~ s/^[^,]*, //;
                print "From_Laboratory \"$laboratory\"\n";
            }
        }
        elsif (!$is_unpublished) {
            (my $reference = $VAR_journal[$i]) =~ s/\s/_/g;
            print "Reference \"$reference\"\n";
        }
    }
    print "\n\n\n\n";

    # Print out DNA sequence of main sequence
    print "DNA : \"$VAR_id\"";
    print $VAR_sequence, "\n\n\n\n";

    # Print out sub sequence objects.
    for my $key (@RNA_keys) {
        my $i = 1;
        for my $subsequence (split /""/, $VAR_RNA{$key}) {
            my @parts    = split /"/, $subsequence;
            my $location = @parts ? $parts[0] : '';
            (my $numbers = $location) =~ s/[^0-9]/ /g;
            my $object_name = "\"$id_num:$key.$i (EMBL)\"";

            print "Sequence : $object_name\n";
            print_source_qualifiers('Strain', 'Organism');
            print "Source \"$VAR_id\"\n";

            # Exon length from the first two numbers of the location.
            # Both default to 0 so that nothing leaks from an earlier
            # sub sequence when the location cannot be parsed.
            my ($first_int, $second_int) = (0, 0);
            if ($numbers =~ /^\s*([0-9]*)\s+([0-9]*)/) {
                ($first_int, $second_int) = (strtoint($1), strtoint($2));
            }
            my $final_int = $second_int - $first_int + 1;
            print "Source_Exons 1 ", $final_int, "\n";
            print "$key\n";

            # The Remark holds the location followed by every qualifier that
            # is not reported through a tag of its own.
            (my $remark_location = $location) =~ s/^\s*//;
            print "Remark \"";
            print $remark_location;

            my ($protein, $exp_patt, $pattern) = ("", "", "");
            my $j = 1;
            while ($j <= $#parts) {
                if ($parts[$j] =~ /\/gene='([^']*)'/i) {
                    $VAR_gene = $1;
                    $j++;
                }
                elsif ($parts[$j] =~ /\/product='/i && $key eq "CDS") {
                    $protein = $parts[$j];
                    $protein =~ s/\s*\/product='//i;
                    $protein =~ s/'\s*$//;
                    $j++;
                }
                elsif ($parts[$j] =~ /\/translation='/) {
                    $exp_patt = $object_name;
                    (my $chunk = $parts[$j]) =~ s/^\s*//;
                    $chunk =~ s/\/translation='//;
                    $j++;
                    if ($chunk =~ /'\s*$/) {
                        # The whole translation fits on one line: stop here
                        # rather than swallowing the qualifiers that follow.
                        $chunk =~ s/'//;
                        $pattern = $chunk;
                    }
                    else {
                        $pattern = "$chunk \\\n";
                        while ($j <= $#parts) {
                            ($chunk = $parts[$j]) =~ s/^\s*//;
                            $j++;
                            if ($chunk =~ /'\s*$/) {    # closing quote: done
                                $chunk =~ s/'//;
                                $pattern .= $chunk;
                                last;
                            }
                            $pattern .= "$chunk \\\n";
                        }
                    }
                }
                else {
                    (my $remark = $parts[$j]) =~ s/^\s*//;
                    print " \\\n", $remark;
                    $j++;
                }
            }
            print "\"\n";

            if ($VAR_gene ne "") {
                print "Locus \"$VAR_gene\"\n";
                $VAR_gene = "";
            }
            if ($protein ne "") {
                print "Corresponding_protein \"$protein\"\n";
            }
            if ($exp_patt ne "") {
                print "Expr_pattern $exp_patt\n";
            }
            print "\n\n\n";

            if ($exp_patt ne "") {
                print "Expr_pattern : $exp_patt\n";
                print "Sequence $exp_patt\n";
                if ($protein ne "") {
                    print "Protein \"$protein\"\n";
                }
                print "Pattern \"$pattern\"\n";
                print "\n\n\n";
            }
            if ($protein ne "") {
                print "Protein : \"$protein\"\n";
                print "Corresponding_DNA $exp_patt\n";
                print "Expr_pattern $exp_patt\n";
                print "\n\n\n";
            }
            $i++;
        }
    }

    # Print out instance of Paper
    for my $i (1 .. $VAR_ref_num) {
        next if $VAR_title[$i] =~ /^\s*$/;
        next if $VAR_journal[$i] =~ /^UNPUBLISHED/i;

        (my $reference = $VAR_journal[$i]) =~ s/\s/_/g;
        print "Paper : \"$reference\"\n";
        print "Title \"$VAR_title[$i]\"\n";
        print "Medline_ID $VAR_medline[$i]\n" if $VAR_medline[$i] ne "";

        # Take the journal line apart: "<name> <volume>, <pages> (<year>)",
        # or "<name> (<year>) IN PRESS", else keep it whole.
        my ($journ_name, $volume, $page, $year) = ("", "", "", "");
        if ($VAR_journal[$i] =~
            /([^0-9]*)([0-9]*|[0-9]*\s*\([0-9]*\)\s*),([^\(]*)\(([0-9]*)\)/) {
            my ($temp_name, $temp_volume, $temp_page, $temp_year)
                = ($1, $2, $3, $4);
            $temp_name   =~ s/\s*$//;
            $temp_volume =~ s/\s//g;
            $temp_page   =~ s/\s//g;
            $journ_name = "Journal \"$temp_name\"\n";
            $volume     = "Volume $temp_volume\n";
            $page       = "Page $temp_page\n";
            $year       = "Year $temp_year\n";
        }
        elsif ($VAR_journal[$i] =~ /([^\(]*)\(([0-9]*)\)\s*IN PRESS\s*$/) {
            my ($temp_name, $temp_year) = ($1, $2);
            $temp_name =~ s/\s*$//;
            $journ_name = "Journal \"$temp_name\"\n";
            $year       = "Year $temp_year\n";
        }
        else {
            $journ_name = "Journal \"$VAR_journal[$i]\"\n";
        }
        print $journ_name, $volume, $page, $year;

        $VAR_author[$i] =~ s/^\s*//;
        for my $author (split /, /, $VAR_author[$i]) {
            print "Author \"$author\"\n";
        }
        print "\n\n\n\n";
    }
    return;
}

#-------------------------------------------------------------------
# SUBROUTINE: EMBL DATE FORMAT TO ACE DATETYPE FORMAT
#
# Converts a date such as "12-JAN-1995" (DD-MON-YYYY) to "1995-01-12".
# The date is taken from the argument.  A month that is not one of the
# twelve three-letter abbreviations leaves the month part empty.
#-------------------------------------------------------------------
sub EMBLdate2ACEdate {
    my ($embl_date) = @_;
    $embl_date = '' unless defined $embl_date;

    my ($day, $month, $year) = split /-/, $embl_date;
    my $month_number = defined $month ? $MONTH_NUMBER{ uc $month } : undef;

    return join "-", map { defined $_ ? $_ : '' } $year, $month_number, $day;
}

#-------------------------------------------------------------------
# SUBROUTINE: STRING TO INTEGER
#
# An auxiliary function: returns the value of a string of decimal
# digits ("" gives 0).  Each character contributes ord(char) - 48,
# weighted by its decimal position.
#-------------------------------------------------------------------
sub strtoint {
    my ($string) = @_;
    $string = '' unless defined $string;

    my $num = 0;
    for my $char (split //, $string) {
        $num = $num * 10 + (ord($char) - 48);
    }
    return $num;
}

1;