1 2 3 4 5 6 7 8 9 10 11 12 13
# Read one line from the filehandle $in and split it into a record.
#
# The line is split once at the first run of a literal '.', a literal '+'
# and then '.' or '0', separated by whitespace.
#
# Returns a two-element list:
#   - an array ref holding the whitespace-separated fields found before
#     that run (an empty array ref for an empty line)
#   - the remainder of the line after the run (undef when the run is absent)
# Returns the empty list at end of input.
sub read_record {
    my ($in) = @_;

    # readline inside a plain `if` gets no implicit defined() test (only
    # `while` adds one), so a final line consisting of just "0" without a
    # newline would be mistaken for end of input. Test definedness instead.
    my $line = <$in>;
    return unless defined $line;

    chomp $line;
    my ($head, $data) = split /\s*[.]\s+[+]\s+[.0]\s*/, $line, 2;
    return [ split(/\s+/, $head // '') ], $data;
}
1 2 3 4 5 6 7 8 9
# Merge "key=value" pairs from $data into the hash referenced by $record.
#
# $data is a ';'-separated list of items (whitespace around ';' and '=' is
# ignored); an undefined $data is treated as an empty string. Every value
# is appended to the array stored under its key, and that array is then
# reduced to its unique values.
sub push_record {
    my ($record, $data) = @_;

    my @items = split /\s*;\s*/, $data // '';
    for my $item (@items) {
        my ($key, $value) = split /\s*=\s*/, $item, 2;
        my $values = $record->{$key} //= [];
        push @$values, $value;
        @$values = uniq @$values;
    }
}
1 2 3 4 5 6 7 8 9 10 11 12
# Build %info: one entry per record whose second field is 'RefSeq' and
# whose third field is 'gene', keyed by fields 0, 3 and 4 joined with '/'.
# Each entry holds the key/value data of the matching line merged with the
# data of the line that immediately follows it.
my %info;
while (my ($head, $data) = read_record($in)) {
    next unless ($head->[1] eq 'RefSeq' && $head->[2] eq 'gene');

    my $id = join '/', @{$head}[0, 3, 4];

    my %attributes;
    push_record(\%attributes, $data);

    # Consume the following line as well; only its data part is used.
    my (undef, $next_data) = read_record($in);
    push_record(\%attributes, $next_data);

    $info{$id} = \%attributes;
}
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
#!/usr/bin/perl
# Task: Extract GeneID-Number and gene information
use strict;
use warnings;

my %hash;      # GeneID => [ start, end, strand, feature types ... ]
my @BMB;       # values collected for the gene block currently being read
my $GeneID;    # most recent GeneID found at the end of an attribute column

# 1) Open the .gff input file and, while reading it line by line, split
#    each line at the tabs.
open my $in, '<', "Genomteil.gff" or die $!;
while (my $data = <$in>) {
    my @array = split /\t/, $data;

    # Directive, comment and blank lines have fewer than nine columns.
    # Default the two columns that are pattern-matched so such lines are
    # simply ignored instead of raising "uninitialized value" warnings.
    my $type       = $array[2] // '';
    my $attributes = $array[8] // '';

    # A 'gene' line starts a text block with the information to extract:
    # start, end and strand become the first values for the hash.
    if ($type =~ /gene/) {
        @BMB = @array[3, 4, 6];
    }

    # A GeneID at the very end of the attribute column is used as hash key.
    if ($attributes =~ /.*;db_xref=GeneID:(\d+)\n/) {
        $GeneID = $1;
    }

    # CDS and exon lines contribute their feature type.
    if ($type =~ /CDS|exon/) {
        push @BMB, $array[2];
    }

    # Store a copy of everything collected so far. Nothing can be stored
    # before the first GeneID has been seen (an undef key would create a
    # bogus '' entry and a warning).
    $hash{$GeneID} = [@BMB] if defined $GeneID;
}
close $in;

# Print every stored value, one "key --> value" line per value.
foreach my $key (keys %hash) {
    foreach my $val (@{ $hash{$key} }) {
        print "$key --> $val\n";
    }
}
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
#!/usr/bin/perl
# Task: Extract GeneID-Number and gene information
use strict;
use warnings;

my %hash;        # GeneID => [ start, end, strand, feature type ]
my @BMB;         # values collected for the gene currently being read
my $GeneID;      # most recent GeneID found at the end of an attribute column
my $flag = 0;    # 0: looking for a 'gene' line, 1: handling the line after it

# 1) Open the .gff input file and, while reading it line by line, split
#    $data at each tab and put the fields in @array.
open my $in, '<', "Genomteil.gff" or die $!;
while (my $data = <$in>) {
    my @array = split /\t/, $data;

    # Lines with fewer than nine columns (directives, blank lines) leave
    # these columns undefined; default them so the matches stay quiet.
    my $type       = $array[2] // '';
    my $attributes = $array[8] // '';

    if ($flag == 0) {
        # A 'gene' line starts a text block with the information to
        # extract; start, end and strand become the first hash values.
        if ($type =~ /gene/) {
            $flag = 1;    # a 'CDS' or 'exon' line is expected next
            @BMB  = @array[3, 4, 6];
        }

        # A GeneID at the very end of the attribute column is the hash key.
        if ($attributes =~ /.*;db_xref=GeneID:(\d+)\n/) {
            $GeneID = $1;
        }
    }
    elsif ($flag == 1) {
        # Add the feature type of the line following the 'gene' line.
        if ($type =~ /CDS|exon/) {
            push @BMB, $array[2];
        }

        # Store a copy; skip it when no GeneID has been seen yet, because
        # an undef key would create a bogus '' entry and a warning.
        $hash{$GeneID} = [@BMB] if defined $GeneID;

        $flag = 0;    # a 'gene' line is expected next
    }
}
close $in;

# each() yields the array reference stored in %hash. Its first element is
# $values->[0]; writing "$BMB[0]" here would read the unrelated global
# array @BMB and print the same value for every key.
while (my ($gene_id, $values) = each %hash) {
    print "$gene_id => $values->[0]\n";
}
# 2011-08-22T10:22:30 Pauline25 -- Code (perl):
# The value returned by each() is a scalar; where %hash stores array
# references (as the surrounding scripts do), the first element is reached
# with $BMB->[0]. "$BMB[0]" would instead index the separate array @BMB.
my $BMB;
while (($GeneID, $BMB) = each %hash) {
    print "$GeneID => $BMB->[0]\n";
}
# Copy the elements at indices 3, 4 and 6 of @array into @BMB, naming each
# element explicitly.
@BMB = ($array[3], $array[4], $array[6]);
# The same assignment written as an array slice.
@BMB = @array[3,4,6];
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104
#!/usr/bin/perl
# Task: Extract GeneID-Number and gene information and store them in a hash
use strict;
use warnings;

my %hash;           # GeneID => [ start, end, strand, type, gbkey, product ... ]
my @BMB;            # values collected for the gene currently being read
my $GeneID;         # most recent GeneID found at the end of an attribute column
my $flag    = 0;    # 0: looking for a 'gene' line, 1: inside a gene block
my $novalue = 0;    # placeholder stored when a line has no 'gbkey'

# 1) Open the .gff input file and, while reading it line by line, split
#    $data at each tab and put the fields in @array.
open my $in, '<', "Genomteil.gff" or die $!;
while (my $data = <$in>) {
    my @array = split /\t/, $data;

    # Lines with fewer than nine columns (directives, blank lines) leave
    # these columns undefined; default them so the matches stay quiet.
    my $type       = $array[2] // '';
    my $attributes = $array[8] // '';

    # 2) As long as the flag is 0, search for the word 'gene' in the third
    #    column. If it is found, set the flag.
    if ($flag == 0) {
        if ($type =~ /gene/) {
            $flag = 1;

            # 3) Start a fresh @BMB:
            #    $BMB[0] = start point, $BMB[1] = end point, $BMB[2] = + or -
            @BMB = @array[3, 4, 6];
        }

        # 4) If a GeneID ends the attribute column, store its number.
        if ($attributes =~ /.*;db_xref=GeneID:(\d+)\n/) {
            $GeneID = $1;
        }
    }

    # 5) While the flag is set, look for 'CDS' or 'exon' in the third
    #    column and add it to @BMB.
    elsif ($flag == 1) {
        # NOTE(review): the pasted code never closed the 'exon' branch,
        # leaving the braces unbalanced. It is closed here, right after the
        # push, because steps 7 and 8 below must also run for CDS lines.
        if ($type =~ /CDS|exon/) {
            push @BMB, $array[2];
        }

        # 6) Now $BMB[3] = CDS or exon.

        # 7) If 'gbkey' is present (it's an exon!), add the four word
        #    characters that follow it (rRNA or tRNA) to @BMB.
        if ($attributes =~ /gbkey=(\w{4})/) {
            push @BMB, $1;
        }

        # 8) If 'gbkey' is absent (it's a CDS!), add $novalue, which has
        #    the value 0. Now $BMB[4] = rRNA or tRNA or 0.
        else {
            push @BMB, $novalue;
        }

        # 9) If 'product=' is followed by ';protein_' or by ';db_', extract
        #    the product name and add it to @BMB.
        #    Now $BMB[5] = product name.
        if ($attributes =~ /product=(.*);protein_/) {
            push @BMB, $1;
        }
        elsif ($attributes =~ /product=(.*);db_/) {
            push @BMB, $1;
        }

        # 10) Use the GeneID as key and a copy of @BMB as value. Skip this
        #     when no GeneID has been seen yet: an undef key would create a
        #     bogus '' entry and a warning.
        $hash{$GeneID} = [@BMB] if defined $GeneID;

        # 11) If the term ';exon_number=1' is found, reset the flag.
        if ($attributes =~ /.*;exon_number=1/) {
            $flag = 0;
        }
    }

    # 12) Search for the next 'gene' while the flag is 0.
}
close $in;

# 13) Print the keys with all their values.
foreach my $key (keys %hash) {
    foreach my $val (@{ $hash{$key} }) {
        print "$key --> $val\n";
    }
}