#!/usr/bin/env perl

# Collects attribute records for "RefSeq gene" lines of a whitespace-separated
# annotation listing. Each gene line is merged with the attributes found on
# the line that directly follows it, and the result is dumped keyed by
# "<field 0>/<field 3>/<field 4>" of the gene line.

use 5.012;
use strict;
use warnings;
use List::MoreUtils qw/uniq/;
use Data::Dumper;

# read_record($in)
#
# Reads one line from the filehandle $in and cuts it in two at the first
# occurrence of the column triple matched by the separator pattern below
# (a literal ".", then "+", then "." or "0", surrounded by whitespace).
#
# Returns a two-element list:
#   [0] array ref with the whitespace-separated fields before the separator
#       (empty array ref for a blank line)
#   [1] the text after the separator; undef when the line has no separator,
#       '' when the separator is the last thing on the line
# Returns the empty list at end of input, so the call can drive a
# `while (my (...) = read_record($in))` loop.
sub read_record {
    my ($in) = @_;

    my $line = <$in>;

    # Test definedness, not truth: a final line consisting of "0" without a
    # trailing newline is false but is still a line, not end of input.
    return unless defined $line;
    chomp $line;

    # LIMIT 2 keeps everything after the first separator in one piece, even
    # if the separator pattern shows up again later in the line.
    my ($head, $data) = split /\s*[.]\s+[+]\s+[.0]\s*/, $line, 2;

    return [split(/\s+/, $head // '')], $data;
}

# push_record($record, $data)
#
# Parses $data as a ";"-separated list of "key=value" items and appends each
# value to the array ref stored under $record->{key}, keeping the values of
# every key unique in first-seen order. $record is modified in place.
# An undefined or empty $data is a no-op. An item without "=" is stored with
# an undefined value. Returns nothing.
sub push_record {
    my ($record, $data) = @_;

    foreach my $item (split /\s*;\s*/, $data // '') {
        my ($key, $value) = split /\s*=\s*/, $item, 2;

        # An empty item (e.g. from ";;") has no key; skipping it avoids an
        # "uninitialized value" warning and a bogus '' entry in the record.
        next unless defined $key and length $key;

        push @{$record->{$key}}, $value;
        @{$record->{$key}} = uniq @{$record->{$key}};
    }

    return;
}

my $in = \*DATA;  # TODO: open real data file or read from ARGV or STDIN
my %info;

while (my ($head, $data) = read_record($in)) {
    # Only well-formed gene lines start a record; the field-count guard keeps
    # blank or short lines from triggering "uninitialized value" warnings
    # here and in the id join below.
    next unless @{$head} >= 5
        and $head->[1] eq 'RefSeq'
        and $head->[2] eq 'gene';

    my $id     = join '/', @{$head}[0, 3, 4];
    my $record = {};
    push_record($record, $data);

    # NOTE(review): the next line is consumed unconditionally and its
    # attributes are merged into this gene, whatever that line is. If it is
    # itself a gene line, that gene is never recorded on its own. Confirm
    # that the input always pairs a gene line with its exon/CDS line.
    push_record($record, (read_record($in))[-1]);

    $info{$id} = $record;
}

# Sorted keys make the dump reproducible from run to run.
$Data::Dumper::Sortkeys = 1;
print Dumper \%info;  # TODO: output or store in the format that you actually need

# NOTE(review): the line breaks of the sample below were reconstructed from a
# whitespace-collapsed copy; verify the garbage-line boundaries if they matter.
__DATA__
NC_014171.1 RefSeq gene 11341 14260 . + . ID=NC_014171.1:rrl_1;locus_tag=BMB171_C5090;db_xref=GeneID:9190897
NC_014171.1 RefSeq exon 11341 14260 . + . ID=NC_014171.1:rrl_1:unknown_transcript_1;Parent=NC_014171.1:rrl_1;gbkey=rRNA;locus_tag=BMB171_C5090;product=23S ribosomal RNA;db_xref=GeneID:9190897;exon_number=1
foo bar garbage
NC_014171.1 RefSeq gene 14311 14425 . + . ID=NC_014171.1:rrs_1;locus_tag=BMB171_C5091;db_xref=GeneID:9190898
NC_014171.1 RefSeq exon 14311 14425 . + .
foo bar garbage . + . more=garbage
foo bar garbage
foo bar garbage
NC_014171.1 RefSeq gene 18622 19509 . + . locus_tag=BMB171_C0010;db_xref=GeneID:9191051
NC_014171.1 RefSeq CDS 18622 19506 . + 0 locus_tag=BMB171_C0010;transl_table=11;product=pyridoxine biosynthesis protein;protein_id=YP_003662548.1;db_xref=GI:296500848;db_xref=GeneID:9191051;exon_number=1
NC_014171.1 RefSeq gene 22134 22526 . + . locus_tag=BMB171_C0013;db_xref=GeneID:9190939
NC_014171.1 RefSeq CDS 22134 22523 . + 0 locus_tag=BMB171_C0013;transl_table=11;product=hypothetical protein;protein_id=YP_003662551.1;db_xref=GI:296500851;db_xref=GeneID:9190939;exon_number=1