Reguläre Ausdrücke suchen (gelöst) - #157664 (Allgemeines zu Perl)

topeg
2012-04-19 16:01
User since
2006-07-10
2611 Artikel
BenutzerIn

user image
Also wie es zuerst war. Entscheide dich mal.
more (46.2kb):
Code (perl): (dl )
#!/usr/bin/perl
use strict;
use warnings;

my $dir='.';
my @motives = ("PO", "OHO", 'G[A-Z]{2}G');
my $out_dir='out';
my $sum_out='out/zusammenfassung.csv';

my %anzahl;

# Ordner öffnen und durchgehen
opendir(my $dh, $dir) or die("Error open $dir,($!)\n");
while(my $file=readdir($dh))
{
  # Pfad machen.
  my $path="$dir/$file";

  # testen ob es eine Datei ist
  next unless(-f $path);
  next if($path!~/\.txt$/);

  print "Analyse File $file\n";
  my @found=analyse_file($path,\@motives);

  for my $e (@found)
  {
    my ($motive1,$motive2)= sort @$e[0,1];

    # für die Zusammenfassung
    $anzahl{join('-!-', @$e[0,1])}++;

    # Ausgabe in Konsole
    #printf ( qq("%s" and "%s" found in Line %u and Line %u\n),@$e );

    # Ausgabe in Datei
    my $outpath="$out_dir/${motive1}__$motive2.txt";
    if(open(my $fh, '>>', $outpath))
    { printf $fh ("%s + %s found in Line %u and Line %u + %s\n", @$e[4,5,2,3],$file); }
    else
    { warn("Error open $outpath ($!)"); }
  }
}
closedir($dh);

# Zusammenfassung
#printf ("%s+%s %u\n",split(/-!-/,$_),$anzahl{$_}) for (sort keys(%anzahl));

# Ziel:
# eine Matrix der Gezählten Kombinationen
# der in den Dateien enthaltenen Ausdrücken
save_summary($sum_out,\%anzahl);

########################################################################

sub save_summary
{
  my $file=shift;
  my %anzahl=%{shift()};

  # Liste aller gefundenen Ausdrücke
  # und die Postion, die sie in der Matrix einnehmen sollen
  my %positions;
  my @muster;
  for my $pair (sort keys(%anzahl))
  {
    for(split(/-!-/,$pair))
    {
      # wenn der Ausdruck schon eine Postion hat, dann überspringen
      next if exists $positions{$_};

      # Dem Ausdruck eine Position in der zukünftigen Matrix zuweisen
      push(@muster,$_);
      $positions{$_}=$#muster;
    }
  }

  # die Matrix erstellen
  my @matrix;
  for my $pair (keys(%anzahl))
  {
    my ($m1,$m2)=split(/-!-/,$pair);
    my $count=$anzahl{$pair};

    # die Position in der Matrix ermitteln
    my $pos1=$positions{$m1};
    my $pos2=$positions{$m2};

    # Wert in die Matrix schreiben
    $matrix[$pos1][$pos2]+=$count;
  }


  # die Ausgabe als CSV:
  # Einträge mit ";" getrennt, Zeilenenden sind "\n";
  open(my $fh, '>', $file) or die("ERROR open $file ($!)\n");
  # Zeilen/Spalten beschreiben:
  print $fh 'Muster;',join(';',@muster)."\n";

  # matrix durchgehen:
  for my $pos1 (0..$#muster)
  {
    # Wenn eine Kombination nicht existiert, den Wert auf 0 setzen
    for my $pos2 (0..$#muster)
    { $matrix[$pos1][$pos2]=0 unless($matrix[$pos1][$pos2]); }
    # Muster angeben:
    print $fh $muster[$pos1].";";

    # die Ausgabe erzeugen
    print $fh join(';',@{$matrix[$pos1]})."\n";
  }
  close($fh);
}

sub analyse_file
{
  my $file=shift;
  my $motives=shift; # Referenz auf Array

  #die gesamte Datei einlesen
  my $content=read_file($file);
  unless($content)
  {
    warn("File empty!");
    return;
  }


  # eine Liste mit den Positionen der Zeileenden erstellen:
  my @line_ends=find_lineends($content);

  # alle Suchbegriffe durchgehen und sich alle Positionen merken.
  my %match;
  for my $motive (@$motives)
  {
    pos($content)=0;
    while($content=~/($motive)/g)
    {
      my $found=$1;
      # zur aktuellen Postion im String wird die Zeile Bestimmt
      my $line=calculate_line(\@line_ends,pos($content));

      # es wird ein Hash of Arrays erstellt
      push(@{$match{$motive}},{match=>$found, line=>$line});
    }
  }


  # Auswertung
  my @found;
  my @keys=keys(%match);

  # jedes Fund mit jedem anderen Kombinieren
  # doppelte vermeiden
  for my $p1 (0..$#keys)
  {
    my $motive1=$keys[$p1];

    for my $p2 ($p1..$#keys)
    {
      my $motive2=$keys[$p2];

      next if($motive1 eq $motive2 and @{$match{$motive1}}<2);

      if($motive1 eq $motive2)
      {
        my @list=@{$match{$motive1}};
        for my $pos1 (0 .. $#list-1)
        {
          for my $pos2 ($pos1+1 .. $#list)
          {
            push(@found,[
                $motive1,              $motive1,
                $list[$pos1]->{line},  $list[$pos2]->{line},
                $list[$pos1]->{match}, $list[$pos2]->{match}
              ]);
          }
        }
      }
      else
      {
        for my $ref1 (@{$match{$motive1}})
        {
          for my $ref2 (@{$match{$motive2}})
          {
            push(@found,[
                $motive1,       $motive2,
                $ref1->{line},  $ref2->{line},
                $ref1->{match}, $ref2->{match}
              ]);
          }
        }
      }
    }
  }
  return @found;
}

sub read_file
{
  my $file=shift;
  die("ERROR open $file ($!)\n") unless( open(my $handle, '<', $file) );
  #zeilenende auf undef setzen
  local $/=undef;
  # alles einlesen
  return <$handle>;
}

sub calculate_line
{
  my $endings=shift;
  my $pos=shift;
  return 0 if($pos<0);
  return $endings->[-1] if($pos > $endings->[-1]);
  for my $l (1..$#$endings)
  { return $l if($pos >= $endings->[$l-1] && $endings->[$l] >= $pos); }
  return -1;
}

sub find_lineends
{
  my $content=shift;
  my @list=(0);
  my $pos=0;
  while(( my $p=index($content,"\n",$pos) )>-1)
  {
    push(@list,$p);
    $pos=$p+1;
    last if($pos>=length($content));
  }
  return @list;
}