I/O Operations: Pfade übertragen auf einen HTML-Parser - #141669 (Allgemeines zu Perl)

topeg
2010-10-03 23:47
User since
2006-07-10
2611 Artikel
BenutzerIn

user image
Ich habe den Code mal lauffähig gemacht. Schau ihn dir an und versuche ihn zu verstehen. Ich hoffe, die Kommentare helfen.
Code (perl): (dl )
#!/usr/bin/perl

use strict;            # alles muss definiert sein
use warnings;          # wenn etwas nicht so ganz richtig ist warnen
use diagnostics;       # wenn etwas nicht passt ist warnen
use File::Find::Rule;  # finde Dateien/Verzeichnisse anhand von Regeln
use HTML::TokeParser;  # parse HTML-Dateien zum leichten auslesen von Daten

# Collected result: one hash reference per parsed school.
my @schools;

# Directory that is searched (recursively) for input files;
# '.' is the current working directory.
my $search_dir = '.';

# File that receives the generated XML.
my $out_file = './output.xml';

# Build the search rule step by step: plain files only, whose name starts
# with "einzelergebnis" (all lower case) and ends in ".html".
my $finder = File::Find::Rule->file();
$finder = $finder->name('einzelergebnis*.html');

# Run the search below $search_dir.
my @files = $finder->in($search_dir);


# Returns true when the start-tag token $token (as returned by
# HTML::TokeParser::get_tag) carries attribute $name with exactly the
# value $wanted.
sub _attr_is
{
  my ($token, $name, $wanted) = @_;
  return exists $token->[1]{$name} && $token->[1]{$name} eq $wanted;
}

# Walk over every file found and extract one school record per file.
FILE:
foreach my $file (@files)
{
  # Progress output so the user can see what is happening.
  print "Bearbeite nun datei: $file!\n";

  # Storage for the school data found in this file.
  my %school;

  # Start a fresh parser instance with the file as its source.
  my $parser = HTML::TokeParser->new($file) or die "Can't open $file: ($!)";

  # Scan forward div by div until the closing </html> tag (or end of input).
  CONTAINER:
  while (my $div = $parser->get_tag('div', '/html'))
  {
    if ($div->[0] eq '/html')
    {
      last CONTAINER;
    }

    # Only the div with id "inhalt_large" contains the information.
    if (!_attr_is($div, 'id', 'inhalt_large'))
    {
      next CONTAINER;
    }

    # The first heading inside the container holds the location.
    $parser->get_tag('h1');
    $school{location} = $parser->get_text('/h1');

    SECTION:
    while (my $section = $parser->get_tag('div'))
    {
      # The footer marks the end of the interesting part.
      last SECTION if _attr_is($section, 'id', 'fusszeile');

      # Only the left-column div carries the school details.
      next SECTION if !_attr_is($section, 'class', 'fm_linkeSpalte');

      # School name from the heading.
      $parser->get_tag('h2');
      $school{name} = $parser->get_text('/h2');

      # The next span must be the school type, otherwise the layout is unknown.
      my $type_tag = $parser->get_tag('span');
      if (!_attr_is($type_tag, 'class', 'schulart_text'))
      {
        warn "unexpected format: parsing stopped";
        last SECTION;
      }
      $school{type} = $parser->get_text('/span');

      # The next paragraph must be the address block.
      my $address_tag = $parser->get_tag('p');
      if (!_attr_is($address_tag, 'class', 'einzel_text'))
      {
        warn "unexpected format: parsing stopped";
        last SECTION;
      }
      my $raw_address = $parser->get_text('/p');
      $school{address} = clean_address($raw_address);

      # The paragraph after the address is taken as the description.
      $parser->get_tag('p');
      $school{description} = $parser->get_text('/p');
    }
  }

  # Store a reference to this file's hash in the list of all schools.
  push @schools, \%school;
}

# Write all collected schools to $out_file as a simple XML document.
#
# Every text value is passed through $xml_text first: the parser hands back
# entity-decoded text, so a literal '&', '<' or '>' would otherwise produce
# malformed XML. Fields that were never filled in (e.g. because parsing of a
# file stopped early) are written as empty elements.
my $xml_text = sub
{
  my $value = shift;
  return '' unless defined $value;
  $value =~ s/&/&amp;/g;   # must run first so the entities below stay intact
  $value =~ s/</&lt;/g;
  $value =~ s/>/&gt;/g;
  return $value;
};

open(my $fh, '>', $out_file) or die("Error open $out_file ($!)\n");
print $fh "<schools>\n";
for my $school (@schools)
{
  print $fh "  <school>\n";
  print $fh "    <name>" . $xml_text->($school->{name}) . "</name>\n";
  print $fh "    <location>" . $xml_text->($school->{location}) . "</location>\n";
  # The element is now closed with </type>; it used to repeat the opening tag.
  print $fh "    <type>" . $xml_text->($school->{type}) . "</type>\n";
  print $fh "    <address>\n";
  # The address is an array reference of lines; tolerate a missing one.
  for my $address (@{ $school->{address} || [] })
  {
    print $fh "      <entry>" . $xml_text->($address) . "</entry>\n";
  }
  print $fh "    </address>\n";
  print $fh "    <description>" . $xml_text->($school->{description}) . "</description>\n";
  print $fh "  </school>\n";
}
print $fh "</schools>\n";
# Buffered write errors only surface when the handle is closed.
close($fh) or die("Error close $out_file ($!)\n");


##########################################################################

# Split an address text into its individual lines and strip leading and
# trailing whitespace from each of them.
#
# Parameter: the raw address text (may contain newlines); undef is treated
#            like an empty text.
# Returns:   a reference to an array holding the trimmed lines, in order.
sub clean_address
{
  my $text = shift;
  return [] unless defined $text;

  my @lines = split "\n", $text;
  for (@lines)
  {
    # \s is the whitespace class; a bare "s" would eat literal letters
    # (e.g. turn "strasse" into "trasse").
    s/^\s+//;
    s/\s+$//;
  }
  return \@lines;
}
Last edited: 2010-10-03 23:50:31 +0200 (CEST)