Leser: 28
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67
#!/usr/bin/perl use strict; use warnings; use HTML::TokeParser; my $file = 'school.html'; my $p = HTML::TokeParser->new($file) or die "Can't open: $!"; my %school; while (my $tag = $p->get_tag('div', '/html')) { # first move to the right div that contains the information last if $tag->[0] eq '/html'; next unless exists $tag->[1]{'id'} and $tag->[1]{'id'} eq 'inhalt_large'; $p->get_tag('h1'); $school{'location'} = $p->get_text('/h1'); while (my $tag = $p->get_tag('div')) { last if exists $tag->[1]{'id'} and $tag->[1]{'id'} eq 'fusszeile'; # get the school name from the heading next unless exists $tag->[1]{'class'} and $tag->[1]{'class'} eq 'fm_linkeSpalte'; $p->get_tag('h2'); $school{'name'} = $p->get_text('/h2'); # verify format for school type $tag = $p->get_tag('span'); unless (exists $tag->[1]{'class'} and $tag->[1]{'class'} eq 'schulart_text') { warn "unexpected format: parsing stopped"; last; } $school{'type'} = $p->get_text('/span'); # verify format for address $tag = $p->get_tag('p'); unless (exists $tag->[1]{'class'} and $tag->[1]{'class'} eq 'einzel_text') { warn "unexpected format: parsing stopped"; last; } $school{'address'} = clean_address($p->get_text('/p')); # find the description $tag = $p->get_tag('p'); $school{'description'} = $p->get_text('/p'); } } print qq/$school{'name'}\n/; print qq/$school{'location'}\n/; print qq/$school{'type'}\n/; foreach (@{$school{'address'}}) { print "$_\n"; } print qq/\nDescription: $school{'description'}\n/; sub clean_address { my $text = shift; my @lines = split "\n", $text; foreach (@lines) { s/^\s+//; s/\s+$//; } return \@lines; }
1 2 3 4 5 6 7 8 9 10 11 12 13 14
/usr/perl drinneliegt: #!/usr/bin/perl use strict; use warnings; use diagnostics; use File::Find::Rule; my @files = File::Find::Rule->file() ->name('einzelergebnis*.html') ->in( '.' ); foreach my $file(@files) { print $file, "\n"; }
Quotehtmlfiles/einzelergebnis80b5.html
htmlfiles/einzelergebnisa0ef.html
htmlfiles/einzelergebnis1b42.html
htmlfiles/einzelergebnis5960.html
htmlfiles/einzelergebnise523.html
htmlfiles/einzelergebnis2c7e.html
htmlfiles/einzelergebnisdf57.html
htmlfiles/einzelergebnis2b53-2.html
htmlfiles/einzelergebnisb1c0-2.html
....und 22 Tausend weitere Zeilen ... ;-)
2010-10-03T11:29:57 lin> my @html_files = File::Find::Rule->file->name( '*.html.files' )->in( $
> +html_dir );
Was macht das plus Zeichen hier?
1 2 3 4 5 6 7 8 9
PHP Code: #!/usr/bin/perl use strict; use warnings; use diagnostics; use File::Find::Rule; my @files = File::Find::Rule->file() ->name('einzelergebnis*.html') ->in('.');
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
#!/usr/bin/perl use strict; use warnings; use diagnostics; use File::Find::Rule; use HTML::TokeParser; #my $file = 'school.html' my@files= File::Find::Rule->file() ->name('einzelergebnis*.html') ->in(*'.'*); my $p = HTML::TokeParser->new($file) or die "Can't open: $!"; my %school; while (my $tag = $p->get_tag('div', '/html')) { # first move to the right div that contains the information last if $tag->[0] eq '/html'; next unless exists $tag->[1]{'id'} and $tag->[1]{'id'} eq 'inhalt_large'; $p->get_tag('h1'); $school{'location'} = $p->get_text('/h1'); while (my $tag = $p->get_tag('div')) { last if exists $tag->[1]{'id'} and $tag->[1]{'id'} eq 'fusszeile'; # get the school name from the heading next unless exists $tag->[1]{'class'} and $tag->[1]{'class'} eq 'fm_linkeSpalte'; $p->get_tag('h2'); $school{'name'} = $p->get_text('/h2'); # verify format for school type $tag = $p->get_tag('span'); unless (exists $tag->[1]{'class'} and $tag->[1]{'class'} eq 'schulart_text') { warn "unexpected format: parsing stopped"; last; } $school{'type'} = $p->get_text('/span'); # verify format for address $tag = $p->get_tag('p'); unless (exists $tag->[1]{'class'} and $tag->[1]{'class'} eq 'einzel_text') { warn "unexpected format: parsing stopped"; last; } $school{'address'} = clean_address($p->get_text('/p')); # find the description $tag = $p->get_tag('p'); $school{'description'} = $p->get_text('/p'); } } print qq/$school{'name'}n/; print qq/$school{'location'}n/; print qq/$school{'type'}n/; foreach (@{$school{'address'}}) { print "$_\n"; } print qq/nDescription: $school{'description'}n/; sub clean_address { my $text = shift; my @lines = split "\n", $text; foreach (@lines) { s/^s+//; s/s+$//; } return @lines; }
Quotesuse-linux:/usr/perl # perl perl_script_four.pl
Global symbol "$file" requires explicit package name at perl_script_four.pl line 15.
Execution of perl_script_four.pl aborted due to compilation errors (#1)
(F) You've said "use strict" or "use strict vars", which indicates
that all variables must either be lexically scoped (using "my" or "state"),
declared beforehand using "our", or explicitly qualified to say
which package the global variable is in (using "::").
Uncaught exception from user code:
Global symbol "$file" requires explicit package name at perl_script_four.pl line 15.
Execution of perl_script_four.pl aborted due to compilation errors.
at perl_script_four.pl line 73
suse-linux:/usr/perl #
my $p = HTML::TokeParser->new('einzelergebnis*.html') or die "Can't open: $!";
Quotesuse-linux:/usr/perl # perl perl_script_four.pl
Uncaught exception from user code:
Can't open: No such file or directory at perl_script_four.pl line 15.
at perl_script_four.pl line 15
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
#!/usr/bin/perl use strict; # alles muss definiert sein use warnings; # wenn etwas nicht so ganz richtig ist warnen use diagnostics; # wenn etwas nicht passt ist warnen use File::Find::Rule; # finde Dateien/Verzeichnisse anhand von Regeln use HTML::TokeParser; # parse HTML-Dateien zum leichten auslesen von Daten # Array in der alle Schulen mit ihren Daten gespeichert werden sollen my @schools; # Das Verzeichnis im dem gesucht werden soll my $search_dir='.'; # ist das aktuelle Arbeitsverzeichnis # die Datei in die alles gespeichert werden soll my $out_file='./output.xml'; # Suche nach bestimmten Dateinamen my @files= File::Find::Rule->file() # suche eine Datei ->name('einzelergebnis*.html') # die mit "einzelergebnis" (alles klein geschieben!) beginnt und mit ".html" endet ->in($search_dir); # suche in dem Verzeichnis #gehe alle gefundenen Dateien durch for my $file (@files) { # Ausgabe, damit man weiß waw passiert. print "Bearbeite nun datei: $file!\n"; # Speichrort für die Schuldaten in dieser Datei my %school; # starte seine neue Parser-Instanz mit der Datei als Quelle my $p = HTML::TokeParser->new($file) or die "Can't open $file: ($!)"; #solange ein Tag von Typ 'div' gefunden wird while (my $tag = $p->get_tag('div', '/html')) { # first move to the right div that contains the information last if $tag->[0] eq '/html'; next unless exists $tag->[1]{'id'} and $tag->[1]{'id'} eq 'inhalt_large'; $p->get_tag('h1'); $school{'location'} = $p->get_text('/h1'); while (my $tag = $p->get_tag('div')) { last if exists $tag->[1]{'id'} and $tag->[1]{'id'} eq 'fusszeile'; # get the school name from the heading next unless exists $tag->[1]{'class'} and $tag->[1]{'class'} eq 'fm_linkeSpalte'; $p->get_tag('h2'); $school{'name'} = $p->get_text('/h2'); # verify format for school type $tag = $p->get_tag('span'); unless (exists $tag->[1]{'class'} and $tag->[1]{'class'} eq 'schulart_text') { warn "unexpected format: parsing stopped"; last; } $school{'type'} = $p->get_text('/span'); # verify format for address $tag = $p->get_tag('p'); unless (exists $tag->[1]{'class'} and $tag->[1]{'class'} eq 'einzel_text') { warn "unexpected format: parsing stopped"; last; } $school{'address'} = clean_address($p->get_text('/p')); # find the description $tag = $p->get_tag('p'); $school{'description'} = $p->get_text('/p'); } } # speichere eine refenz auf den Hash mit den Daten der aktuellen Schule im Array mit allen Schulen push(@schools,\%school); } # Ausgabe in eine Datei als einfaches "XML" formatiert: open(my $fh, '>', $out_file) or die("Error open $out_file ($!)\n"); print $fh "<schools>\n"; for my $school (@schools) { print $fh " <school>\n"; print $fh " <name>$school->{name}</name>\n"; print $fh " <location>$school->{location}</location>\n"; print $fh " <type>$school->{type}<type>\n"; print $fh " <address>\n"; for my $address (@{$school->{address}}) { print $fh " <entry>$address</entry>\n"; } print $fh " </address>\n"; print $fh " <description>$school->{description}</description>\n"; print $fh " </school>\n"; } print $fh "</schools>\n"; close($fh); ########################################################################## # Funktion um die Adressen von unnötigen Zeichen zu befreihen # und als Array jede Zeile zurück zu liefern sub clean_address { my $text = shift; my @lines = split "\n", $text; for (@lines) { s/^s+//; s/s+$//; } return \@lines; }
QuoteBearbeite nun datei: htmlfiles/einzelergebnis2592.html!
Bearbeite nun datei: htmlfiles/einzelergebnisdee0.html!
Bearbeite nun datei: htmlfiles/einzelergebnis987b-2.html!
Bearbeite nun datei: htmlfiles/einzelergebnise20b.html!
unexpected format: parsing stopped at perl_script_six.pl line 59.
Bearbeite nun datei: htmlfiles/einzelergebnised05.html!
Bearbeite nun datei: htmlfiles/einzelergebnisdec3.html!
QuoteUse of uninitialized value in concatenation (.) or string at perl_script_six.pl
line 92 (#1)
(W uninitialized) An undefined value was used as if it were already
defined. It was interpreted as a "" or a 0, but maybe it was a mistake.
To suppress this warning assign a defined value to your variables.
To help you figure out what was undefined, perl will try to tell you the
name of the variable (if any) that was undefined. In some cases it cannot
do this, so it also tells you what operation you used the undefined value
in. Note, however, that perl optimizes your program and the operation
displayed in the warning may not necessarily appear literally in your
program. For example, "that $foo" is usually optimized into "that "
. $foo, and the warning will refer to the concatenation (.) operator,
even though there is no . in your program.
Use of uninitialized value in concatenation (.) or string at perl_script_six.pl line 99 (#1)
Wide character in print at perl_script_six.pl line 99 (#2)
(S utf8) Perl met a wide character (>255) when it wasn't expecting
one. This warning is by default on for I/O (like print). The easiest
way to quiet this warning is simply to add the :utf8 layer to the
output, e.g. binmode STDOUT, ':utf8'. Another way to turn off the
warning is to add no warnings 'utf8'; but that is often closer to
cheating. In general, you are supposed to explicitly mark the
filehandle with an encoding, see open and perlfunc/binmode.
Wide character in print at perl_script_six.pl line 90 (#2)
Use of uninitialized value in concatenation (.) or string at
perl_script_six.pl
line 90 (#1)
Use of uninitialized value in concatenation (.) or string at perl_script_six.pl
line 91 (#1)
suse-linux:/usr/perl #
Quote
htmlfiles/einzelergebnis987b-2.html!
Bearbeite nun datei: htmlfiles/einzelergebnise20b.html!
unexpected format: parsing stopped at perl_script_six.pl line 59.
Bearbeite nun datei: htmlfiles/einzelergebnised05.html!
Bearbeite nun datei: htmlfiles/einzelergebnisdec3.html!
Use of uninitialized value in concatenation (.) or string at perl_script_six.pl line 92 (#1)
(W uninitialized) An undefined value was used as if it were already
defined. It was interpreted as a "" or a 0, but maybe it was a mistake.
To suppress this warning assign a defined value to your variables.
To help you figure out what was undefined, perl will try to tell you the
name of the variable (if any) that was undefined. In some cases it cannot
do this, so it also tells you what operation you used the undefined value
in. Note, however, that perl optimizes your program and the operation
displayed in the warning may not necessarily appear literally in your
program. For example, "that $foo" is usually optimized into "that "
$foo, and the warning will refer to the concatenation (.) operator,
even though there is no . in your program.
Use of uninitialized value in concatenation (.) or string at perl_script_six.pl
line 99 (#1)
Wide character in print at perl_script_six.pl line 99 (#2)
(S utf8) Perl met a wide character (>255) when it wasn't expecting
one. This warning is by default on for I/O (like print). The easiest
way to quiet this warning is simply to add the :utf8 layer to the
output, e.g. binmode STDOUT, ':utf8'. Another way to turn off the
warning is to add no warnings 'utf8'; but that is often closer to
cheating. In general, you are supposed to explicitly mark the
filehandle with an encoding, see open and perlfunc/binmode.
Wide character in print at perl_script_six.pl line 90 (#2)
Use of uninitialized value in concatenation (.) or string at perl_script_six.pl
line 90 (#1)
Use of uninitialized value in concatenation (.) or string at perl_script_six.pl
line 91 (#1)
suse-linux:/usr/perl #
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
#!/usr/bin/perl use strict; # alles muss definiert sein use warnings; # wenn etwas nicht so ganz richtig ist warnen use diagnostics; # wenn etwas nicht passt ist warnen use File::Find::Rule; # finde Dateien/Verzeichnisse anhand von Regeln use HTML::TokeParser; # parse HTML-Dateien zum leichten auslesen von Daten # Array in der alle Schulen mit ihren Daten gespeichert werden sollen my @schools; # Das Verzeichnis im dem gesucht werden soll my $search_dir='.'; # ist das aktuelle Arbeitsverzeichnis # die Datei in die alles gespeichert werden soll my $out_file='./output.xml'; # Suche nach bestimmten Dateinamen my @files= File::Find::Rule->file() # suche eine Datei ->name('einzelergebnis*.html') # die mit "einzelergebnis" (alles klein geschieben!) beginnt und mit ".html" endet ->in($search_dir); # suche in dem Verzeichnis #gehe alle gefundenen Dateien durch for my $file (@files) { # Ausgabe, damit man weiß waw passiert. print "Bearbeite nun datei: $file!\n"; # Speichrort für die Schuldaten in dieser Datei my %school; # starte seine neue Parser-Instanz mit der Datei als Quelle my $p = HTML::TokeParser->new($file) or die "Can't open $file: ($!)"; #solange ein Tag von Typ 'div' gefunden wird while (my $tag = $p->get_tag('div', '/html')) { # first move to the right div that contains the information last if $tag->[0] eq '/html'; next unless exists $tag->[1]{'id'} and $tag->[1]{'id'} eq 'inhalt_large'; $p->get_tag('h1'); $school{'location'} = $p->get_text('/h1'); while (my $tag = $p->get_tag('div')) { last if exists $tag->[1]{'id'} and $tag->[1]{'id'} eq 'fusszeile'; # get the school name from the heading next unless exists $tag->[1]{'class'} and $tag->[1]{'class'} eq 'fm_linkeSpalte'; $p->get_tag('h2'); $school{'name'} = $p->get_text('/h2'); # verify format for school type $tag = $p->get_tag('span'); unless (exists $tag->[1]{'class'} and $tag->[1]{'class'} eq 'schulart_text') { warn "unexpected format: parsing stopped"; last; } $school{'type'} = $p->get_text('/span'); # verify format for address $tag = $p->get_tag('p'); unless (exists $tag->[1]{'class'} and $tag->[1]{'class'} eq 'einzel_text') { warn "unexpected format: parsing stopped"; last; } $school{'address'} = clean_address($p->get_text('/p')); # find the description $tag = $p->get_tag('p'); $school{'description'} = $p->get_text('/p'); } } # speichere eine refenz auf den Hash mit den Daten der aktuellen Schule im Array mit allen Schulen push(@schools,\%school); } # Ausgabe in eine Datei als einfaches "XML" formatiert: open(my $fh, '>', $out_file) or die("Error open $out_file ($!)\n"); print $fh "<schools>\n"; for my $school (@schools) { print $fh " <school>\n"; print $fh " <name>$school->{name}</name>\n"; print $fh " <location>$school->{location}</location>\n"; print $fh " <type>$school->{type}<type>\n"; print $fh " <address>\n"; for my $address (@{$school->{address}}) { print $fh " <entry>$address</entry>\n"; } print $fh " </address>\n"; print $fh " <description>$school->{description}</description>\n"; print $fh " </school>\n"; } print $fh "</schools>\n"; close($fh); ########################################################################## # Funktion um die Adressen von unnötigen Zeichen zu befreihen # und als Array jede Zeile zurück zu liefern sub clean_address { my $text = shift; my @lines = split "\n", $text; for (@lines) { s/^s+//; s/s+$//; } return \@lines; }
2010-10-04T22:25:48 topegKannst du dir wenigstens vorstellen woher die nicht definierten Werte kommen? Warum die Warnung "unexpected format: parsing stopped at perl_script_six.pl line 59." auftauchte und was dir "diagnostic" im zweiten Quote sagen will. Ich übernehme nicht deine Arbeit. Lernen musst du schon selber. :-)
QuoteErgänzen würde ich hier noch: homepage (Link auf die Homepage der Schule)
QuoteHeidelberger Fremdsprachen-Institut, Akademie für Sprachen, Wirtschaft und Touristik
Schulart: Berufsbildungseinrichtung
Adresse: 69120 Heidelberg, Helmholtzstrasse 8
Telefon: +49 (0)-6221 6442-0, Fax: +49 (0)-6221 6442-42
SchulWeb-Nummer: 6912001
Sekretariat: Frau Maureen Batt
QuoteName: HIS - Heidelberg.
Schulart:
Adresse: wie etwa hier in einer Zeile: (69120 Heidelberg, Helmholtzstrasse 8) Siehe unten!!
Telefon: +49 (0)-6221 6442-0, Fax: +49 (0)-6221 6442-42 - alles in einer Zeile drinne!
SchulWeb-Nummer: 6912001 - siehe oben {muss ich extra dekodieren]
Im SchulWeb seit: 16.10.2001
1 2 3
# find the description $tag = $p->get_tag('p'); $school{'description'} = $p->get_text('/p');
Quotefm_projekte
ta_left einzel_text (description) -wobei das schon dabei ist... das ist schon berücksichtigt
logo_schulzeitung
1 2 3
# find the description $tag = $p->get_tag('p'); $school{'description'} = $p->get_text('/p');
Quotefm_projekte
ta_left einzel_text (description) -wobei das schon dabei ist... das ist schon berücksichtigt
logo_schulzeitung
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
my @char_list = get_separator( path => $csv_path, exclude => $array1_ref, # optional include => $array2_ref, # optional echo => 1, # optional ); my $separator; if (@char_list) { if (@char_list == 1) { # successful detection $separator = $char_list[0]; } else { # several candidates passed the tests # Some code here } else { # no candidate passed the tests # Some code here } # "I'm Feeling Lucky" alternative interface # Don't forget to include the 'lucky' parameter my $separator = get_separator( path => $csv_path, lucky => 1, exclude => $array1_ref, # optional include => $array2_ref, # optional echo => 1, # optional );
QuoteThis module provides a fast detection of the field separator character (also called field delimiter) of a CSV file, or more generally, of a character separated text file (also called delimited text file), and returns it ready to use in a CSV parser (e.g., Text::CSV_XS, Tie::CSV_File, or Text::CSV::Simple). This may be useful to the vulnerable -and often ignored- population of programmers who need to process automatically CSV files from different sources. The default set of candidates contains the following characters: ',' ';' ':' '|' '\t'