Quote: https://metacpan.org/release/OALDERS/HTML-Parser-3... — decode_entities( $string, ... )
This routine replaces HTML entities found in the $string with the corresponding Unicode character.
1 2 3 4 5
# Minimal demonstration: decode an HTML entity and match the result
# against the code point U+00E4 written as \xE4.
use strict;
use warnings;
use 5.020;
use utf8;
use HTML::Entities ();

# The argument has to be the entity itself; a literal "ä" would pass
# through decode_entities() unchanged and demonstrate nothing.
my $s = HTML::Entities::decode_entities("&auml;");

# Prints 1: the decoded string contains the character U+00E4.
say $s =~ /\xE4/;
1
2
3
4
5
6
7
8
9
10
11
12
# Inspect how the result of decode_entities() is stored internally and show
# that matching against \xE4 does not depend on that internal storage.
use strict;
use warnings;
use 5.020;
use utf8;
use HTML::Entities ();
use Devel::Peek;

# The argument has to be the entity itself; a literal "ä" would pass
# through decode_entities() unchanged.
my $s = HTML::Entities::decode_entities("&auml;");

# Concatenating with a character above U+00FF (the snowman) yields a new
# string that can only be stored as UTF-8 internally; dump it for comparison.
Dump "⛄$s";

# Dump the decoded string itself as returned by decode_entities().
Dump $s;

# Switch the internal representation of $s to UTF-8 in place. The logical
# characters of the string stay the same.
utf8::upgrade($s);
say $s =~ /\xE4/;

# Dump again to see the changed representation; the match result printed
# afterwards is expected to be the same as before.
Dump $s;
say $s =~ /\xE4/;
2024-03-19T11:49:29 GwenDragon: Aber \xE4 ist eben das ä in Latin1 und nicht Unicode.
1 2 3 4 5 6 7 8 9 10 11 12
# Check whether the search term occurs in the content. Any exception raised
# inside the eval block leaves $contains_term at its initial value 0.
my $contains_term = 0;
eval {
    use HTML::HTML5::Entities;
    use Encode qw/decode encode/;

    # Decode both inputs from UTF-8 into Perl's internal character strings
    # (search term first, then the content).
    $_ = decode('UTF-8', $_) for $terms, $contents;

    # Resolve the HTML entities in the content.
    $contents = HTML::HTML5::Entities::decode($contents);

    # Does the term match? It is interpolated into the pattern as a regex.
    $contains_term = ($contents =~ /$terms/);
};
2024-03-21T12:19:44 rosti: Mein Tipp: Bleibe bei der Bytesemantik. Dann muß lediglich die Kodierung im Suchausdruck dieselbe sein wie die im zu durchsuchenden Text.
1 2 3 4 5 6 7 8 9 10 11 12 13
# Compare regex matching with character semantics against matching under
# the "bytes" pragma, using the euro sign as the subject string.
use 5.032;

my $euro     = "\N{EURO SIGN}";
my @patterns = ('^.$', '^...$');

# Character semantics. According to the original annotations the
# one-character pattern matches (prints 1) and the three-character
# pattern does not.
for my $pattern (@patterns) {
    say "Characters mit '$pattern': ", $euro =~ /$pattern/;
}

{
    # Byte semantics, limited to this block. According to the original
    # annotations the results are reversed: only the three-character
    # pattern matches.
    use bytes;
    for my $pattern (@patterns) {
        say "Bytes mit '$pattern': ", $euro =~ /$pattern/;
    }
}
2024-03-21T11:38:40 GwenDragon: Hab ich was vergessen?
1
2
3
4
5
6
7
8
9
10
root@srv ~ # hd < t.txt
00000000 54 65 73 74 20 66 c3 bc 72 20 55 6d 6c 61 75 74 |Test f..r Umlaut|
00000010 65 3f 0a 6d 65 74 61 2d 63 72 65 61 74 69 6f 6e |e?.meta-creation|
00000020 5f 64 61 74 65 3a 20 32 30 32 34 2d 30 33 2d 32 |_date: 2024-03-2|
00000030 31 54 31 35 3a 35 34 3a 32 34 0a 0a 46 c3 bc c3 |1T15:54:24..F...|
00000040 9f 65 20 28 61 6c 73 20 55 54 46 2d 38 29 20 75 |.e (als UTF-8) u|
00000050 6e 64 20 26 61 75 6d 6c 3b 26 6f 75 6d 6c 3b 26 |nd &auml;&ouml;&|
00000060 75 75 6d 6c 3b 20 28 6b 6f 64 69 65 72 74 20 61 |uuml; (kodiert a|
00000070 6c 73 20 48 54 4d 4c 2d 45 6e 74 69 74 c3 a4 74 |ls HTML-Entit..t|
00000080 29
2024-03-21T18:14:18 GwenDragon: Ich benutze eigentlich immer Encode::decode('UTF-8', ...), wenn die Datei UTF-8 ist.
Da ist doch dann wohl das Pragma CGI qw/-utf8/ obsolet, oder?
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
# Read t.txt, turn it into a character string with HTML entities resolved,
# and report every position at which "ä" is found, using five different
# ways of writing the same search term.
use 5.024;
use autodie;
use Encode qw( decode encode );
use HTML::Entities;
use URI::Escape;

# Slurp the whole file; autodie makes open() throw on failure.
open(my $fh, '<', 't.txt');
my $raw = do { local $/ = undef; <$fh> };

# Bytes -> characters (dies on malformed UTF-8), then resolve the entities.
my $text = decode('UTF-8', $raw, Encode::FB_CROAK);
$text = decode_entities($text);

# Five spellings of the search term "ä".
my @needles = (
    "\N{LATIN SMALL LETTER A WITH DIAERESIS}",    # by Unicode name
    "\x{e4}",                                     # by hex code point
    chr 228,                                      # by decimal code point
    decode('UTF-8', "\xc3\xa4"),                  # from UTF-8 bytes
    decode('UTF-8', uri_unescape("%c3%a4")),      # from URL-escaped UTF-8
);

for my $needle (@needles) {
    # Encode the term as UTF-8 only for printing it.
    my $shown = encode('UTF-8', $needle);

    # /g in scalar context advances pos() from hit to hit.
    while ($text =~ /$needle/g) {
        printf "Treffer: '%s' an Position %s\n", $shown, pos($text);
    }
}
1
2
3
4
5
6
7
8
9
10
Treffer: 'ä' an Position 81
Treffer: 'ä' an Position 108
Treffer: 'ä' an Position 81
Treffer: 'ä' an Position 108
Treffer: 'ä' an Position 81
Treffer: 'ä' an Position 108
Treffer: 'ä' an Position 81
Treffer: 'ä' an Position 108
Treffer: 'ä' an Position 81
Treffer: 'ä' an Position 108
Quote: $content = decode('UTF-8',$content,Encode::FB_CROAK);
1
2
response.i = 1; // Versuch einer Zuweisung
'class Response' has no member named 'i' // sagt Compiler