1
2
3
4
$ echo -e "\x55\xcc\x88"|utf8-dump
U\N{COMBINING DIAERESIS}
$ echo -e "\x55\xcc\x88"| perl -CS -MUnicode::Normalize -pE '$_ = NFC($_)'|utf8-dump
\N{LATIN CAPITAL LETTER U WITH DIAERESIS}
1 2 3 4 5 6 7 8
... s{([^\0-\177])}{N_escape($1)}eg; ... sub N_escape { my $n = charnames::viacode(ord($_[0])); return defined($n) ? "\\N{$n}" : sprintf('\x{%x}', ord($_[0])); } ...
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
#!/usr/bin/perl
# Demonstration script: follows one UTF-8 encoded byte string through
# decoding, re-encoding as ISO-8859-1, NFC normalization and \N{...} escaping.
# Every stage is printed to STDERR together with a \x{..} dump of its
# elements, so the difference between bytes and characters becomes visible.
use strict;
use warnings;
use charnames ();
use Encode qw(encode decode_utf8);
use Unicode::Normalize;

# N_escape($char)
# Returns a printable escape for the first character of $char:
# \N{NAME} when charnames::viacode() knows a name for the code point,
# otherwise the hexadecimal form \x{...}.
sub N_escape {
    my $code = ord( $_[0] );
    my $name = charnames::viacode($code);

    # Single quotes keep the backslash literal; only %s / %x are filled in.
    return defined($name)
        ? sprintf( '\N{%s}', $name )
        : sprintf( '\x{%x}', $code );
}

# my_print($string)
# Returns $string rewritten as a sequence of \x{hex} escapes, one per
# element of the string (bytes for an undecoded string, characters for a
# decoded one). An empty string yields an empty string.
sub my_print {
    my ($string) = @_;
    return join '', map { sprintf( '\x{%x}', ord($_) ) } split( //, $string );
}

# main()
# Runs the demonstration. No output layer is set on STDERR on purpose: the
# raw strings are printed as they are, so printing a decoded string that
# contains a character above 0xFF triggers Perl's "Wide character" warning.
sub main {
    # Undecoded bytes: "U" followed by the UTF-8 bytes of U+0308, then two
    # words whose non-ASCII letters are UTF-8 encoded as well.
    my $s1 = "\x55\xcc\x88bel \xc3\x9cbel gro\xc3\x9fe";
    print STDERR "string: $s1 (" . my_print($s1) . ")\n\n";

    # Bytes -> characters.
    my $s2 = decode_utf8($s1);
    print STDERR "decode_utf8: $s2 (" . my_print($s2) . ")\n\n";

    # Characters -> ISO-8859-1 bytes.
    my $s2b = encode( "iso-8859-1", decode_utf8($s1) );
    print STDERR "ISO-8859-1: $s2b (" . my_print($s2b) . ")\n\n";

    # Canonical composition of the decoded string.
    my $s3 = NFC($s2);
    print STDERR "NFC: $s3 (" . my_print($s3) . ")\n\n";

    # Replace every non-ASCII character, one at a time, by its escape.
    my $s4 = $s3;
    $s4 =~ s{([^\0-\177])}{N_escape($1)}eg;
    print STDERR "Unicode names: $s4 (...)\n\n";

    return;
}

# Run the demonstration only when executed as a script, so the helper subs
# can be loaded (e.g. by a test) without producing output.
main() unless caller();
1 2 3 4 5 6 7 8 9 10
string: Übel Übel große (\x{55}\x{cc}\x{88}\x{62}\x{65}\x{6c}\x{20}\x{c3}\x{9c}\x{62}\x{65}\x{6c}\x{20}\x{67}\x{72}\x{6f}\x{c3}\x{9f}\x{65}) Wide character in print at C:\projekte\perl\unicode\uni_test5.pl line 12. decode_utf8: Übel Ãœbel große (\x{55}\x{308}\x{62}\x{65}\x{6c}\x{20}\x{dc}\x{62}\x{65}\x{6c}\x{20}\x{67}\x{72}\x{6f}\x{df}\x{65}) ISO-8859-1: U?bel Übel große (\x{55}\x{3f}\x{62}\x{65}\x{6c}\x{20}\x{dc}\x{62}\x{65}\x{6c}\x{20}\x{67}\x{72}\x{6f}\x{df}\x{65}) NFC: Übel Übel große (\x{dc}\x{62}\x{65}\x{6c}\x{20}\x{dc}\x{62}\x{65}\x{6c}\x{20}\x{67}\x{72}\x{6f}\x{df}\x{65}) Unicode names: \N{LATIN CAPITAL LETTER U WITH DIAERESIS}bel \N{LATIN CAPITAL LETTER U WITH DIAERESIS}bel gro\N{LATIN SMALL LETTER SHARP S}e (...)
2012-02-25T20:28:12 perlot: Frage noch an Moritz: Du benutzt folgenden Code.
Code (perl): (dl )1 2 3 4 5 6 7 8... s{([^\0-\177])}{N_escape($1)}eg; ... sub N_escape { my $n = charnames::viacode(ord($_[0])); return defined($n) ? "\\N{$n}" : sprintf('\x{%x}', ord($_[0])); } ...
Wie schafft es der Ersetzungsstring, dass 3 Bytes zu einem Charnamen aufgelöst werden können, obwohl N_escape() nur zeichenweise arbeitet?
Magie?