1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
# Normalise the fetched page to UTF-8 octets and feed it to the HTML parser.
#
# Reads  $data->{'content'} and $data->{'charset'}.
# Writes $data->{'content'} (replaced by the decoded character string with
#        HTML entities expanded) and $content (UTF-8 octets of that string).
# Nothing is parsed when the content/charset is missing or the charset is
# unknown to Encode.
#
# assumes $data->{'content'} holds raw, undecoded octets -- TODO confirm
my $content = '';
if ($data->{'content'} && $data->{'charset'}) {
    # find_encoding() returns undef for an unknown name, whereas from_to()
    # and decode() die on it. Looking the encoding up first is what makes
    # the "not supported" branch reachable, and it lets utf-8 and every
    # other charset share one code path.
    my $encoding = find_encoding($data->{'charset'});
    if ($encoding) {
        $data->{'content'} = $encoding->decode($data->{'content'});   # make perl characters, Encode.pm
        # NOTE(review): entities are expanded before parsing, so e.g. &lt;
        # reaches the parser as a literal '<' -- confirm this is intended.
        $data->{'content'} = decode_entities($data->{'content'});     # add signs from HTML::Entities
        $content = encode("utf-8", $data->{'content'});               # make octets for disk-writing, Encode.pm
    } else {
        _log_message("Charset $data->{'charset'} not supported", 1);
    }
}

if (length($content)) {
    $p->parse($content);   # HTML::Parser ( can('utf8_mode') && $p->utf8_mode(1) )
}
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
# Decode the fetched page into a Perl character string and feed it to the
# HTML parser.
#
# Reads  $data->{'content'} and $data->{'charset'}.
# Writes $data->{'content'} (replaced by the decoded character string with
#        HTML entities expanded) and $content (the same character string).
# Nothing is parsed when the content/charset is missing or the charset is
# unknown to Encode.
#
# assumes $data->{'content'} holds raw, undecoded octets -- TODO confirm
my $content = '';
if ($data->{'content'} && $data->{'charset'}) {
    # find_encoding() returns undef for an unknown name, whereas from_to()
    # and decode() die on it. Looking the encoding up first is what makes
    # the "not supported" branch reachable, and it lets utf-8 and every
    # other charset share one code path.
    my $encoding = find_encoding($data->{'charset'});
    if ($encoding) {
        $data->{'content'} = $encoding->decode($data->{'content'});   # make perl characters, Encode.pm
        # NOTE(review): entities are expanded before parsing, so e.g. &lt;
        # reaches the parser as a literal '<' -- confirm this is intended.
        $data->{'content'} = decode_entities($data->{'content'});     # add signs from HTML::Entities
        $content = $data->{'content'};
    } else {
        _log_message("Charset $data->{'charset'} not supported", 1);
    }
}

if (length($content)) {
    # NOTE(review): $content is a character string here, not UTF-8 octets;
    # HTML::Parser's utf8_mode is meant for undecoded UTF-8 -- confirm it is
    # not enabled on $p for this variant.
    $p->parse($content);   # HTML::Parser ( can('utf8_mode') && $p->utf8_mode(1) )
}
Quote: Wide character in subroutine entry at ./crawler line 156 (#1)
(W utf8) Perl met a wide character (>255) when it wasn't expecting
one. This warning is by default on for I/O (like print). The easiest
way to quiet this warning is simply to add the :utf8 layer to the
output, e.g. binmode STDOUT, ':utf8'. Another way to turn off the
warning is to add no warnings 'utf8'; but that is often closer to
cheating. In general, you are supposed to explicitly mark the
filehandle with an encoding, see open and perlfunc/binmode.
Uncaught exception from user code:
Wide character in subroutine entry at ./crawler line 156.
at ./crawler line 155
# Execute the prepared statement with the page URL, title, keywords,
# description and all words joined into one string by the list separator.
# Dies with the statement handle's error string when execute() returns false.
$sql_res = $sth_1->execute(
    $data->{'url'},
    $title,
    $keywords,
    $description,
    join($", @all_words),
);
die($sth_1->errstr()) unless $sql_res;
# Execute the prepared statement, passing every bind value through
# decode("utf-8", ...) first. Dies with the statement handle's error string
# when execute() returns false.
#
# NOTE(review): decode() expects UTF-8 octets. If these values are already
# Perl character strings, decoding them again may itself be what raises
# "Wide character in subroutine entry" -- inspect them with Devel::Peek and
# confirm what the database driver expects.
$sql_res = $sth_1->execute(
    map { decode("utf-8", $_) }
        $data->{'url'}, $title, $keywords, $description, "@all_words"
) or die($sth_1->errstr());
2009-10-13T21:35:54 pq: bei sowas hilft immer Devel::Peek, um zu gucken, was genau für bytes in dem string stehen und ob er das utf8-flag hat.
1 2 3 4 5 6 7 8 9
# On a successful response, fill $result with the request URI ('url'), the
# charset returned by _suggest_charset() ('charset') and the body returned by
# decoded_content() for that charset ('content'), then return $result.
# The failure branch is elided in this excerpt.
#
# NOTE(review): decoded_content() presumably already returns a decoded
# character string; if the consumer decodes 'content' again using 'charset',
# that would be a double decode -- confirm, and consider requesting raw
# octets (charset => 'none') if the caller does the decoding itself.
if($res->is_success){ $result->{'url'} = $res->request->uri; $result->{'charset'} = _suggest_charset($res); $result->{'content'} = $res->decoded_content(charset => $result->{'charset'}); return $result; }else{ ..... }