ljr/local/cgi-bin/LJR/unicode.pm

54 lines
1.1 KiB
Perl

use strict;
package LJR::unicode;
use XML::Parser;
use Unicode::MapUTF8 qw(to_utf8 from_utf8 utf8_supported_charset);
sub utf8ize {
my $text_in = shift;
$$text_in = pack("C*", unpack("C*", $$text_in)) if $$text_in;
}
sub force_utf8 {
my $xdata = shift;
my %error_lines;
my $finished = 0;
my @xlines;
my $orig_xdata = $$xdata;
my $p1 = new XML::Parser ();
while (!$finished) {
eval { $p1->parse($$xdata); };
if ($@ && $@ =~ /not\ well\-formed\ \(invalid\ token\)\ at\ line\ (\d+)\,/) {
my $error_line = $1;
$error_lines{$error_line} ++;
if ($error_lines{$error_line} > 1) {
$$xdata = $orig_xdata;
$finished = 1;
}
else {
@xlines = split(/\n/, $$xdata);
my $output = to_utf8({ -string => $xlines[$error_line - 1], -charset => 'latin1' });
$xlines[$error_line - 1] = $output;
$$xdata = join("\n", @xlines);
}
}
# unknown error or no error, doesn't matter
elsif ($@) {
$$xdata = $orig_xdata;
$finished = 1;
}
else {
$finished = 1;
}
}
}
return 1;