| Filename | /usr/lib/x86_64-linux-gnu/perl/5.20/Encode.pm |
| Statements | Executed 0 statements in 0s |
| Line | Statements | Time on line | Calls | Time in subs | Code |
|---|---|---|---|---|---|
| 1 | # | ||||
| 2 | # $Id: Encode.pm,v 2.60 2014/04/29 16:26:49 dankogai Exp dankogai $ | ||||
| 3 | # | ||||
| 4 | package Encode; | ||||
| 5 | use strict; | ||||
| 6 | use warnings; | ||||
| 7 | our $VERSION = sprintf "%d.%02d", q$Revision: 2.60_01 $ =~ /(\d+)/g; | ||||
| 8 | use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG}; | ||||
| 9 | use XSLoader (); | ||||
| 10 | XSLoader::load( __PACKAGE__, $VERSION ); | ||||
| 11 | |||||
| 12 | use Exporter 5.57 'import'; | ||||
| 13 | |||||
| 14 | # Public, encouraged API is exported by default | ||||
| 15 | |||||
| 16 | our @EXPORT = qw( | ||||
| 17 | decode decode_utf8 encode encode_utf8 str2bytes bytes2str | ||||
| 18 | encodings find_encoding clone_encoding | ||||
| 19 | ); | ||||
| 20 | our @FB_FLAGS = qw( | ||||
| 21 | DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC | ||||
| 22 | PERLQQ HTMLCREF XMLCREF STOP_AT_PARTIAL | ||||
| 23 | ); | ||||
| 24 | our @FB_CONSTS = qw( | ||||
| 25 | FB_DEFAULT FB_CROAK FB_QUIET FB_WARN | ||||
| 26 | FB_PERLQQ FB_HTMLCREF FB_XMLCREF | ||||
| 27 | ); | ||||
| 28 | our @EXPORT_OK = ( | ||||
| 29 | qw( | ||||
| 30 | _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit | ||||
| 31 | is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade | ||||
| 32 | ), | ||||
| 33 | @FB_FLAGS, @FB_CONSTS, | ||||
| 34 | ); | ||||
| 35 | |||||
| 36 | our %EXPORT_TAGS = ( | ||||
| 37 | all => [ @EXPORT, @EXPORT_OK ], | ||||
| 38 | default => [ @EXPORT ], | ||||
| 39 | fallbacks => [ @FB_CONSTS ], | ||||
| 40 | fallback_all => [ @FB_CONSTS, @FB_FLAGS ], | ||||
| 41 | ); | ||||
| 42 | |||||
| 43 | # Documentation moved after __END__ for speed - NI-S | ||||
| 44 | |||||
| 45 | our $ON_EBCDIC = ( ord("A") == 193 ); | ||||
| 46 | |||||
| 47 | use Encode::Alias; | ||||
| 48 | |||||
| 49 | # Make a %Encoding package variable to allow a certain amount of cheating | ||||
| 50 | our %Encoding; | ||||
| 51 | our %ExtModule; | ||||
| 52 | require Encode::Config; | ||||
| 53 | # See | ||||
| 54 | # https://bugzilla.redhat.com/show_bug.cgi?id=435505#c2 | ||||
| 55 | # to find why sig handlers inside eval{} are disabled. | ||||
| 56 | eval { | ||||
| 57 | local $SIG{__DIE__}; | ||||
| 58 | local $SIG{__WARN__}; | ||||
| 59 | require Encode::ConfigLocal; | ||||
| 60 | }; | ||||
| 61 | |||||
| 62 | sub encodings { | ||||
| 63 | my %enc; | ||||
| 64 | my $arg = $_[1] || ''; | ||||
| 65 | if ( $arg eq ":all" ) { | ||||
| 66 | %enc = ( %Encoding, %ExtModule ); | ||||
| 67 | } | ||||
| 68 | else { | ||||
| 69 | %enc = %Encoding; | ||||
| 70 | for my $mod ( map { m/::/ ? $_ : "Encode::$_" } @_ ) { | ||||
| 71 | DEBUG and warn $mod; | ||||
| 72 | for my $enc ( keys %ExtModule ) { | ||||
| 73 | $ExtModule{$enc} eq $mod and $enc{$enc} = $mod; | ||||
| 74 | } | ||||
| 75 | } | ||||
| 76 | } | ||||
| 77 | return sort { lc $a cmp lc $b } | ||||
| 78 | grep { !/^(?:Internal|Unicode|Guess)$/o } keys %enc; | ||||
| 79 | } | ||||
| 80 | |||||
| 81 | sub perlio_ok { | ||||
| 82 | my $obj = ref( $_[0] ) ? $_[0] : find_encoding( $_[0] ); | ||||
| 83 | $obj->can("perlio_ok") and return $obj->perlio_ok(); | ||||
| 84 | return 0; # safety net | ||||
| 85 | } | ||||
| 86 | |||||
| 87 | sub define_encoding { | ||||
| 88 | my $obj = shift; | ||||
| 89 | my $name = shift; | ||||
| 90 | $Encoding{$name} = $obj; | ||||
| 91 | my $lc = lc($name); | ||||
| 92 | define_alias( $lc => $obj ) unless $lc eq $name; | ||||
| 93 | while (@_) { | ||||
| 94 | my $alias = shift; | ||||
| 95 | define_alias( $alias, $obj ); | ||||
| 96 | } | ||||
| 97 | return $obj; | ||||
| 98 | } | ||||
| 99 | |||||
| 100 | sub getEncoding { | ||||
| 101 | my ( $class, $name, $skip_external ) = @_; | ||||
| 102 | |||||
| 103 | $name =~ s/\s+//g; # https://rt.cpan.org/Ticket/Display.html?id=65796 | ||||
| 104 | |||||
| 105 | ref($name) && $name->can('renew') and return $name; | ||||
| 106 | exists $Encoding{$name} and return $Encoding{$name}; | ||||
| 107 | my $lc = lc $name; | ||||
| 108 | exists $Encoding{$lc} and return $Encoding{$lc}; | ||||
| 109 | |||||
| 110 | my $oc = $class->find_alias($name); | ||||
| 111 | defined($oc) and return $oc; | ||||
| 112 | $lc ne $name and $oc = $class->find_alias($lc); | ||||
| 113 | defined($oc) and return $oc; | ||||
| 114 | |||||
| 115 | unless ($skip_external) { | ||||
| 116 | if ( my $mod = $ExtModule{$name} || $ExtModule{$lc} ) { | ||||
| 117 | $mod =~ s,::,/,g; | ||||
| 118 | $mod .= '.pm'; | ||||
| 119 | eval { require $mod; }; | ||||
| 120 | exists $Encoding{$name} and return $Encoding{$name}; | ||||
| 121 | } | ||||
| 122 | } | ||||
| 123 | return; | ||||
| 124 | } | ||||
| 125 | |||||
| 126 | sub find_encoding($;$) { | ||||
| 127 | my ( $name, $skip_external ) = @_; | ||||
| 128 | return __PACKAGE__->getEncoding( $name, $skip_external ); | ||||
| 129 | } | ||||
| 130 | |||||
| 131 | sub resolve_alias($) { | ||||
| 132 | my $obj = find_encoding(shift); | ||||
| 133 | defined $obj and return $obj->name; | ||||
| 134 | return; | ||||
| 135 | } | ||||
| 136 | |||||
| 137 | sub clone_encoding($) { | ||||
| 138 | my $obj = find_encoding(shift); | ||||
| 139 | ref $obj or return; | ||||
| 140 | eval { require Storable }; | ||||
| 141 | $@ and return; | ||||
| 142 | return Storable::dclone($obj); | ||||
| 143 | } | ||||
| 144 | |||||
| 145 | sub encode($$;$) { | ||||
| 146 | my ( $name, $string, $check ) = @_; | ||||
| 147 | return undef unless defined $string; | ||||
| 148 | 12 | 24µs | $string .= ''; # stringify; # spent 24µs making 12 calls to Text::MicroTemplate::EncodedString::__ANON__, avg 2µs/call | ||
| 149 | $check ||= 0; | ||||
| 150 | unless ( defined $name ) { | ||||
| 151 | require Carp; | ||||
| 152 | Carp::croak("Encoding name should not be undef"); | ||||
| 153 | } | ||||
| 154 | my $enc = find_encoding($name); | ||||
| 155 | unless ( defined $enc ) { | ||||
| 156 | require Carp; | ||||
| 157 | Carp::croak("Unknown encoding '$name'"); | ||||
| 158 | } | ||||
| 159 | my $octets = $enc->encode( $string, $check ); | ||||
| 160 | $_[1] = $string if $check and !ref $check and !( $check & LEAVE_SRC() ); | ||||
| 161 | return $octets; | ||||
| 162 | } | ||||
| 163 | *str2bytes = \&encode; | ||||
| 164 | |||||
| 165 | sub decode($$;$) { | ||||
| 166 | my ( $name, $octets, $check ) = @_; | ||||
| 167 | return undef unless defined $octets; | ||||
| 168 | $octets .= ''; | ||||
| 169 | $check ||= 0; | ||||
| 170 | my $enc = find_encoding($name); | ||||
| 171 | unless ( defined $enc ) { | ||||
| 172 | require Carp; | ||||
| 173 | Carp::croak("Unknown encoding '$name'"); | ||||
| 174 | } | ||||
| 175 | 90 | 208µs | my $string = $enc->decode( $octets, $check ); # spent 208µs making 90 calls to Encode::Encoding::renewed, avg 2µs/call | ||
| 176 | $_[1] = $octets if $check and !ref $check and !( $check & LEAVE_SRC() ); | ||||
| 177 | return $string; | ||||
| 178 | } | ||||
| 179 | *bytes2str = \&decode; | ||||
| 180 | |||||
| 181 | sub from_to($$$;$) { | ||||
| 182 | my ( $string, $from, $to, $check ) = @_; | ||||
| 183 | return undef unless defined $string; | ||||
| 184 | $check ||= 0; | ||||
| 185 | my $f = find_encoding($from); | ||||
| 186 | unless ( defined $f ) { | ||||
| 187 | require Carp; | ||||
| 188 | Carp::croak("Unknown encoding '$from'"); | ||||
| 189 | } | ||||
| 190 | my $t = find_encoding($to); | ||||
| 191 | unless ( defined $t ) { | ||||
| 192 | require Carp; | ||||
| 193 | Carp::croak("Unknown encoding '$to'"); | ||||
| 194 | } | ||||
| 195 | my $uni = $f->decode($string); | ||||
| 196 | $_[0] = $string = $t->encode( $uni, $check ); | ||||
| 197 | return undef if ( $check && length($uni) ); | ||||
| 198 | return defined( $_[0] ) ? length($string) : undef; | ||||
| 199 | } | ||||
| 200 | |||||
| 201 | sub encode_utf8($) { | ||||
| 202 | my ($str) = @_; | ||||
| 203 | utf8::encode($str); | ||||
| 204 | return $str; | ||||
| 205 | } | ||||
| 206 | |||||
| 207 | my $utf8enc; | ||||
| 208 | |||||
| 209 | sub decode_utf8($;$) { | ||||
| 210 | my ( $octets, $check ) = @_; | ||||
| 211 | return undef unless defined $octets; | ||||
| 212 | $octets .= ''; | ||||
| 213 | $check ||= 0; | ||||
| 214 | $utf8enc ||= find_encoding('utf8'); | ||||
| 215 | my $string = $utf8enc->decode( $octets, $check ); | ||||
| 216 | $_[0] = $octets if $check and !ref $check and !( $check & LEAVE_SRC() ); | ||||
| 217 | return $string; | ||||
| 218 | } | ||||
| 219 | |||||
| 220 | # sub decode_utf8($;$) { | ||||
| 221 | # my ( $str, $check ) = @_; | ||||
| 222 | # return $str if is_utf8($str); | ||||
| 223 | # if ($check) { | ||||
| 224 | # return decode( "utf8", $str, $check ); | ||||
| 225 | # } | ||||
| 226 | # else { | ||||
| 227 | # return decode( "utf8", $str ); | ||||
| 228 | # return $str; | ||||
| 229 | # } | ||||
| 230 | # } | ||||
| 231 | |||||
| 232 | predefine_encodings(1); | ||||
| 233 | |||||
| 234 | # | ||||
| 235 | # This is to restore %Encoding if really needed; | ||||
| 236 | # | ||||
| 237 | |||||
| 238 | sub predefine_encodings { | ||||
| 239 | require Encode::Encoding; | ||||
| 240 | no warnings 'redefine'; | ||||
| 241 | my $use_xs = shift; | ||||
| 242 | if ($ON_EBCDIC) { | ||||
| 243 | |||||
| 244 | # was in Encode::UTF_EBCDIC | ||||
| 245 | package Encode::UTF_EBCDIC; | ||||
| 246 | push @Encode::UTF_EBCDIC::ISA, 'Encode::Encoding'; | ||||
| 247 | *decode = sub { | ||||
| 248 | my ( undef, $str, $chk ) = @_; | ||||
| 249 | my $res = ''; | ||||
| 250 | for ( my $i = 0 ; $i < length($str) ; $i++ ) { | ||||
| 251 | $res .= | ||||
| 252 | chr( | ||||
| 253 | utf8::unicode_to_native( ord( substr( $str, $i, 1 ) ) ) | ||||
| 254 | ); | ||||
| 255 | } | ||||
| 256 | $_[1] = '' if $chk; | ||||
| 257 | return $res; | ||||
| 258 | }; | ||||
| 259 | *encode = sub { | ||||
| 260 | my ( undef, $str, $chk ) = @_; | ||||
| 261 | my $res = ''; | ||||
| 262 | for ( my $i = 0 ; $i < length($str) ; $i++ ) { | ||||
| 263 | $res .= | ||||
| 264 | chr( | ||||
| 265 | utf8::native_to_unicode( ord( substr( $str, $i, 1 ) ) ) | ||||
| 266 | ); | ||||
| 267 | } | ||||
| 268 | $_[1] = '' if $chk; | ||||
| 269 | return $res; | ||||
| 270 | }; | ||||
| 271 | $Encode::Encoding{Unicode} = | ||||
| 272 | bless { Name => "UTF_EBCDIC" } => "Encode::UTF_EBCDIC"; | ||||
| 273 | } | ||||
| 274 | else { | ||||
| 275 | |||||
| 276 | package Encode::Internal; | ||||
| 277 | push @Encode::Internal::ISA, 'Encode::Encoding'; | ||||
| 278 | *decode = sub { | ||||
| 279 | my ( undef, $str, $chk ) = @_; | ||||
| 280 | utf8::upgrade($str); | ||||
| 281 | $_[1] = '' if $chk; | ||||
| 282 | return $str; | ||||
| 283 | }; | ||||
| 284 | *encode = \&decode; | ||||
| 285 | $Encode::Encoding{Unicode} = | ||||
| 286 | bless { Name => "Internal" } => "Encode::Internal"; | ||||
| 287 | } | ||||
| 288 | |||||
| 289 | { | ||||
| 290 | |||||
| 291 | # was in Encode::utf8 | ||||
| 292 | package Encode::utf8; | ||||
| 293 | push @Encode::utf8::ISA, 'Encode::Encoding'; | ||||
| 294 | |||||
| 295 | # | ||||
| 296 | if ($use_xs) { | ||||
| 297 | Encode::DEBUG and warn __PACKAGE__, " XS on"; | ||||
| 298 | *decode = \&decode_xs; | ||||
| 299 | *encode = \&encode_xs; | ||||
| 300 | } | ||||
| 301 | else { | ||||
| 302 | Encode::DEBUG and warn __PACKAGE__, " XS off"; | ||||
| 303 | *decode = sub { | ||||
| 304 | my ( undef, $octets, $chk ) = @_; | ||||
| 305 | my $str = Encode::decode_utf8($octets); | ||||
| 306 | if ( defined $str ) { | ||||
| 307 | $_[1] = '' if $chk; | ||||
| 308 | return $str; | ||||
| 309 | } | ||||
| 310 | return undef; | ||||
| 311 | }; | ||||
| 312 | *encode = sub { | ||||
| 313 | my ( undef, $string, $chk ) = @_; | ||||
| 314 | my $octets = Encode::encode_utf8($string); | ||||
| 315 | $_[1] = '' if $chk; | ||||
| 316 | return $octets; | ||||
| 317 | }; | ||||
| 318 | } | ||||
| 319 | *cat_decode = sub { # ($obj, $dst, $src, $pos, $trm, $chk) | ||||
| 320 | # currently ignores $chk | ||||
| 321 | my ( undef, undef, undef, $pos, $trm ) = @_; | ||||
| 322 | my ( $rdst, $rsrc, $rpos ) = \@_[ 1, 2, 3 ]; | ||||
| 323 | use bytes; | ||||
| 324 | if ( ( my $npos = index( $$rsrc, $trm, $pos ) ) >= 0 ) { | ||||
| 325 | $$rdst .= | ||||
| 326 | substr( $$rsrc, $pos, $npos - $pos + length($trm) ); | ||||
| 327 | $$rpos = $npos + length($trm); | ||||
| 328 | return 1; | ||||
| 329 | } | ||||
| 330 | $$rdst .= substr( $$rsrc, $pos ); | ||||
| 331 | $$rpos = length($$rsrc); | ||||
| 332 | return ''; | ||||
| 333 | }; | ||||
| 334 | $Encode::Encoding{utf8} = | ||||
| 335 | bless { Name => "utf8" } => "Encode::utf8"; | ||||
| 336 | $Encode::Encoding{"utf-8-strict"} = | ||||
| 337 | bless { Name => "utf-8-strict", strict_utf8 => 1 } | ||||
| 338 | => "Encode::utf8"; | ||||
| 339 | } | ||||
| 340 | } | ||||
| 341 | |||||
| 342 | 1; | ||||
| 343 | |||||
| 344 | __END__ | ||||
| 345 | |||||
| 346 | =head1 NAME | ||||
| 347 | |||||
| 348 | Encode - character encodings in Perl | ||||
| 349 | |||||
| 350 | =head1 SYNOPSIS | ||||
| 351 | |||||
| 352 | use Encode qw(decode encode); | ||||
| 353 | $characters = decode('UTF-8', $octets, Encode::FB_CROAK); | ||||
| 354 | $octets = encode('UTF-8', $characters, Encode::FB_CROAK); | ||||
| 355 | |||||
| 356 | =head2 Table of Contents | ||||
| 357 | |||||
| 358 | Encode consists of a collection of modules whose details are too extensive | ||||
| 359 | to fit in one document. This one itself explains the top-level APIs | ||||
| 360 | and general topics at a glance. For other topics and more details, | ||||
| 361 | see the documentation for these modules: | ||||
| 362 | |||||
| 363 | =over 2 | ||||
| 364 | |||||
| 365 | =item L<Encode::Alias> - Alias definitions to encodings | ||||
| 366 | |||||
| 367 | =item L<Encode::Encoding> - Encode Implementation Base Class | ||||
| 368 | |||||
| 369 | =item L<Encode::Supported> - List of Supported Encodings | ||||
| 370 | |||||
| 371 | =item L<Encode::CN> - Simplified Chinese Encodings | ||||
| 372 | |||||
| 373 | =item L<Encode::JP> - Japanese Encodings | ||||
| 374 | |||||
| 375 | =item L<Encode::KR> - Korean Encodings | ||||
| 376 | |||||
| 377 | =item L<Encode::TW> - Traditional Chinese Encodings | ||||
| 378 | |||||
| 379 | =back | ||||
| 380 | |||||
| 381 | =head1 DESCRIPTION | ||||
| 382 | |||||
| 383 | The C<Encode> module provides the interface between Perl strings | ||||
| 384 | and the rest of the system. Perl strings are sequences of | ||||
| 385 | I<characters>. | ||||
| 386 | |||||
| 387 | The repertoire of characters that Perl can represent is a superset of those | ||||
| 388 | defined by the Unicode Consortium. On most platforms the ordinal | ||||
| 389 | value of a character as returned by C<ord(I<S>)> is the I<Unicode | ||||
| 390 | codepoint> for that character. The exceptions are platforms where | ||||
| 391 | the legacy encoding is some variant of EBCDIC rather than a superset | ||||
| 392 | of ASCII; see L<perlebcdic>. | ||||
| 393 | |||||
| 394 | During recent history, data is moved around a computer in 8-bit chunks, | ||||
| 395 | often called "bytes" but also known as "octets" in standards documents. | ||||
| 396 | Perl is widely used to manipulate data of many types: not only strings of | ||||
| 397 | characters representing human or computer languages, but also "binary" | ||||
| 398 | data, being the machine's representation of numbers, pixels in an image, or | ||||
| 399 | just about anything. | ||||
| 400 | |||||
| 401 | When Perl is processing "binary data", the programmer wants Perl to | ||||
| 402 | process "sequences of bytes". This is not a problem for Perl: because a | ||||
| 403 | byte has 256 possible values, it easily fits in Perl's much larger | ||||
| 404 | "logical character". | ||||
| 405 | |||||
| 406 | This document mostly explains the I<how>. L<perlunitut> and L<perlunifaq> | ||||
| 407 | explain the I<why>. | ||||
| 408 | |||||
| 409 | =head2 TERMINOLOGY | ||||
| 410 | |||||
| 411 | =head3 character | ||||
| 412 | |||||
| 413 | A character in the range 0 .. 2**32-1 (or more); | ||||
| 414 | what Perl's strings are made of. | ||||
| 415 | |||||
| 416 | =head3 byte | ||||
| 417 | |||||
| 418 | A character in the range 0..255; | ||||
| 419 | a special case of a Perl character. | ||||
| 420 | |||||
| 421 | =head3 octet | ||||
| 422 | |||||
| 423 | 8 bits of data, with ordinal values 0..255; | ||||
| 424 | term for bytes passed to or from a non-Perl context, such as a disk file, | ||||
| 425 | standard I/O stream, database, command-line argument, environment variable, | ||||
| 426 | socket etc. | ||||
| 427 | |||||
| 428 | =head1 THE PERL ENCODING API | ||||
| 429 | |||||
| 430 | =head2 Basic methods | ||||
| 431 | |||||
| 432 | =head3 encode | ||||
| 433 | |||||
| 434 | $octets = encode(ENCODING, STRING[, CHECK]) | ||||
| 435 | |||||
| 436 | Encodes the scalar value I<STRING> from Perl's internal form into | ||||
| 437 | I<ENCODING> and returns a sequence of octets. I<ENCODING> can be either a | ||||
| 438 | canonical name or an alias. For encoding names and aliases, see | ||||
| 439 | L</"Defining Aliases">. For CHECK, see L</"Handling Malformed Data">. | ||||
| 440 | |||||
| 441 | For example, to convert a string from Perl's internal format into | ||||
| 442 | ISO-8859-1, also known as Latin1: | ||||
| 443 | |||||
| 444 | $octets = encode("iso-8859-1", $string); | ||||
| 445 | |||||
| 446 | B<CAVEAT>: When you run C<$octets = encode("utf8", $string)>, then | ||||
| 447 | $octets I<might not be equal to> $string. Though both contain the | ||||
| 448 | same data, the UTF8 flag for $octets is I<always> off. When you | ||||
| 449 | encode anything, the UTF8 flag on the result is always off, even when it | ||||
| 450 | contains a completely valid utf8 string. See L</"The UTF8 flag"> below. | ||||
| 451 | |||||
| 452 | If the $string is C<undef>, then C<undef> is returned. | ||||
| 453 | |||||
| 454 | =head3 decode | ||||
| 455 | |||||
| 456 | $string = decode(ENCODING, OCTETS[, CHECK]) | ||||
| 457 | |||||
| 458 | This function returns the string that results from decoding the scalar | ||||
| 459 | value I<OCTETS>, assumed to be a sequence of octets in I<ENCODING>, into | ||||
| 460 | Perl's internal form. It returns the resulting string. As with encode(), | ||||
| 461 | I<ENCODING> can be either a canonical name or an alias. For encoding names | ||||
| 462 | and aliases, see L</"Defining Aliases">; for I<CHECK>, see L</"Handling | ||||
| 463 | Malformed Data">. | ||||
| 464 | |||||
| 465 | For example, to convert ISO-8859-1 data into a string in Perl's | ||||
| 466 | internal format: | ||||
| 467 | |||||
| 468 | $string = decode("iso-8859-1", $octets); | ||||
| 469 | |||||
| 470 | B<CAVEAT>: When you run C<$string = decode("utf8", $octets)>, then $string | ||||
| 471 | I<might not be equal to> $octets. Though both contain the same data, the | ||||
| 472 | UTF8 flag for $string is on. See L</"The UTF8 flag"> | ||||
| 473 | below. | ||||
| 474 | |||||
| 475 | If $octets is C<undef>, then C<undef> is returned. | ||||
| 476 | |||||
| 477 | =head3 find_encoding | ||||
| 478 | |||||
| 479 | [$obj =] find_encoding(ENCODING) | ||||
| 480 | |||||
| 481 | Returns the I<encoding object> corresponding to I<ENCODING>. Returns | ||||
| 482 | C<undef> if no matching I<ENCODING> is found. The returned object is | ||||
| 483 | what does the actual encoding or decoding. | ||||
| 484 | |||||
| 485 | $utf8 = decode($name, $bytes); | ||||
| 486 | |||||
| 487 | is in fact | ||||
| 488 | |||||
| 489 | $utf8 = do { | ||||
| 490 | $obj = find_encoding($name); | ||||
| 491 | croak qq(encoding "$name" not found) unless ref $obj; | ||||
| 492 | $obj->decode($bytes); | ||||
| 493 | }; | ||||
| 494 | |||||
| 495 | with more error checking. | ||||
| 496 | |||||
| 497 | You can therefore save time by reusing this object as follows: | ||||
| 498 | |||||
| 499 | my $enc = find_encoding("iso-8859-1"); | ||||
| 500 | while(<>) { | ||||
| 501 | my $utf8 = $enc->decode($_); | ||||
| 502 | ... # now do something with $utf8; | ||||
| 503 | } | ||||
| 504 | |||||
| 505 | Besides L</decode> and L</encode>, other methods are | ||||
| 506 | available as well. For instance, C<name()> returns the canonical | ||||
| 507 | name of the encoding object. | ||||
| 508 | |||||
| 509 | find_encoding("latin1")->name; # iso-8859-1 | ||||
| 510 | |||||
| 511 | See L<Encode::Encoding> for details. | ||||
| 512 | |||||
| 513 | =head3 from_to | ||||
| 514 | |||||
| 515 | [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK]) | ||||
| 516 | |||||
| 517 | Converts I<in-place> data between two encodings. The data in $octets | ||||
| 518 | must be encoded as octets and I<not> as characters in Perl's internal | ||||
| 519 | format. For example, to convert ISO-8859-1 data into Microsoft's CP1250 | ||||
| 520 | encoding: | ||||
| 521 | |||||
| 522 | from_to($octets, "iso-8859-1", "cp1250"); | ||||
| 523 | |||||
| 524 | and to convert it back: | ||||
| 525 | |||||
| 526 | from_to($octets, "cp1250", "iso-8859-1"); | ||||
| 527 | |||||
| 528 | Because the conversion happens in place, the data to be | ||||
| 529 | converted cannot be a string constant: it must be a scalar variable. | ||||
| 530 | |||||
| 531 | C<from_to()> returns the length of the converted string in octets on success, | ||||
| 532 | and C<undef> on error. | ||||
| 533 | |||||
| 534 | B<CAVEAT>: The following operations may look the same, but are not: | ||||
| 535 | |||||
| 536 | from_to($data, "iso-8859-1", "utf8"); #1 | ||||
| 537 | $data = decode("iso-8859-1", $data); #2 | ||||
| 538 | |||||
| 539 | Both #1 and #2 make $data consist of a completely valid UTF-8 string, | ||||
| 540 | but only #2 turns the UTF8 flag on. #1 is equivalent to: | ||||
| 541 | |||||
| 542 | $data = encode("utf8", decode("iso-8859-1", $data)); | ||||
| 543 | |||||
| 544 | See L</"The UTF8 flag"> below. | ||||
| 545 | |||||
| 546 | Also note that: | ||||
| 547 | |||||
| 548 | from_to($octets, $from, $to, $check); | ||||
| 549 | |||||
| 550 | is equivalent to: | ||||
| 551 | |||||
| 552 | $octets = encode($to, decode($from, $octets), $check); | ||||
| 553 | |||||
| 554 | Yes, it does I<not> respect the $check during decoding. It is | ||||
| 555 | deliberately done that way. If you need minute control, use C<decode> | ||||
| 556 | followed by C<encode> as follows: | ||||
| 557 | |||||
| 558 | $octets = encode($to, decode($from, $octets, $check_from), $check_to); | ||||
| 559 | |||||
| 560 | =head3 encode_utf8 | ||||
| 561 | |||||
| 562 | $octets = encode_utf8($string); | ||||
| 563 | |||||
| 564 | Equivalent to C<$octets = encode("utf8", $string)>. The characters in | ||||
| 565 | $string are encoded in Perl's internal format, and the result is returned | ||||
| 566 | as a sequence of octets. Because all possible characters in Perl have a | ||||
| 567 | (loose, not strict) UTF-8 representation, this function cannot fail. | ||||
| 568 | |||||
| 569 | =head3 decode_utf8 | ||||
| 570 | |||||
| 571 | $string = decode_utf8($octets [, CHECK]); | ||||
| 572 | |||||
| 573 | Equivalent to C<$string = decode("utf8", $octets [, CHECK])>. | ||||
| 574 | The sequence of octets represented by $octets is decoded | ||||
| 575 | from UTF-8 into a sequence of logical characters. | ||||
| 576 | Because not all sequences of octets are valid UTF-8, | ||||
| 577 | it is quite possible for this function to fail. | ||||
| 578 | For CHECK, see L</"Handling Malformed Data">. | ||||
| 579 | |||||
| 580 | =head2 Listing available encodings | ||||
| 581 | |||||
| 582 | use Encode; | ||||
| 583 | @list = Encode->encodings(); | ||||
| 584 | |||||
| 585 | Returns a list of canonical names of available encodings that have already | ||||
| 586 | been loaded. To get a list of all available encodings including those that | ||||
| 587 | have not yet been loaded, say: | ||||
| 588 | |||||
| 589 | @all_encodings = Encode->encodings(":all"); | ||||
| 590 | |||||
| 591 | Or you can give the name of a specific module: | ||||
| 592 | |||||
| 593 | @with_jp = Encode->encodings("Encode::JP"); | ||||
| 594 | |||||
| 595 | When "C<::>" is not in the name, "C<Encode::>" is assumed. | ||||
| 596 | |||||
| 597 | @ebcdic = Encode->encodings("EBCDIC"); | ||||
| 598 | |||||
| 599 | To find out in detail which encodings are supported by this package, | ||||
| 600 | see L<Encode::Supported>. | ||||
| 601 | |||||
| 602 | =head2 Defining Aliases | ||||
| 603 | |||||
| 604 | To add a new alias to a given encoding, use: | ||||
| 605 | |||||
| 606 | use Encode; | ||||
| 607 | use Encode::Alias; | ||||
| 608 | define_alias(NEWNAME => ENCODING); | ||||
| 609 | |||||
| 610 | After that, I<NEWNAME> can be used as an alias for I<ENCODING>. | ||||
| 611 | I<ENCODING> may be either the name of an encoding or an | ||||
| 612 | I<encoding object>. | ||||
| 613 | |||||
| 614 | Before you do that, first make sure the alias is nonexistent using | ||||
| 615 | C<resolve_alias()>, which returns the canonical name thereof. | ||||
| 616 | For example: | ||||
| 617 | |||||
| 618 | Encode::resolve_alias("latin1") eq "iso-8859-1" # true | ||||
| 619 | Encode::resolve_alias("iso-8859-12") # false; nonexistent | ||||
| 620 | Encode::resolve_alias($name) eq $name # true if $name is canonical | ||||
| 621 | |||||
| 622 | C<resolve_alias()> does not need C<use Encode::Alias>; it can be | ||||
| 623 | imported via C<use Encode qw(resolve_alias)>. | ||||
| 624 | |||||
| 625 | See L<Encode::Alias> for details. | ||||
| 626 | |||||
| 627 | =head2 Finding IANA Character Set Registry names | ||||
| 628 | |||||
| 629 | The canonical name of a given encoding does not necessarily agree with | ||||
| 630 | IANA Character Set Registry, commonly seen as C<< Content-Type: | ||||
| 631 | text/plain; charset=I<WHATEVER> >>. For most cases, the canonical name | ||||
| 632 | works, but sometimes it does not, most notably with "utf-8-strict". | ||||
| 633 | |||||
| 634 | As of C<Encode> version 2.21, a new method C<mime_name()> is therefore added. | ||||
| 635 | |||||
| 636 | use Encode; | ||||
| 637 | my $enc = find_encoding("UTF-8"); | ||||
| 638 | warn $enc->name; # utf-8-strict | ||||
| 639 | warn $enc->mime_name; # UTF-8 | ||||
| 640 | |||||
| 641 | See also: L<Encode::Encoding> | ||||
| 642 | |||||
| 643 | =head1 Encoding via PerlIO | ||||
| 644 | |||||
| 645 | If your perl supports C<PerlIO> (which is the default), you can use a | ||||
| 646 | C<PerlIO> layer to decode and encode directly via a filehandle. The | ||||
| 647 | following two examples are fully identical in functionality: | ||||
| 648 | |||||
| 649 | ### Version 1 via PerlIO | ||||
| 650 | open(INPUT, "< :encoding(shiftjis)", $infile) | ||||
| 651 | || die "Can't open < $infile for reading: $!"; | ||||
| 652 | open(OUTPUT, "> :encoding(euc-jp)", $outfile) | ||||
| 653 | || die "Can't open > $outfile for writing: $!"; | ||||
| 654 | while (<INPUT>) { # auto decodes $_ | ||||
| 655 | print OUTPUT; # auto encodes $_ | ||||
| 656 | } | ||||
| 657 | close(INPUT) || die "can't close $infile: $!"; | ||||
| 658 | close(OUTPUT) || die "can't close $outfile: $!"; | ||||
| 659 | |||||
| 660 | ### Version 2 via from_to() | ||||
| 661 | open(INPUT, "< :raw", $infile) | ||||
| 662 | || die "Can't open < $infile for reading: $!"; | ||||
| 663 | open(OUTPUT, "> :raw", $outfile) | ||||
| 664 | || die "Can't open > $outfile for writing: $!"; | ||||
| 665 | |||||
| 666 | while (<INPUT>) { | ||||
| 667 | from_to($_, "shiftjis", "euc-jp", 1); # switch encoding | ||||
| 668 | print OUTPUT; # emit raw (but properly encoded) data | ||||
| 669 | } | ||||
| 670 | close(INPUT) || die "can't close $infile: $!"; | ||||
| 671 | close(OUTPUT) || die "can't close $outfile: $!"; | ||||
| 672 | |||||
| 673 | In the first version above, you let the appropriate encoding layer | ||||
| 674 | handle the conversion. In the second, you explicitly translate | ||||
| 675 | from one encoding to the other. | ||||
| 676 | |||||
| 677 | Unfortunately, it may be that encodings are not C<PerlIO>-savvy. You can check | ||||
| 678 | to see whether your encoding is supported by C<PerlIO> by invoking the | ||||
| 679 | C<perlio_ok> method on it: | ||||
| 680 | |||||
| 681 | Encode::perlio_ok("hz"); # false | ||||
| 682 | find_encoding("euc-cn")->perlio_ok; # true wherever PerlIO is available | ||||
| 683 | |||||
| 684 | use Encode qw(perlio_ok); # imported upon request | ||||
| 685 | perlio_ok("euc-jp") | ||||
| 686 | |||||
| 687 | Fortunately, all encodings that come with C<Encode> core are C<PerlIO>-savvy | ||||
| 688 | except for C<hz> and C<ISO-2022-kr>. For the gory details, see | ||||
| 689 | L<Encode::Encoding> and L<Encode::PerlIO>. | ||||
| 690 | |||||
| 691 | =head1 Handling Malformed Data | ||||
| 692 | |||||
| 693 | The optional I<CHECK> argument tells C<Encode> what to do when | ||||
| 694 | encountering malformed data. Without I<CHECK>, C<Encode::FB_DEFAULT> | ||||
| 695 | (== 0) is assumed. | ||||
| 696 | |||||
| 697 | As of version 2.12, C<Encode> supports coderef values for C<CHECK>; | ||||
| 698 | see below. | ||||
| 699 | |||||
| 700 | B<NOTE:> Not all encodings support this feature. | ||||
| 701 | Some encodings ignore the I<CHECK> argument. For example, | ||||
| 702 | L<Encode::Unicode> ignores I<CHECK> and it always croaks on error. | ||||
| 703 | |||||
| 704 | =head2 List of I<CHECK> values | ||||
| 705 | |||||
| 706 | =head3 FB_DEFAULT | ||||
| 707 | |||||
| 708 | I<CHECK> = Encode::FB_DEFAULT ( == 0) | ||||
| 709 | |||||
| 710 | If I<CHECK> is 0, encoding and decoding replace any malformed character | ||||
| 711 | with a I<substitution character>. When you encode, I<SUBCHAR> is used. | ||||
| 712 | When you decode, the Unicode REPLACEMENT CHARACTER, code point U+FFFD, is | ||||
| 713 | used. If the data is supposed to be UTF-8, an optional lexical warning of | ||||
| 714 | warning category C<"utf8"> is given. | ||||
| 715 | |||||
| 716 | =head3 FB_CROAK | ||||
| 717 | |||||
| 718 | I<CHECK> = Encode::FB_CROAK ( == 1) | ||||
| 719 | |||||
| 720 | If I<CHECK> is 1, methods immediately die with an error | ||||
| 721 | message. Therefore, when I<CHECK> is 1, you should trap | ||||
| 722 | exceptions with C<eval{}>, unless you really want to let it C<die>. | ||||
| 723 | |||||
| 724 | =head3 FB_QUIET | ||||
| 725 | |||||
| 726 | I<CHECK> = Encode::FB_QUIET | ||||
| 727 | |||||
| 728 | If I<CHECK> is set to C<Encode::FB_QUIET>, encoding and decoding immediately | ||||
| 729 | return the portion of the data that has been processed so far when an | ||||
| 730 | error occurs. The data argument is overwritten with everything | ||||
| 731 | after that point; that is, the unprocessed portion of the data. This is | ||||
| 732 | handy when you have to call C<decode> repeatedly in the case where your | ||||
| 733 | source data may contain partial multi-byte character sequences, | ||||
| 734 | (that is, you are reading with a fixed-width buffer). Here's some sample | ||||
| 735 | code to do exactly that: | ||||
| 736 | |||||
| 737 | my($buffer, $string) = ("", ""); | ||||
| 738 | while (read($fh, $buffer, 256, length($buffer))) { | ||||
| 739 | $string .= decode($encoding, $buffer, Encode::FB_QUIET); | ||||
| 740 | # $buffer now contains the unprocessed partial character | ||||
| 741 | } | ||||
| 742 | |||||
| 743 | =head3 FB_WARN | ||||
| 744 | |||||
| 745 | I<CHECK> = Encode::FB_WARN | ||||
| 746 | |||||
| 747 | This is the same as C<FB_QUIET> above, except that instead of being silent | ||||
| 748 | on errors, it issues a warning. This is handy for when you are debugging. | ||||
| 749 | |||||
| 750 | =head3 FB_PERLQQ FB_HTMLCREF FB_XMLCREF | ||||
| 751 | |||||
| 752 | =over 2 | ||||
| 753 | |||||
| 754 | =item perlqq mode (I<CHECK> = Encode::FB_PERLQQ) | ||||
| 755 | |||||
| 756 | =item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF) | ||||
| 757 | |||||
| 758 | =item XML charref mode (I<CHECK> = Encode::FB_XMLCREF) | ||||
| 759 | |||||
| 760 | =back | ||||
| 761 | |||||
| 762 | For encodings that are implemented by the C<Encode::XS> module, C<CHECK> C<==> | ||||
| 763 | C<Encode::FB_PERLQQ> puts C<encode> and C<decode> into C<perlqq> fallback mode. | ||||
| 764 | |||||
| 765 | When you decode, C<\xI<HH>> is inserted for a malformed character, where | ||||
| 766 | I<HH> is the hex representation of the octet that could not be decoded to | ||||
| 767 | utf8. When you encode, C<\x{I<HHHH>}> will be inserted, where I<HHHH> is | ||||
| 768 | the Unicode code point (in any number of hex digits) of the character that | ||||
| 769 | cannot be found in the character repertoire of the encoding. | ||||
| 770 | |||||
| 771 | The HTML/XML character reference modes are about the same. In place of | ||||
| 772 | C<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number, and | ||||
| 773 | XML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number. | ||||
| 774 | |||||
| 775 | In C<Encode> 2.10 or later, C<LEAVE_SRC> is also implied. | ||||
| 776 | |||||
| 777 | =head3 The bitmask | ||||
| 778 | |||||
| 779 | These modes are all actually set via a bitmask. Here is how the C<FB_I<XXX>> | ||||
| 780 | constants are laid out. You can import the C<FB_I<XXX>> constants via | ||||
| 781 | C<use Encode qw(:fallbacks)>, and you can import the generic bitmask | ||||
| 782 | constants via C<use Encode qw(:fallback_all)>. | ||||
| 783 | |||||
| 784 |                        FB_DEFAULT FB_CROAK FB_QUIET FB_WARN FB_PERLQQ | ||||
| 785 |  DIE_ON_ERR    0x0001      -         X        -        -        - | ||||
| 786 |  WARN_ON_ERR   0x0002      -         -        -        X        - | ||||
| 787 |  RETURN_ON_ERR 0x0004      -         -        X        X        - | ||||
| 788 |  LEAVE_SRC     0x0008      -         -        -        -        X | ||||
| 789 |  PERLQQ        0x0100      -         -        -        -        X | ||||
| 790 |  HTMLCREF      0x0200      -         -        -        -        - | ||||
| 791 |  XMLCREF       0x0400      -         -        -        -        - | ||||
| 792 | |||||
| 793 | =head3 LEAVE_SRC | ||||
| 794 | |||||
| 795 | Encode::LEAVE_SRC | ||||
| 796 | |||||
| 797 | If the C<Encode::LEAVE_SRC> bit is I<not> set but I<CHECK> is set, then the | ||||
| 798 | source string to encode() or decode() will be overwritten in place. | ||||
| 799 | If you do not want the source overwritten, bitwise-OR C<Encode::LEAVE_SRC> into I<CHECK>. | ||||
| 800 | |||||
| 801 | =head2 coderef for CHECK | ||||
| 802 | |||||
| 803 | As of C<Encode> 2.12, C<CHECK> can also be a code reference which takes the | ||||
| 804 | ordinal value of the unmapped character as an argument and returns | ||||
| 805 | octets that represent the fallback character. For instance: | ||||
| 806 | |||||
| 807 | $ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift }); | ||||
| 808 | |||||
| 809 | This acts like C<FB_PERLQQ>, but C<E<lt>U+I<XXXX>E<gt>> is used instead of C<\x{I<XXXX>}>. | ||||
| 810 | |||||
| 811 | Even the fallback for C<decode> must return octets, which are | ||||
| 812 | then decoded with the character encoding that C<decode> accepts. So for | ||||
| 813 | example, if you wish to decode octets as UTF-8 and use ISO-8859-15 as | ||||
| 814 | a fallback for bytes that are not valid UTF-8, you could write: | ||||
| 815 | |||||
| 816 | $str = decode 'UTF-8', $octets, sub { | ||||
| 817 | my $tmp = chr shift; | ||||
| 818 | from_to $tmp, 'ISO-8859-15', 'UTF-8'; | ||||
| 819 | return $tmp; | ||||
| 820 | }; | ||||
| 821 | |||||
| 822 | =head1 Defining Encodings | ||||
| 823 | |||||
| 824 | To define a new encoding, use: | ||||
| 825 | |||||
| 826 | use Encode qw(define_encoding); | ||||
| 827 | define_encoding($object, CANONICAL_NAME [, alias...]); | ||||
| 828 | |||||
| 829 | I<CANONICAL_NAME> will be associated with I<$object>. The object | ||||
| 830 | should provide the interface described in L<Encode::Encoding>. | ||||
| 831 | If more than two arguments are provided, additional | ||||
| 832 | arguments are considered aliases for I<$object>. | ||||
| 833 | |||||
| 834 | See L<Encode::Encoding> for details. | ||||
| 835 | |||||
| 836 | =head1 The UTF8 flag | ||||
| 837 | |||||
| 838 | Before the introduction of Unicode support in Perl, the C<eq> operator | ||||
| 839 | just compared the strings represented by two scalars. Beginning with | ||||
| 840 | Perl 5.8, C<eq> compares two strings with simultaneous consideration of | ||||
| 841 | I<the UTF8 flag>. To explain why we made it so, I quote from page 402 of | ||||
| 842 | I<Programming Perl, 3rd ed.> | ||||
| 843 | |||||
| 844 | =over 2 | ||||
| 845 | |||||
| 846 | =item Goal #1: | ||||
| 847 | |||||
| 848 | Old byte-oriented programs should not spontaneously break on the old | ||||
| 849 | byte-oriented data they used to work on. | ||||
| 850 | |||||
| 851 | =item Goal #2: | ||||
| 852 | |||||
| 853 | Old byte-oriented programs should magically start working on the new | ||||
| 854 | character-oriented data when appropriate. | ||||
| 855 | |||||
| 856 | =item Goal #3: | ||||
| 857 | |||||
| 858 | Programs should run just as fast in the new character-oriented mode | ||||
| 859 | as in the old byte-oriented mode. | ||||
| 860 | |||||
| 861 | =item Goal #4: | ||||
| 862 | |||||
| 863 | Perl should remain one language, rather than forking into a | ||||
| 864 | byte-oriented Perl and a character-oriented Perl. | ||||
| 865 | |||||
| 866 | =back | ||||
| 867 | |||||
| 868 | When I<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0 had been | ||||
| 869 | born yet; many features documented in the book remained unimplemented for a | ||||
| 870 | long time. Perl 5.8 corrected much of this, and the introduction of the | ||||
| 871 | UTF8 flag is one of those corrections. You can think of there being two fundamentally | ||||
| 872 | different kinds of strings and string-operations in Perl: one a | ||||
| 873 | byte-oriented mode for when the internal UTF8 flag is off, and the other a | ||||
| 874 | character-oriented mode for when the internal UTF8 flag is on. | ||||
| 875 | |||||
| 876 | Here is how C<Encode> handles the UTF8 flag. | ||||
| 877 | |||||
| 878 | =over 2 | ||||
| 879 | |||||
| 880 | =item * | ||||
| 881 | |||||
| 882 | When you I<encode>, the resulting UTF8 flag is always B<off>. | ||||
| 883 | |||||
| 884 | =item * | ||||
| 885 | |||||
| 886 | When you I<decode>, the resulting UTF8 flag is B<on>--I<unless> you can | ||||
| 887 | unambiguously represent data. Here is what we mean by "unambiguously". | ||||
| 888 | After C<$utf8 = decode("foo", $octet)>, | ||||
| 889 | |||||
| 890 | When $octet is... The UTF8 flag in $utf8 is | ||||
| 891 | --------------------------------------------- | ||||
| 892 | In ASCII only (or EBCDIC only) OFF | ||||
| 893 | In ISO-8859-1 ON | ||||
| 894 | In any other Encoding ON | ||||
| 895 | --------------------------------------------- | ||||
| 896 | |||||
| 897 | As you see, there is one exception: in ASCII. That way you can assume | ||||
| 898 | Goal #1. And with C<Encode>, Goal #2 is assumed but you still have to be | ||||
| 899 | careful in the cases mentioned in the B<CAVEAT> paragraphs above. | ||||
| 900 | |||||
| 901 | This UTF8 flag is not visible in Perl scripts, for exactly the same reason | ||||
| 902 | you cannot (or rather, you I<don't have to>) see whether a scalar contains | ||||
| 903 | a string, an integer, or a floating-point number. But you can still peek | ||||
| 904 | and poke these if you will. See the next section. | ||||
| 905 | |||||
| 906 | =back | ||||
| 907 | |||||
| 908 | =head2 Messing with Perl's Internals | ||||
| 909 | |||||
| 910 | The following functions use parts of Perl's internals in the current | ||||
| 911 | implementation. As such, they are efficient but may change in a future | ||||
| 912 | release. | ||||
| 913 | |||||
| 914 | =head3 is_utf8 | ||||
| 915 | |||||
| 916 | is_utf8(STRING [, CHECK]) | ||||
| 917 | |||||
| 918 | [INTERNAL] Tests whether the UTF8 flag is turned on in the I<STRING>. | ||||
| 919 | If I<CHECK> is true, also checks whether I<STRING> contains well-formed | ||||
| 920 | UTF-8. Returns true if successful, false otherwise. | ||||
| 921 | |||||
| 922 | As of Perl 5.8.1, L<utf8> also has the C<utf8::is_utf8> function. | ||||
| 923 | |||||
| 924 | =head3 _utf8_on | ||||
| 925 | |||||
| 926 | _utf8_on(STRING) | ||||
| 927 | |||||
| 928 | [INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<on>. The I<STRING> | ||||
| 929 | is I<not> checked for containing only well-formed UTF-8. Do not use this | ||||
| 930 | unless you I<know with absolute certainty> that the STRING holds only | ||||
| 931 | well-formed UTF-8. Returns the previous state of the UTF8 flag (so please | ||||
| 932 | don't treat the return value as indicating success or failure), or C<undef> | ||||
| 933 | if I<STRING> is not a string. | ||||
| 934 | |||||
| 935 | B<NOTE>: For security reasons, this function does not work on tainted values. | ||||
| 936 | |||||
| 937 | =head3 _utf8_off | ||||
| 938 | |||||
| 939 | _utf8_off(STRING) | ||||
| 940 | |||||
| 941 | [INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<off>. Do not use | ||||
| 942 | frivolously. Returns the previous state of the UTF8 flag, or C<undef> if | ||||
| 943 | I<STRING> is not a string. Do not treat the return value as indicative of | ||||
| 944 | success or failure, because that isn't what it means: it is only the | ||||
| 945 | previous setting. | ||||
| 946 | |||||
| 947 | B<NOTE>: For security reasons, this function does not work on tainted values. | ||||
| 948 | |||||
| 949 | =head1 UTF-8 vs. utf8 vs. UTF8 | ||||
| 950 | |||||
| 951 | ....We now view strings not as sequences of bytes, but as sequences | ||||
| 952 | of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit | ||||
| 953 | computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed. | ||||
| 954 | |||||
| 955 | That has historically been Perl's notion of UTF-8, as that is how UTF-8 was | ||||
| 956 | first conceived by Ken Thompson when he invented it. However, thanks to | ||||
| 957 | later revisions to the applicable standards, official UTF-8 is now rather | ||||
| 958 | stricter than that. For example, its range is much narrower (0 .. 0x10_FFFF | ||||
| 959 | to cover only 21 bits instead of 32 or 64 bits) and some sequences | ||||
| 960 | are not allowed, like those used in surrogate pairs, the 32 non-character | ||||
| 961 | code points 0xFDD0 .. 0xFDEF, the last two code points in I<any> plane | ||||
| 962 | (0xI<XX>_FFFE and 0xI<XX>_FFFF), all non-shortest encodings, etc. | ||||
| 963 | |||||
| 964 | The former default in which Perl would always use a loose interpretation of | ||||
| 965 | UTF-8 has now been overruled: | ||||
| 966 | |||||
| 967 | From: Larry Wall <larry@wall.org> | ||||
| 968 | Date: December 04, 2004 11:51:58 JST | ||||
| 969 | To: perl-unicode@perl.org | ||||
| 970 | Subject: Re: Make Encode.pm support the real UTF-8 | ||||
| 971 | Message-Id: <20041204025158.GA28754@wall.org> | ||||
| 972 | |||||
| 973 | On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote: | ||||
| 974 | : I've no problem with 'utf8' being perl's unrestricted uft8 encoding, | ||||
| 975 | : but "UTF-8" is the name of the standard and should give the | ||||
| 976 | : corresponding behaviour. | ||||
| 977 | |||||
| 978 | For what it's worth, that's how I've always kept them straight in my | ||||
| 979 | head. | ||||
| 980 | |||||
| 981 | Also for what it's worth, Perl 6 will mostly default to strict but | ||||
| 982 | make it easy to switch back to lax. | ||||
| 983 | |||||
| 984 | Larry | ||||
| 985 | |||||
| 986 | Got that? As of Perl 5.8.7, B<"UTF-8"> means UTF-8 in its current | ||||
| 987 | sense, which is conservative and strict and security-conscious, whereas | ||||
| 988 | B<"utf8"> means UTF-8 in its former sense, which was liberal and loose and | ||||
| 989 | lax. C<Encode> version 2.10 or later thus groks this subtle but critically | ||||
| 990 | important distinction between C<"UTF-8"> and C<"utf8">. | ||||
| 991 | |||||
| 992 | encode("utf8", "\x{FFFF_FFFF}", 1); # okay | ||||
| 993 | encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks | ||||
| 994 | |||||
| 995 | In the C<Encode> module, C<"UTF-8"> is actually an alias for the canonical name | ||||
| 996 | C<"utf-8-strict">. That hyphen between the C<"UTF"> and the C<"8"> is | ||||
| 997 | critical; without it, C<Encode> goes "liberal" and (perhaps overly-)permissive: | ||||
| 998 | |||||
| 999 | find_encoding("UTF-8")->name # is 'utf-8-strict' | ||||
| 1000 | find_encoding("utf-8")->name # ditto. names are case insensitive | ||||
| 1001 | find_encoding("utf_8")->name # ditto. "_" are treated as "-" | ||||
| 1002 | find_encoding("UTF8")->name # is 'utf8'. | ||||
| 1003 | |||||
| 1004 | Perl's internal UTF8 flag is called "UTF8", without a hyphen. It indicates | ||||
| 1005 | whether a string is internally encoded as "utf8", also without a hyphen. | ||||
| 1006 | |||||
| 1007 | =head1 SEE ALSO | ||||
| 1008 | |||||
| 1009 | L<Encode::Encoding>, | ||||
| 1010 | L<Encode::Supported>, | ||||
| 1011 | L<Encode::PerlIO>, | ||||
| 1012 | L<encoding>, | ||||
| 1013 | L<perlebcdic>, | ||||
| 1014 | L<perlfunc/open>, | ||||
| 1015 | L<perlunicode>, L<perluniintro>, L<perlunifaq>, L<perlunitut>, | ||||
| 1016 | L<utf8>, | ||||
| 1017 | the Perl Unicode Mailing List L<http://lists.perl.org/list/perl-unicode.html> | ||||
| 1018 | |||||
| 1019 | =head1 MAINTAINER | ||||
| 1020 | |||||
| 1021 | This project was originated by the late Nick Ing-Simmons and later | ||||
| 1022 | maintained by Dan Kogai I<< <dankogai@cpan.org> >>. See AUTHORS | ||||
| 1023 | for a full list of people involved. For any questions, send mail to | ||||
| 1024 | I<< <perl-unicode@perl.org> >> so that we can all share. | ||||
| 1025 | |||||
| 1026 | While Dan Kogai retains the copyright as a maintainer, credit | ||||
| 1027 | should go to all those involved. See AUTHORS for a list of those | ||||
| 1028 | who submitted code to the project. | ||||
| 1029 | |||||
| 1030 | =head1 COPYRIGHT | ||||
| 1031 | |||||
| 1032 | Copyright 2002-2013 Dan Kogai I<< <dankogai@cpan.org> >>. | ||||
| 1033 | |||||
| 1034 | This library is free software; you can redistribute it and/or modify | ||||
| 1035 | it under the same terms as Perl itself. | ||||
| 1036 | |||||
| 1037 | =cut |