Filename | /usr/lib/x86_64-linux-gnu/perl/5.20/Encode.pm |
Statements | Executed 0 statements in 0s |
Line | Statements |
Time on line |
Calls | Time in subs |
Code |
---|---|---|---|---|---|
1 | # | ||||
2 | # $Id: Encode.pm,v 2.60 2014/04/29 16:26:49 dankogai Exp dankogai $ | ||||
3 | # | ||||
4 | package Encode; | ||||
5 | use strict; | ||||
6 | use warnings; | ||||
7 | our $VERSION = sprintf "%d.%02d", q$Revision: 2.60_01 $ =~ /(\d+)/g; | ||||
8 | use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG}; | ||||
9 | use XSLoader (); | ||||
10 | XSLoader::load( __PACKAGE__, $VERSION ); | ||||
11 | |||||
12 | use Exporter 5.57 'import'; | ||||
13 | |||||
14 | # Public, encouraged API is exported by default | ||||
15 | |||||
16 | our @EXPORT = qw( | ||||
17 | decode decode_utf8 encode encode_utf8 str2bytes bytes2str | ||||
18 | encodings find_encoding clone_encoding | ||||
19 | ); | ||||
20 | our @FB_FLAGS = qw( | ||||
21 | DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC | ||||
22 | PERLQQ HTMLCREF XMLCREF STOP_AT_PARTIAL | ||||
23 | ); | ||||
24 | our @FB_CONSTS = qw( | ||||
25 | FB_DEFAULT FB_CROAK FB_QUIET FB_WARN | ||||
26 | FB_PERLQQ FB_HTMLCREF FB_XMLCREF | ||||
27 | ); | ||||
28 | our @EXPORT_OK = ( | ||||
29 | qw( | ||||
30 | _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit | ||||
31 | is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade | ||||
32 | ), | ||||
33 | @FB_FLAGS, @FB_CONSTS, | ||||
34 | ); | ||||
35 | |||||
36 | our %EXPORT_TAGS = ( | ||||
37 | all => [ @EXPORT, @EXPORT_OK ], | ||||
38 | default => [ @EXPORT ], | ||||
39 | fallbacks => [ @FB_CONSTS ], | ||||
40 | fallback_all => [ @FB_CONSTS, @FB_FLAGS ], | ||||
41 | ); | ||||
42 | |||||
43 | # Documentation moved after __END__ for speed - NI-S | ||||
44 | |||||
45 | our $ON_EBCDIC = ( ord("A") == 193 ); | ||||
46 | |||||
47 | use Encode::Alias; | ||||
48 | |||||
49 | # Make a %Encoding package variable to allow a certain amount of cheating | ||||
50 | our %Encoding; | ||||
51 | our %ExtModule; | ||||
52 | require Encode::Config; | ||||
53 | # See | ||||
54 | # https://bugzilla.redhat.com/show_bug.cgi?id=435505#c2 | ||||
55 | # to find why sig handlers inside eval{} are disabled. | ||||
56 | eval { | ||||
57 | local $SIG{__DIE__}; | ||||
58 | local $SIG{__WARN__}; | ||||
59 | require Encode::ConfigLocal; | ||||
60 | }; | ||||
61 | |||||
62 | sub encodings { | ||||
63 | my %enc; | ||||
64 | my $arg = $_[1] || ''; | ||||
65 | if ( $arg eq ":all" ) { | ||||
66 | %enc = ( %Encoding, %ExtModule ); | ||||
67 | } | ||||
68 | else { | ||||
69 | %enc = %Encoding; | ||||
70 | for my $mod ( map { m/::/ ? $_ : "Encode::$_" } @_ ) { | ||||
71 | DEBUG and warn $mod; | ||||
72 | for my $enc ( keys %ExtModule ) { | ||||
73 | $ExtModule{$enc} eq $mod and $enc{$enc} = $mod; | ||||
74 | } | ||||
75 | } | ||||
76 | } | ||||
77 | return sort { lc $a cmp lc $b } | ||||
78 | grep { !/^(?:Internal|Unicode|Guess)$/o } keys %enc; | ||||
79 | } | ||||
80 | |||||
81 | sub perlio_ok { | ||||
82 | my $obj = ref( $_[0] ) ? $_[0] : find_encoding( $_[0] ); | ||||
83 | $obj->can("perlio_ok") and return $obj->perlio_ok(); | ||||
84 | return 0; # safety net | ||||
85 | } | ||||
86 | |||||
87 | sub define_encoding { | ||||
88 | my $obj = shift; | ||||
89 | my $name = shift; | ||||
90 | $Encoding{$name} = $obj; | ||||
91 | my $lc = lc($name); | ||||
92 | define_alias( $lc => $obj ) unless $lc eq $name; | ||||
93 | while (@_) { | ||||
94 | my $alias = shift; | ||||
95 | define_alias( $alias, $obj ); | ||||
96 | } | ||||
97 | return $obj; | ||||
98 | } | ||||
99 | |||||
100 | sub getEncoding { | ||||
101 | my ( $class, $name, $skip_external ) = @_; | ||||
102 | |||||
103 | $name =~ s/\s+//g; # https://rt.cpan.org/Ticket/Display.html?id=65796 | ||||
104 | |||||
105 | ref($name) && $name->can('renew') and return $name; | ||||
106 | exists $Encoding{$name} and return $Encoding{$name}; | ||||
107 | my $lc = lc $name; | ||||
108 | exists $Encoding{$lc} and return $Encoding{$lc}; | ||||
109 | |||||
110 | my $oc = $class->find_alias($name); | ||||
111 | defined($oc) and return $oc; | ||||
112 | $lc ne $name and $oc = $class->find_alias($lc); | ||||
113 | defined($oc) and return $oc; | ||||
114 | |||||
115 | unless ($skip_external) { | ||||
116 | if ( my $mod = $ExtModule{$name} || $ExtModule{$lc} ) { | ||||
117 | $mod =~ s,::,/,g; | ||||
118 | $mod .= '.pm'; | ||||
119 | eval { require $mod; }; | ||||
120 | exists $Encoding{$name} and return $Encoding{$name}; | ||||
121 | } | ||||
122 | } | ||||
123 | return; | ||||
124 | } | ||||
125 | |||||
126 | sub find_encoding($;$) { | ||||
127 | my ( $name, $skip_external ) = @_; | ||||
128 | return __PACKAGE__->getEncoding( $name, $skip_external ); | ||||
129 | } | ||||
130 | |||||
131 | sub resolve_alias($) { | ||||
132 | my $obj = find_encoding(shift); | ||||
133 | defined $obj and return $obj->name; | ||||
134 | return; | ||||
135 | } | ||||
136 | |||||
137 | sub clone_encoding($) { | ||||
138 | my $obj = find_encoding(shift); | ||||
139 | ref $obj or return; | ||||
140 | eval { require Storable }; | ||||
141 | $@ and return; | ||||
142 | return Storable::dclone($obj); | ||||
143 | } | ||||
144 | |||||
145 | sub encode($$;$) { | ||||
146 | my ( $name, $string, $check ) = @_; | ||||
147 | return undef unless defined $string; | ||||
148 | $string .= ''; # stringify; | ||||
149 | $check ||= 0; | ||||
150 | unless ( defined $name ) { | ||||
151 | require Carp; | ||||
152 | Carp::croak("Encoding name should not be undef"); | ||||
153 | } | ||||
154 | my $enc = find_encoding($name); | ||||
155 | unless ( defined $enc ) { | ||||
156 | require Carp; | ||||
157 | Carp::croak("Unknown encoding '$name'"); | ||||
158 | } | ||||
159 | my $octets = $enc->encode( $string, $check ); | ||||
160 | $_[1] = $string if $check and !ref $check and !( $check & LEAVE_SRC() ); | ||||
161 | return $octets; | ||||
162 | } | ||||
163 | *str2bytes = \&encode; | ||||
164 | |||||
165 | sub decode($$;$) { | ||||
166 | my ( $name, $octets, $check ) = @_; | ||||
167 | return undef unless defined $octets; | ||||
168 | $octets .= ''; | ||||
169 | $check ||= 0; | ||||
170 | my $enc = find_encoding($name); | ||||
171 | unless ( defined $enc ) { | ||||
172 | require Carp; | ||||
173 | Carp::croak("Unknown encoding '$name'"); | ||||
174 | } | ||||
175 | 20 | 46µs | my $string = $enc->decode( $octets, $check ); # spent 46µs making 20 calls to Encode::Encoding::renewed, avg 2µs/call | ||
176 | $_[1] = $octets if $check and !ref $check and !( $check & LEAVE_SRC() ); | ||||
177 | return $string; | ||||
178 | } | ||||
179 | *bytes2str = \&decode; | ||||
180 | |||||
181 | sub from_to($$$;$) { | ||||
182 | my ( $string, $from, $to, $check ) = @_; | ||||
183 | return undef unless defined $string; | ||||
184 | $check ||= 0; | ||||
185 | my $f = find_encoding($from); | ||||
186 | unless ( defined $f ) { | ||||
187 | require Carp; | ||||
188 | Carp::croak("Unknown encoding '$from'"); | ||||
189 | } | ||||
190 | my $t = find_encoding($to); | ||||
191 | unless ( defined $t ) { | ||||
192 | require Carp; | ||||
193 | Carp::croak("Unknown encoding '$to'"); | ||||
194 | } | ||||
195 | my $uni = $f->decode($string); | ||||
196 | $_[0] = $string = $t->encode( $uni, $check ); | ||||
197 | return undef if ( $check && length($uni) ); | ||||
198 | return defined( $_[0] ) ? length($string) : undef; | ||||
199 | } | ||||
200 | |||||
201 | sub encode_utf8($) { | ||||
202 | my ($str) = @_; | ||||
203 | utf8::encode($str); | ||||
204 | return $str; | ||||
205 | } | ||||
206 | |||||
207 | my $utf8enc; | ||||
208 | |||||
209 | sub decode_utf8($;$) { | ||||
210 | my ( $octets, $check ) = @_; | ||||
211 | return undef unless defined $octets; | ||||
212 | $octets .= ''; | ||||
213 | $check ||= 0; | ||||
214 | $utf8enc ||= find_encoding('utf8'); | ||||
215 | my $string = $utf8enc->decode( $octets, $check ); | ||||
216 | $_[0] = $octets if $check and !ref $check and !( $check & LEAVE_SRC() ); | ||||
217 | return $string; | ||||
218 | } | ||||
219 | |||||
220 | # sub decode_utf8($;$) { | ||||
221 | # my ( $str, $check ) = @_; | ||||
222 | # return $str if is_utf8($str); | ||||
223 | # if ($check) { | ||||
224 | # return decode( "utf8", $str, $check ); | ||||
225 | # } | ||||
226 | # else { | ||||
227 | # return decode( "utf8", $str ); | ||||
228 | # return $str; | ||||
229 | # } | ||||
230 | # } | ||||
231 | |||||
232 | predefine_encodings(1); | ||||
233 | |||||
234 | # | ||||
235 | # This is to restore %Encoding if really needed; | ||||
236 | # | ||||
237 | |||||
238 | sub predefine_encodings { | ||||
239 | require Encode::Encoding; | ||||
240 | no warnings 'redefine'; | ||||
241 | my $use_xs = shift; | ||||
242 | if ($ON_EBCDIC) { | ||||
243 | |||||
244 | # was in Encode::UTF_EBCDIC | ||||
245 | package Encode::UTF_EBCDIC; | ||||
246 | push @Encode::UTF_EBCDIC::ISA, 'Encode::Encoding'; | ||||
247 | *decode = sub { | ||||
248 | my ( undef, $str, $chk ) = @_; | ||||
249 | my $res = ''; | ||||
250 | for ( my $i = 0 ; $i < length($str) ; $i++ ) { | ||||
251 | $res .= | ||||
252 | chr( | ||||
253 | utf8::unicode_to_native( ord( substr( $str, $i, 1 ) ) ) | ||||
254 | ); | ||||
255 | } | ||||
256 | $_[1] = '' if $chk; | ||||
257 | return $res; | ||||
258 | }; | ||||
259 | *encode = sub { | ||||
260 | my ( undef, $str, $chk ) = @_; | ||||
261 | my $res = ''; | ||||
262 | for ( my $i = 0 ; $i < length($str) ; $i++ ) { | ||||
263 | $res .= | ||||
264 | chr( | ||||
265 | utf8::native_to_unicode( ord( substr( $str, $i, 1 ) ) ) | ||||
266 | ); | ||||
267 | } | ||||
268 | $_[1] = '' if $chk; | ||||
269 | return $res; | ||||
270 | }; | ||||
271 | $Encode::Encoding{Unicode} = | ||||
272 | bless { Name => "UTF_EBCDIC" } => "Encode::UTF_EBCDIC"; | ||||
273 | } | ||||
274 | else { | ||||
275 | |||||
276 | package Encode::Internal; | ||||
277 | push @Encode::Internal::ISA, 'Encode::Encoding'; | ||||
278 | *decode = sub { | ||||
279 | my ( undef, $str, $chk ) = @_; | ||||
280 | utf8::upgrade($str); | ||||
281 | $_[1] = '' if $chk; | ||||
282 | return $str; | ||||
283 | }; | ||||
284 | *encode = \&decode; | ||||
285 | $Encode::Encoding{Unicode} = | ||||
286 | bless { Name => "Internal" } => "Encode::Internal"; | ||||
287 | } | ||||
288 | |||||
289 | { | ||||
290 | |||||
291 | # was in Encode::utf8 | ||||
292 | package Encode::utf8; | ||||
293 | push @Encode::utf8::ISA, 'Encode::Encoding'; | ||||
294 | |||||
295 | # | ||||
296 | if ($use_xs) { | ||||
297 | Encode::DEBUG and warn __PACKAGE__, " XS on"; | ||||
298 | *decode = \&decode_xs; | ||||
299 | *encode = \&encode_xs; | ||||
300 | } | ||||
301 | else { | ||||
302 | Encode::DEBUG and warn __PACKAGE__, " XS off"; | ||||
303 | *decode = sub { | ||||
304 | my ( undef, $octets, $chk ) = @_; | ||||
305 | my $str = Encode::decode_utf8($octets); | ||||
306 | if ( defined $str ) { | ||||
307 | $_[1] = '' if $chk; | ||||
308 | return $str; | ||||
309 | } | ||||
310 | return undef; | ||||
311 | }; | ||||
312 | *encode = sub { | ||||
313 | my ( undef, $string, $chk ) = @_; | ||||
314 | my $octets = Encode::encode_utf8($string); | ||||
315 | $_[1] = '' if $chk; | ||||
316 | return $octets; | ||||
317 | }; | ||||
318 | } | ||||
319 | *cat_decode = sub { # ($obj, $dst, $src, $pos, $trm, $chk) | ||||
320 | # currently ignores $chk | ||||
321 | my ( undef, undef, undef, $pos, $trm ) = @_; | ||||
322 | my ( $rdst, $rsrc, $rpos ) = \@_[ 1, 2, 3 ]; | ||||
323 | use bytes; | ||||
324 | if ( ( my $npos = index( $$rsrc, $trm, $pos ) ) >= 0 ) { | ||||
325 | $$rdst .= | ||||
326 | substr( $$rsrc, $pos, $npos - $pos + length($trm) ); | ||||
327 | $$rpos = $npos + length($trm); | ||||
328 | return 1; | ||||
329 | } | ||||
330 | $$rdst .= substr( $$rsrc, $pos ); | ||||
331 | $$rpos = length($$rsrc); | ||||
332 | return ''; | ||||
333 | }; | ||||
334 | $Encode::Encoding{utf8} = | ||||
335 | bless { Name => "utf8" } => "Encode::utf8"; | ||||
336 | $Encode::Encoding{"utf-8-strict"} = | ||||
337 | bless { Name => "utf-8-strict", strict_utf8 => 1 } | ||||
338 | => "Encode::utf8"; | ||||
339 | } | ||||
340 | } | ||||
341 | |||||
342 | 1; | ||||
343 | |||||
344 | __END__ | ||||
345 | |||||
346 | =head1 NAME | ||||
347 | |||||
348 | Encode - character encodings in Perl | ||||
349 | |||||
350 | =head1 SYNOPSIS | ||||
351 | |||||
352 | use Encode qw(decode encode); | ||||
353 | $characters = decode('UTF-8', $octets, Encode::FB_CROAK); | ||||
354 | $octets = encode('UTF-8', $characters, Encode::FB_CROAK); | ||||
355 | |||||
356 | =head2 Table of Contents | ||||
357 | |||||
358 | Encode consists of a collection of modules whose details are too extensive | ||||
359 | to fit in one document. This one itself explains the top-level APIs | ||||
360 | and general topics at a glance. For other topics and more details, | ||||
361 | see the documentation for these modules: | ||||
362 | |||||
363 | =over 2 | ||||
364 | |||||
365 | =item L<Encode::Alias> - Alias definitions to encodings | ||||
366 | |||||
367 | =item L<Encode::Encoding> - Encode Implementation Base Class | ||||
368 | |||||
369 | =item L<Encode::Supported> - List of Supported Encodings | ||||
370 | |||||
371 | =item L<Encode::CN> - Simplified Chinese Encodings | ||||
372 | |||||
373 | =item L<Encode::JP> - Japanese Encodings | ||||
374 | |||||
375 | =item L<Encode::KR> - Korean Encodings | ||||
376 | |||||
377 | =item L<Encode::TW> - Traditional Chinese Encodings | ||||
378 | |||||
379 | =back | ||||
380 | |||||
381 | =head1 DESCRIPTION | ||||
382 | |||||
383 | The C<Encode> module provides the interface between Perl strings | ||||
384 | and the rest of the system. Perl strings are sequences of | ||||
385 | I<characters>. | ||||
386 | |||||
387 | The repertoire of characters that Perl can represent is a superset of those | ||||
388 | defined by the Unicode Consortium. On most platforms the ordinal | ||||
389 | values of a character as returned by C<ord(I<S>)> is the I<Unicode | ||||
390 | codepoint> for that character. The exceptions are platforms where | ||||
391 | the legacy encoding is some variant of EBCDIC rather than a superset | ||||
392 | of ASCII; see L<perlebcdic>. | ||||
393 | |||||
394 | During recent history, data is moved around a computer in 8-bit chunks, | ||||
395 | often called "bytes" but also known as "octets" in standards documents. | ||||
396 | Perl is widely used to manipulate data of many types: not only strings of | ||||
397 | characters representing human or computer languages, but also "binary" | ||||
398 | data, being the machine's representation of numbers, pixels in an image, or | ||||
399 | just about anything. | ||||
400 | |||||
401 | When Perl is processing "binary data", the programmer wants Perl to | ||||
402 | process "sequences of bytes". This is not a problem for Perl: because a | ||||
403 | byte has 256 possible values, it easily fits in Perl's much larger | ||||
404 | "logical character". | ||||
405 | |||||
406 | This document mostly explains the I<how>. L<perlunitut> and L<perlunifaq> | ||||
407 | explain the I<why>. | ||||
408 | |||||
409 | =head2 TERMINOLOGY | ||||
410 | |||||
411 | =head3 character | ||||
412 | |||||
413 | A character in the range 0 .. 2**32-1 (or more); | ||||
414 | what Perl's strings are made of. | ||||
415 | |||||
416 | =head3 byte | ||||
417 | |||||
418 | A character in the range 0..255; | ||||
419 | a special case of a Perl character. | ||||
420 | |||||
421 | =head3 octet | ||||
422 | |||||
423 | 8 bits of data, with ordinal values 0..255; | ||||
424 | term for bytes passed to or from a non-Perl context, such as a disk file, | ||||
425 | standard I/O stream, database, command-line argument, environment variable, | ||||
426 | socket etc. | ||||
427 | |||||
428 | =head1 THE PERL ENCODING API | ||||
429 | |||||
430 | =head2 Basic methods | ||||
431 | |||||
432 | =head3 encode | ||||
433 | |||||
434 | $octets = encode(ENCODING, STRING[, CHECK]) | ||||
435 | |||||
436 | Encodes the scalar value I<STRING> from Perl's internal form into | ||||
437 | I<ENCODING> and returns a sequence of octets. I<ENCODING> can be either a | ||||
438 | canonical name or an alias. For encoding names and aliases, see | ||||
439 | L</"Defining Aliases">. For CHECK, see L</"Handling Malformed Data">. | ||||
440 | |||||
441 | For example, to convert a string from Perl's internal format into | ||||
442 | ISO-8859-1, also known as Latin1: | ||||
443 | |||||
444 | $octets = encode("iso-8859-1", $string); | ||||
445 | |||||
446 | B<CAVEAT>: When you run C<$octets = encode("utf8", $string)>, then | ||||
447 | $octets I<might not be equal to> $string. Though both contain the | ||||
448 | same data, the UTF8 flag for $octets is I<always> off. When you | ||||
449 | encode anything, the UTF8 flag on the result is always off, even when it | ||||
450 | contains a completely valid utf8 string. See L</"The UTF8 flag"> below. | ||||
451 | |||||
452 | If the $string is C<undef>, then C<undef> is returned. | ||||
453 | |||||
454 | =head3 decode | ||||
455 | |||||
456 | $string = decode(ENCODING, OCTETS[, CHECK]) | ||||
457 | |||||
458 | This function returns the string that results from decoding the scalar | ||||
459 | value I<OCTETS>, assumed to be a sequence of octets in I<ENCODING>, into | ||||
460 | Perl's internal form. As with encode(), | ||||
461 | I<ENCODING> can be either a canonical name or an alias. For encoding names | ||||
462 | and aliases, see L</"Defining Aliases">; for I<CHECK>, see L</"Handling | ||||
463 | Malformed Data">. | ||||
464 | |||||
465 | For example, to convert ISO-8859-1 data into a string in Perl's | ||||
466 | internal format: | ||||
467 | |||||
468 | $string = decode("iso-8859-1", $octets); | ||||
469 | |||||
470 | B<CAVEAT>: When you run C<$string = decode("utf8", $octets)>, then $string | ||||
471 | I<might not be equal to> $octets. Though both contain the same data, the | ||||
472 | UTF8 flag for $string is on. See L</"The UTF8 flag"> | ||||
473 | below. | ||||
474 | |||||
475 | If $octets is C<undef>, then C<undef> is returned. | ||||
476 | |||||
477 | =head3 find_encoding | ||||
478 | |||||
479 | [$obj =] find_encoding(ENCODING) | ||||
480 | |||||
481 | Returns the I<encoding object> corresponding to I<ENCODING>. Returns | ||||
482 | C<undef> if no matching I<ENCODING> is found. The returned object is | ||||
483 | what does the actual encoding or decoding. | ||||
484 | |||||
485 | $utf8 = decode($name, $bytes); | ||||
486 | |||||
487 | is in fact | ||||
488 | |||||
489 | $utf8 = do { | ||||
490 | $obj = find_encoding($name); | ||||
491 | croak qq(encoding "$name" not found) unless ref $obj; | ||||
492 | $obj->decode($bytes); | ||||
493 | }; | ||||
494 | |||||
495 | with more error checking. | ||||
496 | |||||
497 | You can therefore save time by reusing this object as follows: | ||||
498 | |||||
499 | my $enc = find_encoding("iso-8859-1"); | ||||
500 | while(<>) { | ||||
501 | my $utf8 = $enc->decode($_); | ||||
502 | ... # now do something with $utf8; | ||||
503 | } | ||||
504 | |||||
505 | Besides L</decode> and L</encode>, other methods are | ||||
506 | available as well. For instance, C<name()> returns the canonical | ||||
507 | name of the encoding object. | ||||
508 | |||||
509 | find_encoding("latin1")->name; # iso-8859-1 | ||||
510 | |||||
511 | See L<Encode::Encoding> for details. | ||||
512 | |||||
513 | =head3 from_to | ||||
514 | |||||
515 | [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK]) | ||||
516 | |||||
517 | Converts I<in-place> data between two encodings. The data in $octets | ||||
518 | must be encoded as octets and I<not> as characters in Perl's internal | ||||
519 | format. For example, to convert ISO-8859-1 data into Microsoft's CP1250 | ||||
520 | encoding: | ||||
521 | |||||
522 | from_to($octets, "iso-8859-1", "cp1250"); | ||||
523 | |||||
524 | and to convert it back: | ||||
525 | |||||
526 | from_to($octets, "cp1250", "iso-8859-1"); | ||||
527 | |||||
528 | Because the conversion happens in place, the data to be | ||||
529 | converted cannot be a string constant: it must be a scalar variable. | ||||
530 | |||||
531 | C<from_to()> returns the length of the converted string in octets on success, | ||||
532 | and C<undef> on error. | ||||
533 | |||||
534 | B<CAVEAT>: The following operations may look the same, but are not: | ||||
535 | |||||
536 | from_to($data, "iso-8859-1", "utf8"); #1 | ||||
537 | $data = decode("iso-8859-1", $data); #2 | ||||
538 | |||||
539 | Both #1 and #2 make $data consist of a completely valid UTF-8 string, | ||||
540 | but only #2 turns the UTF8 flag on. #1 is equivalent to: | ||||
541 | |||||
542 | $data = encode("utf8", decode("iso-8859-1", $data)); | ||||
543 | |||||
544 | See L</"The UTF8 flag"> below. | ||||
545 | |||||
546 | Also note that: | ||||
547 | |||||
548 | from_to($octets, $from, $to, $check); | ||||
549 | |||||
550 | is equivalent to: | ||||
551 | |||||
552 | $octets = encode($to, decode($from, $octets), $check); | ||||
553 | |||||
554 | Yes, it does I<not> respect the $check during decoding. It is | ||||
555 | deliberately done that way. If you need minute control, use C<decode> | ||||
556 | followed by C<encode> as follows: | ||||
557 | |||||
558 | $octets = encode($to, decode($from, $octets, $check_from), $check_to); | ||||
559 | |||||
560 | =head3 encode_utf8 | ||||
561 | |||||
562 | $octets = encode_utf8($string); | ||||
563 | |||||
564 | Equivalent to C<$octets = encode("utf8", $string)>. The characters in | ||||
565 | $string are encoded in Perl's internal format, and the result is returned | ||||
566 | as a sequence of octets. Because all possible characters in Perl have a | ||||
567 | (loose, not strict) UTF-8 representation, this function cannot fail. | ||||
568 | |||||
569 | =head3 decode_utf8 | ||||
570 | |||||
571 | $string = decode_utf8($octets [, CHECK]); | ||||
572 | |||||
573 | Equivalent to C<$string = decode("utf8", $octets [, CHECK])>. | ||||
574 | The sequence of octets represented by $octets is decoded | ||||
575 | from UTF-8 into a sequence of logical characters. | ||||
576 | Because not all sequences of octets are valid UTF-8, | ||||
577 | it is quite possible for this function to fail. | ||||
578 | For CHECK, see L</"Handling Malformed Data">. | ||||
579 | |||||
580 | =head2 Listing available encodings | ||||
581 | |||||
582 | use Encode; | ||||
583 | @list = Encode->encodings(); | ||||
584 | |||||
585 | Returns a list of canonical names of available encodings that have already | ||||
586 | been loaded. To get a list of all available encodings including those that | ||||
587 | have not yet been loaded, say: | ||||
588 | |||||
589 | @all_encodings = Encode->encodings(":all"); | ||||
590 | |||||
591 | Or you can give the name of a specific module: | ||||
592 | |||||
593 | @with_jp = Encode->encodings("Encode::JP"); | ||||
594 | |||||
595 | When "C<::>" is not in the name, "C<Encode::>" is assumed. | ||||
596 | |||||
597 | @ebcdic = Encode->encodings("EBCDIC"); | ||||
598 | |||||
599 | To find out in detail which encodings are supported by this package, | ||||
600 | see L<Encode::Supported>. | ||||
601 | |||||
602 | =head2 Defining Aliases | ||||
603 | |||||
604 | To add a new alias to a given encoding, use: | ||||
605 | |||||
606 | use Encode; | ||||
607 | use Encode::Alias; | ||||
608 | define_alias(NEWNAME => ENCODING); | ||||
609 | |||||
610 | After that, I<NEWNAME> can be used as an alias for I<ENCODING>. | ||||
611 | I<ENCODING> may be either the name of an encoding or an | ||||
612 | I<encoding object>. | ||||
613 | |||||
614 | Before you do that, first make sure the alias is nonexistent using | ||||
615 | C<resolve_alias()>, which returns the canonical name thereof. | ||||
616 | For example: | ||||
617 | |||||
618 | Encode::resolve_alias("latin1") eq "iso-8859-1" # true | ||||
619 | Encode::resolve_alias("iso-8859-12") # false; nonexistent | ||||
620 | Encode::resolve_alias($name) eq $name # true if $name is canonical | ||||
621 | |||||
622 | C<resolve_alias()> does not need C<use Encode::Alias>; it can be | ||||
623 | imported via C<use Encode qw(resolve_alias)>. | ||||
624 | |||||
625 | See L<Encode::Alias> for details. | ||||
626 | |||||
627 | =head2 Finding IANA Character Set Registry names | ||||
628 | |||||
629 | The canonical name of a given encoding does not necessarily agree with | ||||
630 | IANA Character Set Registry, commonly seen as C<< Content-Type: | ||||
631 | text/plain; charset=I<WHATEVER> >>. For most cases, the canonical name | ||||
632 | works, but sometimes it does not, most notably with "utf-8-strict". | ||||
633 | |||||
634 | As of C<Encode> version 2.21, a new method C<mime_name()> is therefore added. | ||||
635 | |||||
636 | use Encode; | ||||
637 | my $enc = find_encoding("UTF-8"); | ||||
638 | warn $enc->name; # utf-8-strict | ||||
639 | warn $enc->mime_name; # UTF-8 | ||||
640 | |||||
641 | See also: L<Encode::Encoding> | ||||
642 | |||||
643 | =head1 Encoding via PerlIO | ||||
644 | |||||
645 | If your perl supports C<PerlIO> (which is the default), you can use a | ||||
646 | C<PerlIO> layer to decode and encode directly via a filehandle. The | ||||
647 | following two examples are fully identical in functionality: | ||||
648 | |||||
649 | ### Version 1 via PerlIO | ||||
650 | open(INPUT, "< :encoding(shiftjis)", $infile) | ||||
651 | || die "Can't open < $infile for reading: $!"; | ||||
652 | open(OUTPUT, "> :encoding(euc-jp)", $outfile) | ||||
653 | || die "Can't open > $outfile for writing: $!"; | ||||
654 | while (<INPUT>) { # auto decodes $_ | ||||
655 | print OUTPUT; # auto encodes $_ | ||||
656 | } | ||||
657 | close(INPUT) || die "can't close $infile: $!"; | ||||
658 | close(OUTPUT) || die "can't close $outfile: $!"; | ||||
659 | |||||
660 | ### Version 2 via from_to() | ||||
661 | open(INPUT, "< :raw", $infile) | ||||
662 | || die "Can't open < $infile for reading: $!"; | ||||
663 | open(OUTPUT, "> :raw", $outfile) | ||||
664 | || die "Can't open > $outfile for writing: $!"; | ||||
665 | |||||
666 | while (<INPUT>) { | ||||
667 | from_to($_, "shiftjis", "euc-jp", 1); # switch encoding | ||||
668 | print OUTPUT; # emit raw (but properly encoded) data | ||||
669 | } | ||||
670 | close(INPUT) || die "can't close $infile: $!"; | ||||
671 | close(OUTPUT) || die "can't close $outfile: $!"; | ||||
672 | |||||
673 | In the first version above, you let the appropriate encoding layer | ||||
674 | handle the conversion. In the second, you explicitly translate | ||||
675 | from one encoding to the other. | ||||
676 | |||||
677 | Unfortunately, it may be that encodings are not C<PerlIO>-savvy. You can check | ||||
678 | to see whether your encoding is supported by C<PerlIO> by invoking the | ||||
679 | C<perlio_ok> method on it: | ||||
680 | |||||
681 | Encode::perlio_ok("hz"); # false | ||||
682 | find_encoding("euc-cn")->perlio_ok; # true wherever PerlIO is available | ||||
683 | |||||
684 | use Encode qw(perlio_ok); # imported upon request | ||||
685 | perlio_ok("euc-jp") | ||||
686 | |||||
687 | Fortunately, all encodings that come with C<Encode> core are C<PerlIO>-savvy | ||||
688 | except for C<hz> and C<ISO-2022-kr>. For the gory details, see | ||||
689 | L<Encode::Encoding> and L<Encode::PerlIO>. | ||||
690 | |||||
691 | =head1 Handling Malformed Data | ||||
692 | |||||
693 | The optional I<CHECK> argument tells C<Encode> what to do when | ||||
694 | encountering malformed data. Without I<CHECK>, C<Encode::FB_DEFAULT> | ||||
695 | (== 0) is assumed. | ||||
696 | |||||
697 | As of version 2.12, C<Encode> supports coderef values for C<CHECK>; | ||||
698 | see below. | ||||
699 | |||||
700 | B<NOTE:> Not all encodings support this feature. | ||||
701 | Some encodings ignore the I<CHECK> argument. For example, | ||||
702 | L<Encode::Unicode> ignores I<CHECK> and it always croaks on error. | ||||
703 | |||||
704 | =head2 List of I<CHECK> values | ||||
705 | |||||
706 | =head3 FB_DEFAULT | ||||
707 | |||||
708 | I<CHECK> = Encode::FB_DEFAULT ( == 0) | ||||
709 | |||||
710 | If I<CHECK> is 0, encoding and decoding replace any malformed character | ||||
711 | with a I<substitution character>. When you encode, I<SUBCHAR> is used. | ||||
712 | When you decode, the Unicode REPLACEMENT CHARACTER, code point U+FFFD, is | ||||
713 | used. If the data is supposed to be UTF-8, an optional lexical warning of | ||||
714 | warning category C<"utf8"> is given. | ||||
715 | |||||
716 | =head3 FB_CROAK | ||||
717 | |||||
718 | I<CHECK> = Encode::FB_CROAK ( == 1) | ||||
719 | |||||
720 | If I<CHECK> is 1, methods immediately die with an error | ||||
721 | message. Therefore, when I<CHECK> is 1, you should trap | ||||
722 | exceptions with C<eval{}>, unless you really want to let it C<die>. | ||||
723 | |||||
724 | =head3 FB_QUIET | ||||
725 | |||||
726 | I<CHECK> = Encode::FB_QUIET | ||||
727 | |||||
728 | If I<CHECK> is set to C<Encode::FB_QUIET>, encoding and decoding immediately | ||||
729 | return the portion of the data that has been processed so far when an | ||||
730 | error occurs. The data argument is overwritten with everything | ||||
731 | after that point; that is, the unprocessed portion of the data. This is | ||||
732 | handy when you have to call C<decode> repeatedly in the case where your | ||||
733 | source data may contain partial multi-byte character sequences, | ||||
734 | (that is, you are reading with a fixed-width buffer). Here's some sample | ||||
735 | code to do exactly that: | ||||
736 | |||||
737 | my($buffer, $string) = ("", ""); | ||||
738 | while (read($fh, $buffer, 256, length($buffer))) { | ||||
739 | $string .= decode($encoding, $buffer, Encode::FB_QUIET); | ||||
740 | # $buffer now contains the unprocessed partial character | ||||
741 | } | ||||
742 | |||||
743 | =head3 FB_WARN | ||||
744 | |||||
745 | I<CHECK> = Encode::FB_WARN | ||||
746 | |||||
747 | This is the same as C<FB_QUIET> above, except that instead of being silent | ||||
748 | on errors, it issues a warning. This is handy for when you are debugging. | ||||
749 | |||||
750 | =head3 FB_PERLQQ FB_HTMLCREF FB_XMLCREF | ||||
751 | |||||
752 | =over 2 | ||||
753 | |||||
754 | =item perlqq mode (I<CHECK> = Encode::FB_PERLQQ) | ||||
755 | |||||
756 | =item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF) | ||||
757 | |||||
758 | =item XML charref mode (I<CHECK> = Encode::FB_XMLCREF) | ||||
759 | |||||
760 | =back | ||||
761 | |||||
762 | For encodings that are implemented by the C<Encode::XS> module, C<CHECK> C<==> | ||||
763 | C<Encode::FB_PERLQQ> puts C<encode> and C<decode> into C<perlqq> fallback mode. | ||||
764 | |||||
765 | When you decode, C<\xI<HH>> is inserted for a malformed character, where | ||||
766 | I<HH> is the hex representation of the octet that could not be decoded to | ||||
767 | utf8. When you encode, C<\x{I<HHHH>}> will be inserted, where I<HHHH> is | ||||
768 | the Unicode code point (in any number of hex digits) of the character that | ||||
769 | cannot be found in the character repertoire of the encoding. | ||||
770 | |||||
771 | The HTML/XML character reference modes are about the same. In place of | ||||
772 | C<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number, and | ||||
773 | XML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number. | ||||
774 | |||||
775 | In C<Encode> 2.10 or later, C<LEAVE_SRC> is also implied. | ||||
776 | |||||
777 | =head3 The bitmask | ||||
778 | |||||
779 | These modes are all actually set via a bitmask. Here is how the C<FB_I<XXX>> | ||||
780 | constants are laid out. You can import the C<FB_I<XXX>> constants via | ||||
781 | C<use Encode qw(:fallbacks)>, and you can import the generic bitmask | ||||
782 | constants via C<use Encode qw(:fallback_all)>. | ||||
783 | |||||
784 |                      FB_DEFAULT FB_CROAK FB_QUIET FB_WARN  FB_PERLQQ | ||||
785 |  DIE_ON_ERR    0x0001             X | ||||
786 |  WARN_ON_ERR   0x0002                               X | ||||
787 |  RETURN_ON_ERR 0x0004                      X        X | ||||
788 |  LEAVE_SRC     0x0008                                        X | ||||
789 |  PERLQQ        0x0100                                        X | ||||
790 |  HTMLCREF      0x0200 | ||||
791 |  XMLCREF       0x0400 | ||||
792 | |||||
793 | =head3 LEAVE_SRC | ||||
794 | |||||
795 | Encode::LEAVE_SRC | ||||
796 | |||||
797 | If the C<Encode::LEAVE_SRC> bit is I<not> set but I<CHECK> is set, then the | ||||
798 | source string to encode() or decode() will be overwritten in place. | ||||
799 | To keep the source string intact, bitwise-OR C<Encode::LEAVE_SRC> into I<CHECK>. | ||||
800 | |||||
801 | =head2 coderef for CHECK | ||||
802 | |||||
803 | As of C<Encode> 2.12, C<CHECK> can also be a code reference which takes the | ||||
804 | ordinal value of the unmapped character as an argument and returns | ||||
805 | octets that represent the fallback character. For instance: | ||||
806 | |||||
807 | $ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift }); | ||||
808 | |||||
809 | This acts like C<FB_PERLQQ>, but C<< <U+I<XXXX>> >> is used instead of C<\x{I<XXXX>}>. | ||||
810 | |||||
811 | Even the fallback for C<decode> must return octets, which are | ||||
812 | then decoded with the character encoding that C<decode> accepts. So for | ||||
813 | example if you wish to decode octets as UTF-8, and use ISO-8859-15 as | ||||
814 | a fallback for bytes that are not valid UTF-8, you could write | ||||
815 | |||||
816 | $str = decode 'UTF-8', $octets, sub { | ||||
817 | my $tmp = chr shift; | ||||
818 | from_to $tmp, 'ISO-8859-15', 'UTF-8'; | ||||
819 | return $tmp; | ||||
820 | }; | ||||
821 | |||||
822 | =head1 Defining Encodings | ||||
823 | |||||
824 | To define a new encoding, use: | ||||
825 | |||||
826 | use Encode qw(define_encoding); | ||||
827 | define_encoding($object, CANONICAL_NAME [, alias...]); | ||||
828 | |||||
829 | I<CANONICAL_NAME> will be associated with I<$object>. The object | ||||
830 | should provide the interface described in L<Encode::Encoding>. | ||||
831 | If more than two arguments are provided, additional | ||||
832 | arguments are considered aliases for I<$object>. | ||||
833 | |||||
834 | See L<Encode::Encoding> for details. | ||||
835 | |||||
836 | =head1 The UTF8 flag | ||||
837 | |||||
838 | Before the introduction of Unicode support in Perl, the C<eq> operator | ||||
839 | just compared the strings represented by two scalars. Beginning with | ||||
840 | Perl 5.8, C<eq> compares two strings with simultaneous consideration of | ||||
841 | I<the UTF8 flag>. To explain why we made it so, I quote from page 402 of | ||||
842 | I<Programming Perl, 3rd ed.> | ||||
843 | |||||
844 | =over 2 | ||||
845 | |||||
846 | =item Goal #1: | ||||
847 | |||||
848 | Old byte-oriented programs should not spontaneously break on the old | ||||
849 | byte-oriented data they used to work on. | ||||
850 | |||||
851 | =item Goal #2: | ||||
852 | |||||
853 | Old byte-oriented programs should magically start working on the new | ||||
854 | character-oriented data when appropriate. | ||||
855 | |||||
856 | =item Goal #3: | ||||
857 | |||||
858 | Programs should run just as fast in the new character-oriented mode | ||||
859 | as in the old byte-oriented mode. | ||||
860 | |||||
861 | =item Goal #4: | ||||
862 | |||||
863 | Perl should remain one language, rather than forking into a | ||||
864 | byte-oriented Perl and a character-oriented Perl. | ||||
865 | |||||
866 | =back | ||||
867 | |||||
868 | When I<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0 had been | ||||
869 | born yet, and many features documented in the book remained unimplemented for a | ||||
870 | long time. Perl 5.8 corrected much of this, and the introduction of the | ||||
871 | UTF8 flag is one of them. You can think of there being two fundamentally | ||||
872 | different kinds of strings and string-operations in Perl: one a | ||||
873 | byte-oriented mode for when the internal UTF8 flag is off, and the other a | ||||
874 | character-oriented mode for when the internal UTF8 flag is on. | ||||
875 | |||||
876 | Here is how C<Encode> handles the UTF8 flag. | ||||
877 | |||||
878 | =over 2 | ||||
879 | |||||
880 | =item * | ||||
881 | |||||
882 | When you I<encode>, the resulting UTF8 flag is always B<off>. | ||||
883 | |||||
884 | =item * | ||||
885 | |||||
886 | When you I<decode>, the resulting UTF8 flag is B<on>--I<unless> you can | ||||
887 | unambiguously represent data. Here is what we mean by "unambiguously". | ||||
888 | After C<$utf8 = decode("foo", $octet)>, | ||||
889 | |||||
890 | When $octet is... The UTF8 flag in $utf8 is | ||||
891 | --------------------------------------------- | ||||
892 | In ASCII only (or EBCDIC only) OFF | ||||
893 | In ISO-8859-1 ON | ||||
894 | In any other Encoding ON | ||||
895 | --------------------------------------------- | ||||
896 | |||||
897 | As you see, there is one exception: in ASCII. That way you can assume | ||||
898 | Goal #1. And with C<Encode>, Goal #2 is assumed but you still have to be | ||||
899 | careful in the cases mentioned in the B<CAVEAT> paragraphs above. | ||||
900 | |||||
901 | This UTF8 flag is not visible in Perl scripts, exactly for the same reason | ||||
902 | you cannot (or rather, you I<don't have to>) see whether a scalar contains | ||||
903 | a string, an integer, or a floating-point number. But you can still peek | ||||
904 | and poke these if you will. See the next section. | ||||
905 | |||||
906 | =back | ||||
907 | |||||
908 | =head2 Messing with Perl's Internals | ||||
909 | |||||
910 | The following APIs use parts of Perl's internals in the current | ||||
911 | implementation. As such, they are efficient but may change in a future | ||||
912 | release. | ||||
913 | |||||
914 | =head3 is_utf8 | ||||
915 | |||||
916 | is_utf8(STRING [, CHECK]) | ||||
917 | |||||
918 | [INTERNAL] Tests whether the UTF8 flag is turned on in the I<STRING>. | ||||
919 | If I<CHECK> is true, also checks whether I<STRING> contains well-formed | ||||
920 | UTF-8. Returns true if successful, false otherwise. | ||||
921 | |||||
922 | As of Perl 5.8.1, L<utf8> also has the C<utf8::is_utf8> function. | ||||
923 | |||||
924 | =head3 _utf8_on | ||||
925 | |||||
926 | _utf8_on(STRING) | ||||
927 | |||||
928 | [INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<on>. The I<STRING> | ||||
929 | is I<not> checked for containing only well-formed UTF-8. Do not use this | ||||
930 | unless you I<know with absolute certainty> that the STRING holds only | ||||
931 | well-formed UTF-8. Returns the previous state of the UTF8 flag (so please | ||||
932 | don't treat the return value as indicating success or failure), or C<undef> | ||||
933 | if I<STRING> is not a string. | ||||
934 | |||||
935 | B<NOTE>: For security reasons, this function does not work on tainted values. | ||||
936 | |||||
937 | =head3 _utf8_off | ||||
938 | |||||
939 | _utf8_off(STRING) | ||||
940 | |||||
941 | [INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<off>. Do not use | ||||
942 | frivolously. Returns the previous state of the UTF8 flag, or C<undef> if | ||||
943 | I<STRING> is not a string. Do not treat the return value as indicative of | ||||
944 | success or failure, because that isn't what it means: it is only the | ||||
945 | previous setting. | ||||
946 | |||||
947 | B<NOTE>: For security reasons, this function does not work on tainted values. | ||||
948 | |||||
949 | =head1 UTF-8 vs. utf8 vs. UTF8 | ||||
950 | |||||
951 | ....We now view strings not as sequences of bytes, but as sequences | ||||
952 | of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit | ||||
953 | computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed. | ||||
954 | |||||
955 | That has historically been Perl's notion of UTF-8, as that is how UTF-8 was | ||||
956 | first conceived by Ken Thompson when he invented it. However, thanks to | ||||
957 | later revisions to the applicable standards, official UTF-8 is now rather | ||||
958 | stricter than that. For example, its range is much narrower (0 .. 0x10_FFFF | ||||
959 | to cover only 21 bits instead of 32 or 64 bits) and some sequences | ||||
960 | are not allowed, like those used in surrogate pairs, the 32 non-character | ||||
961 | code points 0xFDD0 .. 0xFDEF, the last two code points in I<any> plane | ||||
962 | (0xI<XX>_FFFE and 0xI<XX>_FFFF), all non-shortest encodings, etc. | ||||
963 | |||||
964 | The former default in which Perl would always use a loose interpretation of | ||||
965 | UTF-8 has now been overruled: | ||||
966 | |||||
967 | From: Larry Wall <larry@wall.org> | ||||
968 | Date: December 04, 2004 11:51:58 JST | ||||
969 | To: perl-unicode@perl.org | ||||
970 | Subject: Re: Make Encode.pm support the real UTF-8 | ||||
971 | Message-Id: <20041204025158.GA28754@wall.org> | ||||
972 | |||||
973 | On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote: | ||||
974 | : I've no problem with 'utf8' being perl's unrestricted uft8 encoding, | ||||
975 | : but "UTF-8" is the name of the standard and should give the | ||||
976 | : corresponding behaviour. | ||||
977 | |||||
978 | For what it's worth, that's how I've always kept them straight in my | ||||
979 | head. | ||||
980 | |||||
981 | Also for what it's worth, Perl 6 will mostly default to strict but | ||||
982 | make it easy to switch back to lax. | ||||
983 | |||||
984 | Larry | ||||
985 | |||||
986 | Got that? As of Perl 5.8.7, B<"UTF-8"> means UTF-8 in its current | ||||
987 | sense, which is conservative and strict and security-conscious, whereas | ||||
988 | B<"utf8"> means UTF-8 in its former sense, which was liberal and loose and | ||||
989 | lax. C<Encode> version 2.10 or later thus groks this subtle but critically | ||||
990 | important distinction between C<"UTF-8"> and C<"utf8">. | ||||
991 | |||||
992 | encode("utf8", "\x{FFFF_FFFF}", 1); # okay | ||||
993 | encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks | ||||
994 | |||||
995 | In the C<Encode> module, C<"UTF-8"> is actually a canonical name for | ||||
996 | C<"utf-8-strict">. That hyphen between the C<"UTF"> and the C<"8"> is | ||||
997 | critical; without it, C<Encode> goes "liberal" and (perhaps overly-)permissive: | ||||
998 | |||||
999 | find_encoding("UTF-8")->name # is 'utf-8-strict' | ||||
1000 | find_encoding("utf-8")->name # ditto. names are case insensitive | ||||
1001 | find_encoding("utf_8")->name # ditto. "_" are treated as "-" | ||||
1002 | find_encoding("UTF8")->name # is 'utf8'. | ||||
1003 | |||||
1004 | Perl's internal UTF8 flag is called "UTF8", without a hyphen. It indicates | ||||
1005 | whether a string is internally encoded as "utf8", also without a hyphen. | ||||
1006 | |||||
1007 | =head1 SEE ALSO | ||||
1008 | |||||
1009 | L<Encode::Encoding>, | ||||
1010 | L<Encode::Supported>, | ||||
1011 | L<Encode::PerlIO>, | ||||
1012 | L<encoding>, | ||||
1013 | L<perlebcdic>, | ||||
1014 | L<perlfunc/open>, | ||||
1015 | L<perlunicode>, L<perluniintro>, L<perlunifaq>, L<perlunitut>, | ||||
1016 | L<utf8>, | ||||
1017 | the Perl Unicode Mailing List L<http://lists.perl.org/list/perl-unicode.html> | ||||
1018 | |||||
1019 | =head1 MAINTAINER | ||||
1020 | |||||
1021 | This project was originated by the late Nick Ing-Simmons and later | ||||
1022 | maintained by Dan Kogai I<< <dankogai@cpan.org> >>. See AUTHORS | ||||
1023 | for a full list of people involved. For any questions, send mail to | ||||
1024 | I<< <perl-unicode@perl.org> >> so that we can all share. | ||||
1025 | |||||
1026 | While Dan Kogai retains the copyright as a maintainer, credit | ||||
1027 | should go to all those involved. See AUTHORS for a list of those | ||||
1028 | who submitted code to the project. | ||||
1029 | |||||
1030 | =head1 COPYRIGHT | ||||
1031 | |||||
1032 | Copyright 2002-2013 Dan Kogai I<< <dankogai@cpan.org> >>. | ||||
1033 | |||||
1034 | This library is free software; you can redistribute it and/or modify | ||||
1035 | it under the same terms as Perl itself. | ||||
1036 | |||||
1037 | =cut |