Profile of Encode.pm

Filename	/usr/lib/x86_64-linux-gnu/perl/5.20/Encode.pm
Statements	Executed 0 statements in 0s

Line	Calls	Time in subs	Code
1			#
2			# $Id: Encode.pm,v 2.60 2014/04/29 16:26:49 dankogai Exp dankogai $
3			#
4			package Encode;
5			use strict;
6			use warnings;
7			our $VERSION = sprintf "%d.%02d", q$Revision: 2.60_01 $ =~ /(\d+)/g;
8			use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
9			use XSLoader ();
10			XSLoader::load( __PACKAGE__, $VERSION );
11
12			use Exporter 5.57 'import';
13
14			# Public, encouraged API is exported by default
15
16			our @EXPORT = qw(
17			decode decode_utf8 encode encode_utf8 str2bytes bytes2str
18			encodings find_encoding clone_encoding
19			);
20			our @FB_FLAGS = qw(
21			DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC
22			PERLQQ HTMLCREF XMLCREF STOP_AT_PARTIAL
23			);
24			our @FB_CONSTS = qw(
25			FB_DEFAULT FB_CROAK FB_QUIET FB_WARN
26			FB_PERLQQ FB_HTMLCREF FB_XMLCREF
27			);
28			our @EXPORT_OK = (
29			qw(
30			_utf8_off _utf8_on define_encoding from_to is_16bit is_8bit
31			is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade
32			),
33			@FB_FLAGS, @FB_CONSTS,
34			);
35
36			our %EXPORT_TAGS = (
37			all => [ @EXPORT, @EXPORT_OK ],
38			default => [ @EXPORT ],
39			fallbacks => [ @FB_CONSTS ],
40			fallback_all => [ @FB_CONSTS, @FB_FLAGS ],
41			);
42
43			# Documentation moved after __END__ for speed - NI-S
44
45			our $ON_EBCDIC = ( ord("A") == 193 );
46
47			use Encode::Alias;
48
49			# Make a %Encoding package variable to allow a certain amount of cheating
50			our %Encoding;
51			our %ExtModule;
52			require Encode::Config;
53			# See
54			# https://bugzilla.redhat.com/show_bug.cgi?id=435505#c2
55			# to find why sig handlers inside eval{} are disabled.
56			eval {
57			local $SIG{__DIE__};
58			local $SIG{__WARN__};
59			require Encode::ConfigLocal;
60			};
61
62			sub encodings {
63			my %enc;
64			my $arg = $_[1] \|\| '';
65			if ( $arg eq ":all" ) {
66			%enc = ( %Encoding, %ExtModule );
67			}
68			else {
69			%enc = %Encoding;
70			for my $mod ( map { m/::/ ? $_ : "Encode::$_" } @_ ) {
71			DEBUG and warn $mod;
72			for my $enc ( keys %ExtModule ) {
73			$ExtModule{$enc} eq $mod and $enc{$enc} = $mod;
74			}
75			}
76			}
77			return sort { lc $a cmp lc $b }
78			grep { !/^(?:Internal\|Unicode\|Guess)$/o } keys %enc;
79			}
80
81			sub perlio_ok {
82			my $obj = ref( $_[0] ) ? $_[0] : find_encoding( $_[0] );
83			$obj->can("perlio_ok") and return $obj->perlio_ok();
84			return 0; # safety net
85			}
86
87			sub define_encoding {
88			my $obj = shift;
89			my $name = shift;
90			$Encoding{$name} = $obj;
91			my $lc = lc($name);
92			define_alias( $lc => $obj ) unless $lc eq $name;
93			while (@_) {
94			my $alias = shift;
95			define_alias( $alias, $obj );
96			}
97			return $obj;
98			}
99
100			sub getEncoding {
101			my ( $class, $name, $skip_external ) = @_;
102
103			$name =~ s/\s+//g; # https://rt.cpan.org/Ticket/Display.html?id=65796
104
105			ref($name) && $name->can('renew') and return $name;
106			exists $Encoding{$name} and return $Encoding{$name};
107			my $lc = lc $name;
108			exists $Encoding{$lc} and return $Encoding{$lc};
109
110			my $oc = $class->find_alias($name);
111			defined($oc) and return $oc;
112			$lc ne $name and $oc = $class->find_alias($lc);
113			defined($oc) and return $oc;
114
115			unless ($skip_external) {
116			if ( my $mod = $ExtModule{$name} \|\| $ExtModule{$lc} ) {
117			$mod =~ s,::,/,g;
118			$mod .= '.pm';
119			eval { require $mod; };
120			exists $Encoding{$name} and return $Encoding{$name};
121			}
122			}
123			return;
124			}
125
126			sub find_encoding($;$) {
127			my ( $name, $skip_external ) = @_;
128			return __PACKAGE__->getEncoding( $name, $skip_external );
129			}
130
131			sub resolve_alias($) {
132			my $obj = find_encoding(shift);
133			defined $obj and return $obj->name;
134			return;
135			}
136
137			sub clone_encoding($) {
138			my $obj = find_encoding(shift);
139			ref $obj or return;
140			eval { require Storable };
141			$@ and return;
142			return Storable::dclone($obj);
143			}
144
145			sub encode($$;$) {
146			my ( $name, $string, $check ) = @_;
147			return undef unless defined $string;
148	12	24µs	$string .= ''; # stringify; # spent 24µs making 12 calls to Text::MicroTemplate::EncodedString::__ANON__, avg 2µs/call
149			$check \|\|= 0;
150			unless ( defined $name ) {
151			require Carp;
152			Carp::croak("Encoding name should not be undef");
153			}
154			my $enc = find_encoding($name);
155			unless ( defined $enc ) {
156			require Carp;
157			Carp::croak("Unknown encoding '$name'");
158			}
159			my $octets = $enc->encode( $string, $check );
160			$_[1] = $string if $check and !ref $check and !( $check & LEAVE_SRC() );
161			return $octets;
162			}
163			*str2bytes = \&encode;
164
165			sub decode($$;$) {
166			my ( $name, $octets, $check ) = @_;
167			return undef unless defined $octets;
168			$octets .= '';
169			$check \|\|= 0;
170			my $enc = find_encoding($name);
171			unless ( defined $enc ) {
172			require Carp;
173			Carp::croak("Unknown encoding '$name'");
174			}
175	90	208µs	my $string = $enc->decode( $octets, $check ); # spent 208µs making 90 calls to Encode::Encoding::renewed, avg 2µs/call
176			$_[1] = $octets if $check and !ref $check and !( $check & LEAVE_SRC() );
177			return $string;
178			}
179			*bytes2str = \&decode;
180
181			sub from_to($$$;$) {
182			my ( $string, $from, $to, $check ) = @_;
183			return undef unless defined $string;
184			$check \|\|= 0;
185			my $f = find_encoding($from);
186			unless ( defined $f ) {
187			require Carp;
188			Carp::croak("Unknown encoding '$from'");
189			}
190			my $t = find_encoding($to);
191			unless ( defined $t ) {
192			require Carp;
193			Carp::croak("Unknown encoding '$to'");
194			}
195			my $uni = $f->decode($string);
196			$_[0] = $string = $t->encode( $uni, $check );
197			return undef if ( $check && length($uni) );
198			return defined( $_[0] ) ? length($string) : undef;
199			}
200
201			sub encode_utf8($) {
202			my ($str) = @_;
203			utf8::encode($str);
204			return $str;
205			}
206
207			my $utf8enc;
208
209			sub decode_utf8($;$) {
210			my ( $octets, $check ) = @_;
211			return undef unless defined $octets;
212			$octets .= '';
213			$check \|\|= 0;
214			$utf8enc \|\|= find_encoding('utf8');
215			my $string = $utf8enc->decode( $octets, $check );
216			$_[0] = $octets if $check and !ref $check and !( $check & LEAVE_SRC() );
217			return $string;
218			}
219
220			# sub decode_utf8($;$) {
221			# my ( $str, $check ) = @_;
222			# return $str if is_utf8($str);
223			# if ($check) {
224			# return decode( "utf8", $str, $check );
225			# }
226			# else {
227			# return decode( "utf8", $str );
228			# return $str;
229			# }
230			# }
231
232			predefine_encodings(1);
233
234			#
235			# This is to restore %Encoding if really needed;
236			#
237
238			sub predefine_encodings {
239			require Encode::Encoding;
240			no warnings 'redefine';
241			my $use_xs = shift;
242			if ($ON_EBCDIC) {
243
244			# was in Encode::UTF_EBCDIC
245			package Encode::UTF_EBCDIC;
246			push @Encode::UTF_EBCDIC::ISA, 'Encode::Encoding';
247			*decode = sub {
248			my ( undef, $str, $chk ) = @_;
249			my $res = '';
250			for ( my $i = 0 ; $i < length($str) ; $i++ ) {
251			$res .=
252			chr(
253			utf8::unicode_to_native( ord( substr( $str, $i, 1 ) ) )
254			);
255			}
256			$_[1] = '' if $chk;
257			return $res;
258			};
259			*encode = sub {
260			my ( undef, $str, $chk ) = @_;
261			my $res = '';
262			for ( my $i = 0 ; $i < length($str) ; $i++ ) {
263			$res .=
264			chr(
265			utf8::native_to_unicode( ord( substr( $str, $i, 1 ) ) )
266			);
267			}
268			$_[1] = '' if $chk;
269			return $res;
270			};
271			$Encode::Encoding{Unicode} =
272			bless { Name => "UTF_EBCDIC" } => "Encode::UTF_EBCDIC";
273			}
274			else {
275
276			package Encode::Internal;
277			push @Encode::Internal::ISA, 'Encode::Encoding';
278			*decode = sub {
279			my ( undef, $str, $chk ) = @_;
280			utf8::upgrade($str);
281			$_[1] = '' if $chk;
282			return $str;
283			};
284			*encode = \&decode;
285			$Encode::Encoding{Unicode} =
286			bless { Name => "Internal" } => "Encode::Internal";
287			}
288
289			{
290
291			# was in Encode::utf8
292			package Encode::utf8;
293			push @Encode::utf8::ISA, 'Encode::Encoding';
294
295			#
296			if ($use_xs) {
297			Encode::DEBUG and warn __PACKAGE__, " XS on";
298			*decode = \&decode_xs;
299			*encode = \&encode_xs;
300			}
301			else {
302			Encode::DEBUG and warn __PACKAGE__, " XS off";
303			*decode = sub {
304			my ( undef, $octets, $chk ) = @_;
305			my $str = Encode::decode_utf8($octets);
306			if ( defined $str ) {
307			$_[1] = '' if $chk;
308			return $str;
309			}
310			return undef;
311			};
312			*encode = sub {
313			my ( undef, $string, $chk ) = @_;
314			my $octets = Encode::encode_utf8($string);
315			$_[1] = '' if $chk;
316			return $octets;
317			};
318			}
319			*cat_decode = sub { # ($obj, $dst, $src, $pos, $trm, $chk)
320			# currently ignores $chk
321			my ( undef, undef, undef, $pos, $trm ) = @_;
322			my ( $rdst, $rsrc, $rpos ) = \@_[ 1, 2, 3 ];
323			use bytes;
324			if ( ( my $npos = index( $$rsrc, $trm, $pos ) ) >= 0 ) {
325			$$rdst .=
326			substr( $$rsrc, $pos, $npos - $pos + length($trm) );
327			$$rpos = $npos + length($trm);
328			return 1;
329			}
330			$$rdst .= substr( $$rsrc, $pos );
331			$$rpos = length($$rsrc);
332			return '';
333			};
334			$Encode::Encoding{utf8} =
335			bless { Name => "utf8" } => "Encode::utf8";
336			$Encode::Encoding{"utf-8-strict"} =
337			bless { Name => "utf-8-strict", strict_utf8 => 1 }
338			=> "Encode::utf8";
339			}
340			}
341
342			1;
343
344			__END__
345
346			=head1 NAME
347
348			Encode - character encodings in Perl
349
350			=head1 SYNOPSIS
351
352			use Encode qw(decode encode);
353			$characters = decode('UTF-8', $octets, Encode::FB_CROAK);
354			$octets = encode('UTF-8', $characters, Encode::FB_CROAK);
355
356			=head2 Table of Contents
357
358			Encode consists of a collection of modules whose details are too extensive
359			to fit in one document. This one itself explains the top-level APIs
360			and general topics at a glance. For other topics and more details,
361			see the documentation for these modules:
362
363			=over 2
364
365			=item L<Encode::Alias> - Alias definitions to encodings
366
367			=item L<Encode::Encoding> - Encode Implementation Base Class
368
369			=item L<Encode::Supported> - List of Supported Encodings
370
371			=item L<Encode::CN> - Simplified Chinese Encodings
372
373			=item L<Encode::JP> - Japanese Encodings
374
375			=item L<Encode::KR> - Korean Encodings
376
377			=item L<Encode::TW> - Traditional Chinese Encodings
378
379			=back
380
381			=head1 DESCRIPTION
382
383			The C<Encode> module provides the interface between Perl strings
384			and the rest of the system. Perl strings are sequences of
385			I<characters>.
386
387			The repertoire of characters that Perl can represent is a superset of those
388			defined by the Unicode Consortium. On most platforms the ordinal
389			values of a character as returned by C<ord(I<S>)> is the I<Unicode
390			codepoint> for that character. The exceptions are platforms where
391			the legacy encoding is some variant of EBCDIC rather than a superset
392			of ASCII; see L<perlebcdic>.
393
394			During recent history, data is moved around a computer in 8-bit chunks,
395			often called "bytes" but also known as "octets" in standards documents.
396			Perl is widely used to manipulate data of many types: not only strings of
397			characters representing human or computer languages, but also "binary"
398			data, being the machine's representation of numbers, pixels in an image, or
399			just about anything.
400
401			When Perl is processing "binary data", the programmer wants Perl to
402			process "sequences of bytes". This is not a problem for Perl: because a
403			byte has 256 possible values, it easily fits in Perl's much larger
404			"logical character".
405
406			This document mostly explains the I<how>. L<perlunitut> and L<perlunifaq>
407			explain the I<why>.
408
409			=head2 TERMINOLOGY
410
411			=head3 character
412
413			A character in the range 0 .. 2**32-1 (or more);
414			what Perl's strings are made of.
415
416			=head3 byte
417
418			A character in the range 0..255;
419			a special case of a Perl character.
420
421			=head3 octet
422
423			8 bits of data, with ordinal values 0..255;
424			term for bytes passed to or from a non-Perl context, such as a disk file,
425			standard I/O stream, database, command-line argument, environment variable,
426			socket etc.
427
428			=head1 THE PERL ENCODING API
429
430			=head2 Basic methods
431
432			=head3 encode
433
434			$octets = encode(ENCODING, STRING[, CHECK])
435
436			Encodes the scalar value I<STRING> from Perl's internal form into
437			I<ENCODING> and returns a sequence of octets. I<ENCODING> can be either a
438			canonical name or an alias. For encoding names and aliases, see
439			L</"Defining Aliases">. For CHECK, see L</"Handling Malformed Data">.
440
441			For example, to convert a string from Perl's internal format into
442			ISO-8859-1, also known as Latin1:
443
444			$octets = encode("iso-8859-1", $string);
445
446			B<CAVEAT>: When you run C<$octets = encode("utf8", $string)>, then
447			$octets I<might not be equal to> $string. Though both contain the
448			same data, the UTF8 flag for $octets is I<always> off. When you
449			encode anything, the UTF8 flag on the result is always off, even when it
450			contains a completely valid utf8 string. See L</"The UTF8 flag"> below.
451
452			If the $string is C<undef>, then C<undef> is returned.
453
454			=head3 decode
455
456			$string = decode(ENCODING, OCTETS[, CHECK])
457
458			This function returns the string that results from decoding the scalar
459			value I<OCTETS>, assumed to be a sequence of octets in I<ENCODING>, into
460			Perl's internal form. As with encode(),
461			I<ENCODING> can be either a canonical name or an alias. For encoding names
462			and aliases, see L</"Defining Aliases">; for I<CHECK>, see L</"Handling
463			Malformed Data">.
464
465			For example, to convert ISO-8859-1 data into a string in Perl's
466			internal format:
467
468			$string = decode("iso-8859-1", $octets);
469
470			B<CAVEAT>: When you run C<$string = decode("utf8", $octets)>, then $string
471			I<might not be equal to> $octets. Though both contain the same data, the
472			UTF8 flag for $string is on. See L</"The UTF8 flag">
473			below.
474
475			If the $octets is C<undef>, then C<undef> is returned.
476
477			=head3 find_encoding
478
479			[$obj =] find_encoding(ENCODING)
480
481			Returns the I<encoding object> corresponding to I<ENCODING>. Returns
482			C<undef> if no matching I<ENCODING> is found. The returned object is
483			what does the actual encoding or decoding.
484
485			$utf8 = decode($name, $bytes);
486
487			is in fact
488
489			$utf8 = do {
490			$obj = find_encoding($name);
491			croak qq(encoding "$name" not found) unless ref $obj;
492			$obj->decode($bytes);
493			};
494
495			with more error checking.
496
497			You can therefore save time by reusing this object as follows;
498
499			my $enc = find_encoding("iso-8859-1");
500			while(<>) {
501			my $utf8 = $enc->decode($_);
502			... # now do something with $utf8;
503			}
504
505			Besides L</decode> and L</encode>, other methods are
506			available as well. For instance, C<name()> returns the canonical
507			name of the encoding object.
508
509			find_encoding("latin1")->name; # iso-8859-1
510
511			See L<Encode::Encoding> for details.
512
513			=head3 from_to
514
515			[$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
516
517			Converts I<in-place> data between two encodings. The data in $octets
518			must be encoded as octets and I<not> as characters in Perl's internal
519			format. For example, to convert ISO-8859-1 data into Microsoft's CP1250
520			encoding:
521
522			from_to($octets, "iso-8859-1", "cp1250");
523
524			and to convert it back:
525
526			from_to($octets, "cp1250", "iso-8859-1");
527
528			Because the conversion happens in place, the data to be
529			converted cannot be a string constant: it must be a scalar variable.
530
531			C<from_to()> returns the length of the converted string in octets on success,
532			and C<undef> on error.
533
534			B<CAVEAT>: The following operations may look the same, but are not:
535
536			from_to($data, "iso-8859-1", "utf8"); #1
537			$data = decode("iso-8859-1", $data); #2
538
539			Both #1 and #2 make $data consist of a completely valid UTF-8 string,
540			but only #2 turns the UTF8 flag on. #1 is equivalent to:
541
542			$data = encode("utf8", decode("iso-8859-1", $data));
543
544			See L</"The UTF8 flag"> below.
545
546			Also note that:
547
548			from_to($octets, $from, $to, $check);
549
550			is equivalent to:
551
552			$octets = encode($to, decode($from, $octets), $check);
553
554			Yes, it does I<not> respect the $check during decoding. It is
555			deliberately done that way. If you need minute control, use C<decode>
556			followed by C<encode> as follows:
557
558			$octets = encode($to, decode($from, $octets, $check_from), $check_to);
559
560			=head3 encode_utf8
561
562			$octets = encode_utf8($string);
563
564			Equivalent to C<$octets = encode("utf8", $string)>. The characters in
565			$string are encoded in Perl's internal format, and the result is returned
566			as a sequence of octets. Because all possible characters in Perl have a
567			(loose, not strict) UTF-8 representation, this function cannot fail.
568
569			=head3 decode_utf8
570
571			$string = decode_utf8($octets [, CHECK]);
572
573			Equivalent to C<$string = decode("utf8", $octets [, CHECK])>.
574			The sequence of octets represented by $octets is decoded
575			from UTF-8 into a sequence of logical characters.
576			Because not all sequences of octets are valid UTF-8,
577			it is quite possible for this function to fail.
578			For CHECK, see L</"Handling Malformed Data">.
579
580			=head2 Listing available encodings
581
582			use Encode;
583			@list = Encode->encodings();
584
585			Returns a list of canonical names of available encodings that have already
586			been loaded. To get a list of all available encodings including those that
587			have not yet been loaded, say:
588
589			@all_encodings = Encode->encodings(":all");
590
591			Or you can give the name of a specific module:
592
593			@with_jp = Encode->encodings("Encode::JP");
594
595			When "C<::>" is not in the name, "C<Encode::>" is assumed.
596
597			@ebcdic = Encode->encodings("EBCDIC");
598
599			To find out in detail which encodings are supported by this package,
600			see L<Encode::Supported>.
601
602			=head2 Defining Aliases
603
604			To add a new alias to a given encoding, use:
605
606			use Encode;
607			use Encode::Alias;
608			define_alias(NEWNAME => ENCODING);
609
610			After that, I<NEWNAME> can be used as an alias for I<ENCODING>.
611			I<ENCODING> may be either the name of an encoding or an
612			I<encoding object>.
613
614			Before you do that, first make sure the alias is nonexistent using
615			C<resolve_alias()>, which returns the canonical name thereof.
616			For example:
617
618			Encode::resolve_alias("latin1") eq "iso-8859-1" # true
619			Encode::resolve_alias("iso-8859-12") # false; nonexistent
620			Encode::resolve_alias($name) eq $name # true if $name is canonical
621
622			C<resolve_alias()> does not need C<use Encode::Alias>; it can be
623			imported via C<use Encode qw(resolve_alias)>.
624
625			See L<Encode::Alias> for details.
626
627			=head2 Finding IANA Character Set Registry names
628
629			The canonical name of a given encoding does not necessarily agree with
630			IANA Character Set Registry, commonly seen as C<< Content-Type:
631			text/plain; charset=I<WHATEVER> >>. For most cases, the canonical name
632			works, but sometimes it does not, most notably with "utf-8-strict".
633
634			As of C<Encode> version 2.21, a new method C<mime_name()> is therefore added.
635
636			use Encode;
637			my $enc = find_encoding("UTF-8");
638			warn $enc->name; # utf-8-strict
639			warn $enc->mime_name; # UTF-8
640
641			See also: L<Encode::Encoding>
642
643			=head1 Encoding via PerlIO
644
645			If your perl supports C<PerlIO> (which is the default), you can use a
646			C<PerlIO> layer to decode and encode directly via a filehandle. The
647			following two examples are fully identical in functionality:
648
649			### Version 1 via PerlIO
650			open(INPUT, "< :encoding(shiftjis)", $infile)
651			\|\| die "Can't open < $infile for reading: $!";
652			open(OUTPUT, "> :encoding(euc-jp)", $outfile)
653			\|\| die "Can't open > $outfile for writing: $!";
654			while (<INPUT>) { # auto decodes $_
655			print OUTPUT; # auto encodes $_
656			}
657			close(INPUT) \|\| die "can't close $infile: $!";
658			close(OUTPUT) \|\| die "can't close $outfile: $!";
659
660			### Version 2 via from_to()
661			open(INPUT, "< :raw", $infile)
662			\|\| die "Can't open < $infile for reading: $!";
663			open(OUTPUT, "> :raw", $outfile)
664			\|\| die "Can't open > $outfile for writing: $!";
665
666			while (<INPUT>) {
667			from_to($_, "shiftjis", "euc-jp", 1); # switch encoding
668			print OUTPUT; # emit raw (but properly encoded) data
669			}
670			close(INPUT) \|\| die "can't close $infile: $!";
671			close(OUTPUT) \|\| die "can't close $outfile: $!";
672
673			In the first version above, you let the appropriate encoding layer
674			handle the conversion. In the second, you explicitly translate
675			from one encoding to the other.
676
677			Unfortunately, it may be that encodings are not C<PerlIO>-savvy. You can check
678			to see whether your encoding is supported by C<PerlIO> by invoking the
679			C<perlio_ok> method on it:
680
681			Encode::perlio_ok("hz"); # false
682			find_encoding("euc-cn")->perlio_ok; # true wherever PerlIO is available
683
684			use Encode qw(perlio_ok); # imported upon request
685			perlio_ok("euc-jp")
686
687			Fortunately, all encodings that come with C<Encode> core are C<PerlIO>-savvy
688			except for C<hz> and C<ISO-2022-kr>. For the gory details, see
689			L<Encode::Encoding> and L<Encode::PerlIO>.
690
691			=head1 Handling Malformed Data
692
693			The optional I<CHECK> argument tells C<Encode> what to do when
694			encountering malformed data. Without I<CHECK>, C<Encode::FB_DEFAULT>
695			(== 0) is assumed.
696
697			As of version 2.12, C<Encode> supports coderef values for C<CHECK>;
698			see below.
699
700			B<NOTE:> Not all encodings support this feature.
701			Some encodings ignore the I<CHECK> argument. For example,
702			L<Encode::Unicode> ignores I<CHECK> and it always croaks on error.
703
704			=head2 List of I<CHECK> values
705
706			=head3 FB_DEFAULT
707
708			I<CHECK> = Encode::FB_DEFAULT ( == 0)
709
710			If I<CHECK> is 0, encoding and decoding replace any malformed character
711			with a I<substitution character>. When you encode, I<SUBCHAR> is used.
712			When you decode, the Unicode REPLACEMENT CHARACTER, code point U+FFFD, is
713			used. If the data is supposed to be UTF-8, an optional lexical warning of
714			warning category C<"utf8"> is given.
715
716			=head3 FB_CROAK
717
718			I<CHECK> = Encode::FB_CROAK ( == 1)
719
720			If I<CHECK> is 1, methods immediately die with an error
721			message. Therefore, when I<CHECK> is 1, you should trap
722			exceptions with C<eval{}>, unless you really want to let it C<die>.
723
724			=head3 FB_QUIET
725
726			I<CHECK> = Encode::FB_QUIET
727
728			If I<CHECK> is set to C<Encode::FB_QUIET>, encoding and decoding immediately
729			return the portion of the data that has been processed so far when an
730			error occurs. The data argument is overwritten with everything
731			after that point; that is, the unprocessed portion of the data. This is
732			handy when you have to call C<decode> repeatedly in the case where your
733			source data may contain partial multi-byte character sequences,
734			(that is, you are reading with a fixed-width buffer). Here's some sample
735			code to do exactly that:
736
737			my($buffer, $string) = ("", "");
738			while (read($fh, $buffer, 256, length($buffer))) {
739			$string .= decode($encoding, $buffer, Encode::FB_QUIET);
740			# $buffer now contains the unprocessed partial character
741			}
742
743			=head3 FB_WARN
744
745			I<CHECK> = Encode::FB_WARN
746
747			This is the same as C<FB_QUIET> above, except that instead of being silent
748			on errors, it issues a warning. This is handy for when you are debugging.
749
750			=head3 FB_PERLQQ FB_HTMLCREF FB_XMLCREF
751
752			=over 2
753
754			=item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)
755
756			=item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF)
757
758			=item XML charref mode (I<CHECK> = Encode::FB_XMLCREF)
759
760			=back
761
762			For encodings that are implemented by the C<Encode::XS> module, C<CHECK> C<==>
763			C<Encode::FB_PERLQQ> puts C<encode> and C<decode> into C<perlqq> fallback mode.
764
765			When you decode, C<\xI<HH>> is inserted for a malformed character, where
766			I<HH> is the hex representation of the octet that could not be decoded to
767			utf8. When you encode, C<\x{I<HHHH>}> will be inserted, where I<HHHH> is
768			the Unicode code point (in any number of hex digits) of the character that
769			cannot be found in the character repertoire of the encoding.
770
771			The HTML/XML character reference modes are about the same. In place of
772			C<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number, and
773			XML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number.
774
775			In C<Encode> 2.10 or later, C<LEAVE_SRC> is also implied.
776
777			=head3 The bitmask
778
779			These modes are all actually set via a bitmask. Here is how the C<FB_I<XXX>>
780			constants are laid out. You can import the C<FB_I<XXX>> constants via
781			C<use Encode qw(:fallbacks)>, and you can import the generic bitmask
782			constants via C<use Encode qw(:fallback_all)>.
783
784			FB_DEFAULT FB_CROAK FB_QUIET FB_WARN FB_PERLQQ
785			DIE_ON_ERR 0x0001 X
786			WARN_ON_ERR 0x0002 X
787			RETURN_ON_ERR 0x0004 X X
788			LEAVE_SRC 0x0008 X
789			PERLQQ 0x0100 X
790			HTMLCREF 0x0200
791			XMLCREF 0x0400
792
793			=head3 LEAVE_SRC
794
795			Encode::LEAVE_SRC
796
797			If the C<Encode::LEAVE_SRC> bit is I<not> set but I<CHECK> is set, then the
798			source string to encode() or decode() will be overwritten in place.
799			If you're not interested in this, then bitwise-OR it with the bitmask.
800
801			=head2 coderef for CHECK
802
803			As of C<Encode> 2.12, C<CHECK> can also be a code reference which takes the
804			ordinal value of the unmapped character as an argument and returns
805			octets that represent the fallback character. For instance:
806
807			$ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift });
808
809			Acts like C<FB_PERLQQ> but U+I<XXXX> is used instead of C<\x{I<XXXX>}>.
810
811			Even the fallback for C<decode> must return octets, which are
812			then decoded with the character encoding that C<decode> accepts. So for
813			example if you wish to decode octets as UTF-8, and use ISO-8859-15 as
814			a fallback for bytes that are not valid UTF-8, you could write
815
816			$str = decode 'UTF-8', $octets, sub {
817			my $tmp = chr shift;
818			from_to $tmp, 'ISO-8859-15', 'UTF-8';
819			return $tmp;
820			};
821
822			=head1 Defining Encodings
823
824			To define a new encoding, use:
825
826			use Encode qw(define_encoding);
827			define_encoding($object, CANONICAL_NAME [, alias...]);
828
829			I<CANONICAL_NAME> will be associated with I<$object>. The object
830			should provide the interface described in L<Encode::Encoding>.
831			If more than two arguments are provided, additional
832			arguments are considered aliases for I<$object>.
833
834			See L<Encode::Encoding> for details.
835
836			=head1 The UTF8 flag
837
838			Before the introduction of Unicode support in Perl, the C<eq> operator
839			just compared the strings represented by two scalars. Beginning with
840			Perl 5.8, C<eq> compares two strings with simultaneous consideration of
841			I<the UTF8 flag>. To explain why we made it so, I quote from page 402 of
842			I<Programming Perl, 3rd ed.>
843
844			=over 2
845
846			=item Goal #1:
847
848			Old byte-oriented programs should not spontaneously break on the old
849			byte-oriented data they used to work on.
850
851			=item Goal #2:
852
853			Old byte-oriented programs should magically start working on the new
854			character-oriented data when appropriate.
855
856			=item Goal #3:
857
858			Programs should run just as fast in the new character-oriented mode
859			as in the old byte-oriented mode.
860
861			=item Goal #4:
862
863			Perl should remain one language, rather than forking into a
864			byte-oriented Perl and a character-oriented Perl.
865
866			=back
867
868			When I<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0 had been
869			born yet; many features documented in the book remained unimplemented for a
870			long time. Perl 5.8 corrected much of this, and the introduction of the
871			UTF8 flag is one of them. You can think of there being two fundamentally
872			different kinds of strings and string-operations in Perl: one a
873			byte-oriented mode for when the internal UTF8 flag is off, and the other a
874			character-oriented mode for when the internal UTF8 flag is on.
875
876			Here is how C<Encode> handles the UTF8 flag.
877
878			=over 2
879
880			=item *
881
882			When you I<encode>, the resulting UTF8 flag is always B<off>.
883
884			=item *
885
886			When you I<decode>, the resulting UTF8 flag is B<on>--I<unless> you can
887			unambiguously represent data. Here is what we mean by "unambiguously".
888			After C<$utf8 = decode("foo", $octet)>,
889
890			When $octet is... The UTF8 flag in $utf8 is
891			---------------------------------------------
892			In ASCII only (or EBCDIC only) OFF
893			In ISO-8859-1 ON
894			In any other Encoding ON
895			---------------------------------------------
896
897			As you see, there is one exception: in ASCII. That way you can assume
898			Goal #1. And with C<Encode>, Goal #2 is assumed but you still have to be
899			careful in the cases mentioned in the B<CAVEAT> paragraphs above.
900
901			This UTF8 flag is not visible in Perl scripts, exactly for the same reason
902			you cannot (or rather, you I<don't have to>) see whether a scalar contains
903			a string, an integer, or a floating-point number. But you can still peek
904			and poke these if you will. See the next section.
905
906			=back
907
908			=head2 Messing with Perl's Internals
909
910			The following API uses parts of Perl's internals in the current
911			implementation. As such, they are efficient but may change in a future
912			release.
913
914			=head3 is_utf8
915
916			is_utf8(STRING [, CHECK])
917
918			[INTERNAL] Tests whether the UTF8 flag is turned on in the I<STRING>.
919			If I<CHECK> is true, also checks whether I<STRING> contains well-formed
920			UTF-8. Returns true if successful, false otherwise.
921
922			As of Perl 5.8.1, L<utf8> also has the C<utf8::is_utf8> function.
923
924			=head3 _utf8_on
925
926			_utf8_on(STRING)
927
928			[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<on>. The I<STRING>
929			is I<not> checked for containing only well-formed UTF-8. Do not use this
930			unless you I<know with absolute certainty> that the STRING holds only
931			well-formed UTF-8. Returns the previous state of the UTF8 flag (so please
932			don't treat the return value as indicating success or failure), or C<undef>
933			if I<STRING> is not a string.
934
935			B<NOTE>: For security reasons, this function does not work on tainted values.
936
937			=head3 _utf8_off
938
939			_utf8_off(STRING)
940
941			[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<off>. Do not use
942			frivolously. Returns the previous state of the UTF8 flag, or C<undef> if
943			I<STRING> is not a string. Do not treat the return value as indicative of
944			success or failure, because that isn't what it means: it is only the
945			previous setting.
946
947			B<NOTE>: For security reasons, this function does not work on tainted values.
948
949			=head1 UTF-8 vs. utf8 vs. UTF8
950
951			....We now view strings not as sequences of bytes, but as sequences
952			of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit
953			computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed.
954
955			That has historically been Perl's notion of UTF-8, as that is how UTF-8 was
956			first conceived by Ken Thompson when he invented it. However, thanks to
957			later revisions to the applicable standards, official UTF-8 is now rather
958			stricter than that. For example, its range is much narrower (0 .. 0x10_FFFF
959			to cover only 21 bits instead of 32 or 64 bits) and some sequences
960			are not allowed, like those used in surrogate pairs, the 32 non-character
961			code points 0xFDD0 .. 0xFDEF, the last two code points in I<any> plane
962			(0xI<XX>_FFFE and 0xI<XX>_FFFF), all non-shortest encodings, etc.
963
964			The former default in which Perl would always use a loose interpretation of
965			UTF-8 has now been overruled:
966
967			From: Larry Wall <larry@wall.org>
968			Date: December 04, 2004 11:51:58 JST
969			To: perl-unicode@perl.org
970			Subject: Re: Make Encode.pm support the real UTF-8
971			Message-Id: <20041204025158.GA28754@wall.org>
972
973			On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote:
974			: I've no problem with 'utf8' being perl's unrestricted uft8 encoding,
975			: but "UTF-8" is the name of the standard and should give the
976			: corresponding behaviour.
977
978			For what it's worth, that's how I've always kept them straight in my
979			head.
980
981			Also for what it's worth, Perl 6 will mostly default to strict but
982			make it easy to switch back to lax.
983
984			Larry
985
986			Got that? As of Perl 5.8.7, B<"UTF-8"> means UTF-8 in its current
987			sense, which is conservative and strict and security-conscious, whereas
988			B<"utf8"> means UTF-8 in its former sense, which was liberal and loose and
989			lax. C<Encode> version 2.10 or later thus groks this subtle but critically
990			important distinction between C<"UTF-8"> and C<"utf8">.
991
992			encode("utf8", "\x{FFFF_FFFF}", 1); # okay
993			encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks
994
995			In the C<Encode> module, C<"UTF-8"> is actually an alias for the canonical
996			name C<"utf-8-strict">. That hyphen between the C<"UTF"> and the C<"8"> is
997			critical; without it, C<Encode> goes "liberal" and (perhaps overly-)permissive:
998
999			find_encoding("UTF-8")->name # is 'utf-8-strict'
1000			find_encoding("utf-8")->name # ditto. names are case insensitive
1001			find_encoding("utf_8")->name # ditto. "_" are treated as "-"
1002			find_encoding("UTF8")->name # is 'utf8'.
1003
1004			Perl's internal UTF8 flag is called "UTF8", without a hyphen. It indicates
1005			whether a string is internally encoded as "utf8", also without a hyphen.
1006
1007			=head1 SEE ALSO
1008
1009			L<Encode::Encoding>,
1010			L<Encode::Supported>,
1011			L<Encode::PerlIO>,
1012			L<encoding>,
1013			L<perlebcdic>,
1014			L<perlfunc/open>,
1015			L<perlunicode>, L<perluniintro>, L<perlunifaq>, L<perlunitut>,
1016			L<utf8>,
1017			the Perl Unicode Mailing List L<http://lists.perl.org/list/perl-unicode.html>
1018
1019			=head1 MAINTAINER
1020
1021			This project was originated by the late Nick Ing-Simmons and later
1022			maintained by Dan Kogai I<< <dankogai@cpan.org> >>. See AUTHORS
1023			for a full list of people involved. For any questions, send mail to
1024			I<< <perl-unicode@perl.org> >> so that we can all share.
1025
1026			While Dan Kogai retains the copyright as a maintainer, credit
1027			should go to all those involved. See AUTHORS for a list of those
1028			who submitted code to the project.
1029
1030			=head1 COPYRIGHT
1031
1032			Copyright 2002-2013 Dan Kogai I<< <dankogai@cpan.org> >>.
1033
1034			This library is free software; you can redistribute it and/or modify
1035			it under the same terms as Perl itself.
1036
1037			=cut