← Index
NYTProf Performance Profile   « line view »
For starman worker -M FindBin --max-requests 50 --workers 2 --user=kohadev-koha --group kohadev-koha --pid /var/run/koha/kohadev/plack.pid --daemonize --access-log /var/log/koha/kohadev/plack.log --error-log /var/log/koha/kohadev/plack-error.log -E deployment --socket /var/run/koha/kohadev/plack.sock /etc/koha/sites/kohadev/plack.psgi
  Run on Fri Jan 8 14:31:06 2016
Reported on Fri Jan 8 14:31:39 2016

Filename: /usr/lib/x86_64-linux-gnu/perl/5.20/Encode.pm
Statements: Executed 0 statements in 0s
Line | Statements | Time on line | Calls | Time in subs | Code
1#
2# $Id: Encode.pm,v 2.60 2014/04/29 16:26:49 dankogai Exp dankogai $
3#
4package Encode;
5use strict;
6use warnings;
7our $VERSION = sprintf "%d.%02d", q$Revision: 2.60_01 $ =~ /(\d+)/g;
8use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
9use XSLoader ();
10XSLoader::load( __PACKAGE__, $VERSION );
11
12use Exporter 5.57 'import';
13
14# Public, encouraged API is exported by default
15
16our @EXPORT = qw(
17 decode decode_utf8 encode encode_utf8 str2bytes bytes2str
18 encodings find_encoding clone_encoding
19);
20our @FB_FLAGS = qw(
21 DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC
22 PERLQQ HTMLCREF XMLCREF STOP_AT_PARTIAL
23);
24our @FB_CONSTS = qw(
25 FB_DEFAULT FB_CROAK FB_QUIET FB_WARN
26 FB_PERLQQ FB_HTMLCREF FB_XMLCREF
27);
28our @EXPORT_OK = (
29 qw(
30 _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit
31 is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade
32 ),
33 @FB_FLAGS, @FB_CONSTS,
34);
35
36our %EXPORT_TAGS = (
37 all => [ @EXPORT, @EXPORT_OK ],
38 default => [ @EXPORT ],
39 fallbacks => [ @FB_CONSTS ],
40 fallback_all => [ @FB_CONSTS, @FB_FLAGS ],
41);
42
43# Documentation moved after __END__ for speed - NI-S
44
45our $ON_EBCDIC = ( ord("A") == 193 );
46
47use Encode::Alias;
48
49# Make a %Encoding package variable to allow a certain amount of cheating
50our %Encoding;
51our %ExtModule;
52require Encode::Config;
53# See
54# https://bugzilla.redhat.com/show_bug.cgi?id=435505#c2
55# to find why sig handlers inside eval{} are disabled.
56eval {
57 local $SIG{__DIE__};
58 local $SIG{__WARN__};
59 require Encode::ConfigLocal;
60};
61
62sub encodings {
63 my %enc;
64 my $arg = $_[1] || '';
65 if ( $arg eq ":all" ) {
66 %enc = ( %Encoding, %ExtModule );
67 }
68 else {
69 %enc = %Encoding;
70 for my $mod ( map { m/::/ ? $_ : "Encode::$_" } @_ ) {
71 DEBUG and warn $mod;
72 for my $enc ( keys %ExtModule ) {
73 $ExtModule{$enc} eq $mod and $enc{$enc} = $mod;
74 }
75 }
76 }
77 return sort { lc $a cmp lc $b }
78 grep { !/^(?:Internal|Unicode|Guess)$/o } keys %enc;
79}
80
81sub perlio_ok {
82 my $obj = ref( $_[0] ) ? $_[0] : find_encoding( $_[0] );
83 $obj->can("perlio_ok") and return $obj->perlio_ok();
84 return 0; # safety net
85}
86
87sub define_encoding {
88 my $obj = shift;
89 my $name = shift;
90 $Encoding{$name} = $obj;
91 my $lc = lc($name);
92 define_alias( $lc => $obj ) unless $lc eq $name;
93 while (@_) {
94 my $alias = shift;
95 define_alias( $alias, $obj );
96 }
97 return $obj;
98}
99
100sub getEncoding {
101 my ( $class, $name, $skip_external ) = @_;
102
103 $name =~ s/\s+//g; # https://rt.cpan.org/Ticket/Display.html?id=65796
104
105 ref($name) && $name->can('renew') and return $name;
106 exists $Encoding{$name} and return $Encoding{$name};
107 my $lc = lc $name;
108 exists $Encoding{$lc} and return $Encoding{$lc};
109
110 my $oc = $class->find_alias($name);
111 defined($oc) and return $oc;
112 $lc ne $name and $oc = $class->find_alias($lc);
113 defined($oc) and return $oc;
114
115 unless ($skip_external) {
116 if ( my $mod = $ExtModule{$name} || $ExtModule{$lc} ) {
117 $mod =~ s,::,/,g;
118 $mod .= '.pm';
119 eval { require $mod; };
120 exists $Encoding{$name} and return $Encoding{$name};
121 }
122 }
123 return;
124}
125
126sub find_encoding($;$) {
127 my ( $name, $skip_external ) = @_;
128 return __PACKAGE__->getEncoding( $name, $skip_external );
129}
130
131sub resolve_alias($) {
132 my $obj = find_encoding(shift);
133 defined $obj and return $obj->name;
134 return;
135}
136
137sub clone_encoding($) {
138 my $obj = find_encoding(shift);
139 ref $obj or return;
140 eval { require Storable };
141 $@ and return;
142 return Storable::dclone($obj);
143}
144
145sub encode($$;$) {
146 my ( $name, $string, $check ) = @_;
147 return undef unless defined $string;
148 $string .= ''; # stringify;
149 $check ||= 0;
150 unless ( defined $name ) {
151 require Carp;
152 Carp::croak("Encoding name should not be undef");
153 }
154 my $enc = find_encoding($name);
155 unless ( defined $enc ) {
156 require Carp;
157 Carp::croak("Unknown encoding '$name'");
158 }
159 my $octets = $enc->encode( $string, $check );
160 $_[1] = $string if $check and !ref $check and !( $check & LEAVE_SRC() );
161 return $octets;
162}
163*str2bytes = \&encode;
164
165sub decode($$;$) {
166 my ( $name, $octets, $check ) = @_;
167 return undef unless defined $octets;
168 $octets .= '';
169 $check ||= 0;
170 my $enc = find_encoding($name);
171 unless ( defined $enc ) {
172 require Carp;
173 Carp::croak("Unknown encoding '$name'");
174 }
1752046µs my $string = $enc->decode( $octets, $check );
# spent 46µs making 20 calls to Encode::Encoding::renewed, avg 2µs/call
176 $_[1] = $octets if $check and !ref $check and !( $check & LEAVE_SRC() );
177 return $string;
178}
179*bytes2str = \&decode;
180
181sub from_to($$$;$) {
182 my ( $string, $from, $to, $check ) = @_;
183 return undef unless defined $string;
184 $check ||= 0;
185 my $f = find_encoding($from);
186 unless ( defined $f ) {
187 require Carp;
188 Carp::croak("Unknown encoding '$from'");
189 }
190 my $t = find_encoding($to);
191 unless ( defined $t ) {
192 require Carp;
193 Carp::croak("Unknown encoding '$to'");
194 }
195 my $uni = $f->decode($string);
196 $_[0] = $string = $t->encode( $uni, $check );
197 return undef if ( $check && length($uni) );
198 return defined( $_[0] ) ? length($string) : undef;
199}
200
201sub encode_utf8($) {
202 my ($str) = @_;
203 utf8::encode($str);
204 return $str;
205}
206
207my $utf8enc;
208
209sub decode_utf8($;$) {
210 my ( $octets, $check ) = @_;
211 return undef unless defined $octets;
212 $octets .= '';
213 $check ||= 0;
214 $utf8enc ||= find_encoding('utf8');
215 my $string = $utf8enc->decode( $octets, $check );
216 $_[0] = $octets if $check and !ref $check and !( $check & LEAVE_SRC() );
217 return $string;
218}
219
220# sub decode_utf8($;$) {
221# my ( $str, $check ) = @_;
222# return $str if is_utf8($str);
223# if ($check) {
224# return decode( "utf8", $str, $check );
225# }
226# else {
227# return decode( "utf8", $str );
228# return $str;
229# }
230# }
231
232predefine_encodings(1);
233
234#
235# This is to restore %Encoding if really needed;
236#
237
238sub predefine_encodings {
239 require Encode::Encoding;
240 no warnings 'redefine';
241 my $use_xs = shift;
242 if ($ON_EBCDIC) {
243
244 # was in Encode::UTF_EBCDIC
245 package Encode::UTF_EBCDIC;
246 push @Encode::UTF_EBCDIC::ISA, 'Encode::Encoding';
247 *decode = sub {
248 my ( undef, $str, $chk ) = @_;
249 my $res = '';
250 for ( my $i = 0 ; $i < length($str) ; $i++ ) {
251 $res .=
252 chr(
253 utf8::unicode_to_native( ord( substr( $str, $i, 1 ) ) )
254 );
255 }
256 $_[1] = '' if $chk;
257 return $res;
258 };
259 *encode = sub {
260 my ( undef, $str, $chk ) = @_;
261 my $res = '';
262 for ( my $i = 0 ; $i < length($str) ; $i++ ) {
263 $res .=
264 chr(
265 utf8::native_to_unicode( ord( substr( $str, $i, 1 ) ) )
266 );
267 }
268 $_[1] = '' if $chk;
269 return $res;
270 };
271 $Encode::Encoding{Unicode} =
272 bless { Name => "UTF_EBCDIC" } => "Encode::UTF_EBCDIC";
273 }
274 else {
275
276 package Encode::Internal;
277 push @Encode::Internal::ISA, 'Encode::Encoding';
278 *decode = sub {
279 my ( undef, $str, $chk ) = @_;
280 utf8::upgrade($str);
281 $_[1] = '' if $chk;
282 return $str;
283 };
284 *encode = \&decode;
285 $Encode::Encoding{Unicode} =
286 bless { Name => "Internal" } => "Encode::Internal";
287 }
288
289 {
290
291 # was in Encode::utf8
292 package Encode::utf8;
293 push @Encode::utf8::ISA, 'Encode::Encoding';
294
295 #
296 if ($use_xs) {
297 Encode::DEBUG and warn __PACKAGE__, " XS on";
298 *decode = \&decode_xs;
299 *encode = \&encode_xs;
300 }
301 else {
302 Encode::DEBUG and warn __PACKAGE__, " XS off";
303 *decode = sub {
304 my ( undef, $octets, $chk ) = @_;
305 my $str = Encode::decode_utf8($octets);
306 if ( defined $str ) {
307 $_[1] = '' if $chk;
308 return $str;
309 }
310 return undef;
311 };
312 *encode = sub {
313 my ( undef, $string, $chk ) = @_;
314 my $octets = Encode::encode_utf8($string);
315 $_[1] = '' if $chk;
316 return $octets;
317 };
318 }
319 *cat_decode = sub { # ($obj, $dst, $src, $pos, $trm, $chk)
320 # currently ignores $chk
321 my ( undef, undef, undef, $pos, $trm ) = @_;
322 my ( $rdst, $rsrc, $rpos ) = \@_[ 1, 2, 3 ];
323 use bytes;
324 if ( ( my $npos = index( $$rsrc, $trm, $pos ) ) >= 0 ) {
325 $$rdst .=
326 substr( $$rsrc, $pos, $npos - $pos + length($trm) );
327 $$rpos = $npos + length($trm);
328 return 1;
329 }
330 $$rdst .= substr( $$rsrc, $pos );
331 $$rpos = length($$rsrc);
332 return '';
333 };
334 $Encode::Encoding{utf8} =
335 bless { Name => "utf8" } => "Encode::utf8";
336 $Encode::Encoding{"utf-8-strict"} =
337 bless { Name => "utf-8-strict", strict_utf8 => 1 }
338 => "Encode::utf8";
339 }
340}
341
3421;
343
344__END__
345
346=head1 NAME
347
348Encode - character encodings in Perl
349
350=head1 SYNOPSIS
351
352 use Encode qw(decode encode);
353 $characters = decode('UTF-8', $octets, Encode::FB_CROAK);
354 $octets = encode('UTF-8', $characters, Encode::FB_CROAK);
355
356=head2 Table of Contents
357
358Encode consists of a collection of modules whose details are too extensive
359to fit in one document. This one itself explains the top-level APIs
360and general topics at a glance. For other topics and more details,
361see the documentation for these modules:
362
363=over 2
364
365=item L<Encode::Alias> - Alias definitions to encodings
366
367=item L<Encode::Encoding> - Encode Implementation Base Class
368
369=item L<Encode::Supported> - List of Supported Encodings
370
371=item L<Encode::CN> - Simplified Chinese Encodings
372
373=item L<Encode::JP> - Japanese Encodings
374
375=item L<Encode::KR> - Korean Encodings
376
377=item L<Encode::TW> - Traditional Chinese Encodings
378
379=back
380
381=head1 DESCRIPTION
382
383The C<Encode> module provides the interface between Perl strings
384and the rest of the system. Perl strings are sequences of
385I<characters>.
386
387The repertoire of characters that Perl can represent is a superset of those
388defined by the Unicode Consortium. On most platforms the ordinal
389values of a character as returned by C<ord(I<S>)> is the I<Unicode
390codepoint> for that character. The exceptions are platforms where
391the legacy encoding is some variant of EBCDIC rather than a superset
392of ASCII; see L<perlebcdic>.
393
394During recent history, data is moved around a computer in 8-bit chunks,
395often called "bytes" but also known as "octets" in standards documents.
396Perl is widely used to manipulate data of many types: not only strings of
397characters representing human or computer languages, but also "binary"
398data, being the machine's representation of numbers, pixels in an image, or
399just about anything.
400
401When Perl is processing "binary data", the programmer wants Perl to
402process "sequences of bytes". This is not a problem for Perl: because a
403byte has 256 possible values, it easily fits in Perl's much larger
404"logical character".
405
406This document mostly explains the I<how>. L<perlunitut> and L<perlunifaq>
407explain the I<why>.
408
409=head2 TERMINOLOGY
410
411=head3 character
412
413A character in the range 0 .. 2**32-1 (or more);
414what Perl's strings are made of.
415
416=head3 byte
417
418A character in the range 0..255;
419a special case of a Perl character.
420
421=head3 octet
422
4238 bits of data, with ordinal values 0..255;
424term for bytes passed to or from a non-Perl context, such as a disk file,
425standard I/O stream, database, command-line argument, environment variable,
426socket etc.
427
428=head1 THE PERL ENCODING API
429
430=head2 Basic methods
431
432=head3 encode
433
434 $octets = encode(ENCODING, STRING[, CHECK])
435
436Encodes the scalar value I<STRING> from Perl's internal form into
437I<ENCODING> and returns a sequence of octets. I<ENCODING> can be either a
438canonical name or an alias. For encoding names and aliases, see
439L</"Defining Aliases">. For CHECK, see L</"Handling Malformed Data">.
440
441For example, to convert a string from Perl's internal format into
442ISO-8859-1, also known as Latin1:
443
444 $octets = encode("iso-8859-1", $string);
445
446B<CAVEAT>: When you run C<$octets = encode("utf8", $string)>, then
447$octets I<might not be equal to> $string. Though both contain the
448same data, the UTF8 flag for $octets is I<always> off. When you
449encode anything, the UTF8 flag on the result is always off, even when it
450contains a completely valid utf8 string. See L</"The UTF8 flag"> below.
451
452If the $string is C<undef>, then C<undef> is returned.
453
454=head3 decode
455
456 $string = decode(ENCODING, OCTETS[, CHECK])
457
458This function returns the string that results from decoding the scalar
459value I<OCTETS>, assumed to be a sequence of octets in I<ENCODING>, into
460Perl's internal form. As with encode(),
461I<ENCODING> can be either a canonical name or an alias. For encoding names
462and aliases, see L</"Defining Aliases">; for I<CHECK>, see L</"Handling
463Malformed Data">.
464
465For example, to convert ISO-8859-1 data into a string in Perl's
466internal format:
467
468 $string = decode("iso-8859-1", $octets);
469
470B<CAVEAT>: When you run C<$string = decode("utf8", $octets)>, then $string
471I<might not be equal to> $octets. Though both contain the same data, the
472UTF8 flag for $string is on. See L</"The UTF8 flag">
473below.
474
475If $octets is C<undef>, then C<undef> is returned.
476
477=head3 find_encoding
478
479 [$obj =] find_encoding(ENCODING)
480
481Returns the I<encoding object> corresponding to I<ENCODING>. Returns
482C<undef> if no matching I<ENCODING> is found. The returned object is
483what does the actual encoding or decoding.
484
485 $utf8 = decode($name, $bytes);
486
487is in fact
488
489 $utf8 = do {
490 $obj = find_encoding($name);
491 croak qq(encoding "$name" not found) unless ref $obj;
492 $obj->decode($bytes);
493 };
494
495with more error checking.
496
497You can therefore save time by reusing this object as follows:
498
499 my $enc = find_encoding("iso-8859-1");
500 while(<>) {
501 my $utf8 = $enc->decode($_);
502 ... # now do something with $utf8;
503 }
504
505Besides L</decode> and L</encode>, other methods are
506available as well. For instance, C<name()> returns the canonical
507name of the encoding object.
508
509 find_encoding("latin1")->name; # iso-8859-1
510
511See L<Encode::Encoding> for details.
512
513=head3 from_to
514
515 [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
516
517Converts I<in-place> data between two encodings. The data in $octets
518must be encoded as octets and I<not> as characters in Perl's internal
519format. For example, to convert ISO-8859-1 data into Microsoft's CP1250
520encoding:
521
522 from_to($octets, "iso-8859-1", "cp1250");
523
524and to convert it back:
525
526 from_to($octets, "cp1250", "iso-8859-1");
527
528Because the conversion happens in place, the data to be
529converted cannot be a string constant: it must be a scalar variable.
530
531C<from_to()> returns the length of the converted string in octets on success,
532and C<undef> on error.
533
534B<CAVEAT>: The following operations may look the same, but are not:
535
536 from_to($data, "iso-8859-1", "utf8"); #1
537 $data = decode("iso-8859-1", $data); #2
538
539Both #1 and #2 make $data consist of a completely valid UTF-8 string,
540but only #2 turns the UTF8 flag on. #1 is equivalent to:
541
542 $data = encode("utf8", decode("iso-8859-1", $data));
543
544See L</"The UTF8 flag"> below.
545
546Also note that:
547
548 from_to($octets, $from, $to, $check);
549
550is equivalent to:
551
552 $octets = encode($to, decode($from, $octets), $check);
553
554Yes, it does I<not> respect the $check during decoding. It is
555deliberately done that way. If you need minute control, use C<decode>
556followed by C<encode> as follows:
557
558 $octets = encode($to, decode($from, $octets, $check_from), $check_to);
559
560=head3 encode_utf8
561
562 $octets = encode_utf8($string);
563
564Equivalent to C<$octets = encode("utf8", $string)>. The characters in
565$string are encoded in Perl's internal format, and the result is returned
566as a sequence of octets. Because all possible characters in Perl have a
567(loose, not strict) UTF-8 representation, this function cannot fail.
568
569=head3 decode_utf8
570
571 $string = decode_utf8($octets [, CHECK]);
572
573Equivalent to C<$string = decode("utf8", $octets [, CHECK])>.
574The sequence of octets represented by $octets is decoded
575from UTF-8 into a sequence of logical characters.
576Because not all sequences of octets are valid UTF-8,
577it is quite possible for this function to fail.
578For CHECK, see L</"Handling Malformed Data">.
579
580=head2 Listing available encodings
581
582 use Encode;
583 @list = Encode->encodings();
584
585Returns a list of canonical names of available encodings that have already
586been loaded. To get a list of all available encodings including those that
587have not yet been loaded, say:
588
589 @all_encodings = Encode->encodings(":all");
590
591Or you can give the name of a specific module:
592
593 @with_jp = Encode->encodings("Encode::JP");
594
595When "C<::>" is not in the name, "C<Encode::>" is assumed.
596
597 @ebcdic = Encode->encodings("EBCDIC");
598
599To find out in detail which encodings are supported by this package,
600see L<Encode::Supported>.
601
602=head2 Defining Aliases
603
604To add a new alias to a given encoding, use:
605
606 use Encode;
607 use Encode::Alias;
608 define_alias(NEWNAME => ENCODING);
609
610After that, I<NEWNAME> can be used as an alias for I<ENCODING>.
611I<ENCODING> may be either the name of an encoding or an
612I<encoding object>.
613
614Before you do that, first make sure the alias is nonexistent using
615C<resolve_alias()>, which returns the canonical name thereof.
616For example:
617
618 Encode::resolve_alias("latin1") eq "iso-8859-1" # true
619 Encode::resolve_alias("iso-8859-12") # false; nonexistent
620 Encode::resolve_alias($name) eq $name # true if $name is canonical
621
622C<resolve_alias()> does not need C<use Encode::Alias>; it can be
623imported via C<use Encode qw(resolve_alias)>.
624
625See L<Encode::Alias> for details.
626
627=head2 Finding IANA Character Set Registry names
628
629The canonical name of a given encoding does not necessarily agree with
630IANA Character Set Registry, commonly seen as C<< Content-Type:
631text/plain; charset=I<WHATEVER> >>. For most cases, the canonical name
632works, but sometimes it does not, most notably with "utf-8-strict".
633
634As of C<Encode> version 2.21, a new method C<mime_name()> is therefore added.
635
636 use Encode;
637 my $enc = find_encoding("UTF-8");
638 warn $enc->name; # utf-8-strict
639 warn $enc->mime_name; # UTF-8
640
641See also: L<Encode::Encoding>
642
643=head1 Encoding via PerlIO
644
645If your perl supports C<PerlIO> (which is the default), you can use a
646C<PerlIO> layer to decode and encode directly via a filehandle. The
647following two examples are fully identical in functionality:
648
649 ### Version 1 via PerlIO
650 open(INPUT, "< :encoding(shiftjis)", $infile)
651 || die "Can't open < $infile for reading: $!";
652 open(OUTPUT, "> :encoding(euc-jp)", $outfile)
653 || die "Can't open > $outfile for writing: $!";
654 while (<INPUT>) { # auto decodes $_
655 print OUTPUT; # auto encodes $_
656 }
657 close(INPUT) || die "can't close $infile: $!";
658 close(OUTPUT) || die "can't close $outfile: $!";
659
660 ### Version 2 via from_to()
661 open(INPUT, "< :raw", $infile)
662 || die "Can't open < $infile for reading: $!";
663 open(OUTPUT, "> :raw", $outfile)
664 || die "Can't open > $outfile for writing: $!";
665
666 while (<INPUT>) {
667 from_to($_, "shiftjis", "euc-jp", 1); # switch encoding
668 print OUTPUT; # emit raw (but properly encoded) data
669 }
670 close(INPUT) || die "can't close $infile: $!";
671 close(OUTPUT) || die "can't close $outfile: $!";
672
673In the first version above, you let the appropriate encoding layer
674handle the conversion. In the second, you explicitly translate
675from one encoding to the other.
676
677Unfortunately, it may be that encodings are not C<PerlIO>-savvy. You can check
678to see whether your encoding is supported by C<PerlIO> by invoking the
679C<perlio_ok> method on it:
680
681 Encode::perlio_ok("hz"); # false
682 find_encoding("euc-cn")->perlio_ok; # true wherever PerlIO is available
683
684 use Encode qw(perlio_ok); # imported upon request
685 perlio_ok("euc-jp")
686
687Fortunately, all encodings that come with C<Encode> core are C<PerlIO>-savvy
688except for C<hz> and C<ISO-2022-kr>. For the gory details, see
689L<Encode::Encoding> and L<Encode::PerlIO>.
690
691=head1 Handling Malformed Data
692
693The optional I<CHECK> argument tells C<Encode> what to do when
694encountering malformed data. Without I<CHECK>, C<Encode::FB_DEFAULT>
695(== 0) is assumed.
696
697As of version 2.12, C<Encode> supports coderef values for C<CHECK>;
698see below.
699
700B<NOTE:> Not all encodings support this feature.
701Some encodings ignore the I<CHECK> argument. For example,
702L<Encode::Unicode> ignores I<CHECK> and it always croaks on error.
703
704=head2 List of I<CHECK> values
705
706=head3 FB_DEFAULT
707
708 I<CHECK> = Encode::FB_DEFAULT ( == 0)
709
710If I<CHECK> is 0, encoding and decoding replace any malformed character
711with a I<substitution character>. When you encode, I<SUBCHAR> is used.
712When you decode, the Unicode REPLACEMENT CHARACTER, code point U+FFFD, is
713used. If the data is supposed to be UTF-8, an optional lexical warning of
714warning category C<"utf8"> is given.
715
716=head3 FB_CROAK
717
718 I<CHECK> = Encode::FB_CROAK ( == 1)
719
720If I<CHECK> is 1, methods immediately die with an error
721message. Therefore, when I<CHECK> is 1, you should trap
722exceptions with C<eval{}>, unless you really want to let it C<die>.
723
724=head3 FB_QUIET
725
726 I<CHECK> = Encode::FB_QUIET
727
728If I<CHECK> is set to C<Encode::FB_QUIET>, encoding and decoding immediately
729return the portion of the data that has been processed so far when an
730error occurs. The data argument is overwritten with everything
731after that point; that is, the unprocessed portion of the data. This is
732handy when you have to call C<decode> repeatedly in the case where your
733source data may contain partial multi-byte character sequences,
734(that is, you are reading with a fixed-width buffer). Here's some sample
735code to do exactly that:
736
737 my($buffer, $string) = ("", "");
738 while (read($fh, $buffer, 256, length($buffer))) {
739 $string .= decode($encoding, $buffer, Encode::FB_QUIET);
740 # $buffer now contains the unprocessed partial character
741 }
742
743=head3 FB_WARN
744
745 I<CHECK> = Encode::FB_WARN
746
747This is the same as C<FB_QUIET> above, except that instead of being silent
748on errors, it issues a warning. This is handy for when you are debugging.
749
750=head3 FB_PERLQQ FB_HTMLCREF FB_XMLCREF
751
752=over 2
753
754=item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)
755
756=item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF)
757
758=item XML charref mode (I<CHECK> = Encode::FB_XMLCREF)
759
760=back
761
762For encodings that are implemented by the C<Encode::XS> module, C<CHECK> C<==>
763C<Encode::FB_PERLQQ> puts C<encode> and C<decode> into C<perlqq> fallback mode.
764
765When you decode, C<\xI<HH>> is inserted for a malformed character, where
766I<HH> is the hex representation of the octet that could not be decoded to
767utf8. When you encode, C<\x{I<HHHH>}> will be inserted, where I<HHHH> is
768the Unicode code point (in any number of hex digits) of the character that
769cannot be found in the character repertoire of the encoding.
770
771The HTML/XML character reference modes are about the same. In place of
772C<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number, and
773XML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number.
774
775In C<Encode> 2.10 or later, C<LEAVE_SRC> is also implied.
776
777=head3 The bitmask
778
779These modes are all actually set via a bitmask. Here is how the C<FB_I<XXX>>
780constants are laid out. You can import the C<FB_I<XXX>> constants via
781C<use Encode qw(:fallbacks)>, and you can import the generic bitmask
782constants via C<use Encode qw(:fallback_all)>.
783
784 FB_DEFAULT FB_CROAK FB_QUIET FB_WARN FB_PERLQQ
785 DIE_ON_ERR 0x0001 X
786 WARN_ON_ERR 0x0002 X
787 RETURN_ON_ERR 0x0004 X X
788 LEAVE_SRC 0x0008 X
789 PERLQQ 0x0100 X
790 HTMLCREF 0x0200
791 XMLCREF 0x0400
792
793=head3 LEAVE_SRC
794
795 Encode::LEAVE_SRC
796
797If the C<Encode::LEAVE_SRC> bit is I<not> set but I<CHECK> is set, then the
798source string to encode() or decode() will be overwritten in place.
799If you do not want the source overwritten, bitwise-OR C<Encode::LEAVE_SRC> into I<CHECK>.
800
801=head2 coderef for CHECK
802
803As of C<Encode> 2.12, C<CHECK> can also be a code reference which takes the
804ordinal value of the unmapped character as an argument and returns
805octets that represent the fallback character. For instance:
806
807 $ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift });
808
809Acts like C<FB_PERLQQ> but U+I<XXXX> is used instead of C<\x{I<XXXX>}>.
810
811Even the fallback for C<decode> must return octets, which are
812then decoded with the character encoding that C<decode> accepts. So for
813example if you wish to decode octets as UTF-8, and use ISO-8859-15 as
814a fallback for bytes that are not valid UTF-8, you could write
815
816 $str = decode 'UTF-8', $octets, sub {
817 my $tmp = chr shift;
818 from_to $tmp, 'ISO-8859-15', 'UTF-8';
819 return $tmp;
820 };
821
822=head1 Defining Encodings
823
824To define a new encoding, use:
825
826 use Encode qw(define_encoding);
827 define_encoding($object, CANONICAL_NAME [, alias...]);
828
829I<CANONICAL_NAME> will be associated with I<$object>. The object
830should provide the interface described in L<Encode::Encoding>.
831If more than two arguments are provided, additional
832arguments are considered aliases for I<$object>.
833
834See L<Encode::Encoding> for details.
835
836=head1 The UTF8 flag
837
838Before the introduction of Unicode support in Perl, the C<eq> operator
839just compared the strings represented by two scalars. Beginning with
840Perl 5.8, C<eq> compares two strings with simultaneous consideration of
841I<the UTF8 flag>. To explain why we made it so, I quote from page 402 of
842I<Programming Perl, 3rd ed.>
843
844=over 2
845
846=item Goal #1:
847
848Old byte-oriented programs should not spontaneously break on the old
849byte-oriented data they used to work on.
850
851=item Goal #2:
852
853Old byte-oriented programs should magically start working on the new
854character-oriented data when appropriate.
855
856=item Goal #3:
857
858Programs should run just as fast in the new character-oriented mode
859as in the old byte-oriented mode.
860
861=item Goal #4:
862
863Perl should remain one language, rather than forking into a
864byte-oriented Perl and a character-oriented Perl.
865
866=back
867
868When I<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0 had been
869born yet; many features documented in the book remained unimplemented for a
870long time. Perl 5.8 corrected much of this, and the introduction of the
871UTF8 flag is one of them. You can think of there being two fundamentally
872different kinds of strings and string-operations in Perl: one a
873byte-oriented mode for when the internal UTF8 flag is off, and the other a
874character-oriented mode for when the internal UTF8 flag is on.
875
876Here is how C<Encode> handles the UTF8 flag.
877
878=over 2
879
880=item *
881
882When you I<encode>, the resulting UTF8 flag is always B<off>.
883
884=item *
885
886When you I<decode>, the resulting UTF8 flag is B<on>--I<unless> you can
887unambiguously represent data. Here is what we mean by "unambiguously".
888After C<$utf8 = decode("foo", $octet)>,
889
890 When $octet is... The UTF8 flag in $utf8 is
891 ---------------------------------------------
892 In ASCII only (or EBCDIC only) OFF
893 In ISO-8859-1 ON
894 In any other Encoding ON
895 ---------------------------------------------
896
897As you see, there is one exception: in ASCII. That way you can assume
898Goal #1. And with C<Encode>, Goal #2 is assumed but you still have to be
899careful in the cases mentioned in the B<CAVEAT> paragraphs above.
900
901This UTF8 flag is not visible in Perl scripts, exactly for the same reason
902you cannot (or rather, you I<don't have to>) see whether a scalar contains
903a string, an integer, or a floating-point number. But you can still peek
904and poke these if you will. See the next section.
905
906=back
907
908=head2 Messing with Perl's Internals
909
910The following API uses parts of Perl's internals in the current
911implementation. As such, they are efficient but may change in a future
912release.
913
914=head3 is_utf8
915
916 is_utf8(STRING [, CHECK])
917
918[INTERNAL] Tests whether the UTF8 flag is turned on in the I<STRING>.
919If I<CHECK> is true, also checks whether I<STRING> contains well-formed
920UTF-8. Returns true if successful, false otherwise.
921
922As of Perl 5.8.1, L<utf8> also has the C<utf8::is_utf8> function.
923
924=head3 _utf8_on
925
926 _utf8_on(STRING)
927
928[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<on>. The I<STRING>
929is I<not> checked for containing only well-formed UTF-8. Do not use this
930unless you I<know with absolute certainty> that the STRING holds only
931well-formed UTF-8. Returns the previous state of the UTF8 flag (so please
932don't treat the return value as indicating success or failure), or C<undef>
933if I<STRING> is not a string.
934
935B<NOTE>: For security reasons, this function does not work on tainted values.
936
937=head3 _utf8_off
938
939 _utf8_off(STRING)
940
941[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<off>. Do not use
942frivolously. Returns the previous state of the UTF8 flag, or C<undef> if
943I<STRING> is not a string. Do not treat the return value as indicative of
944success or failure, because that isn't what it means: it is only the
945previous setting.
946
947B<NOTE>: For security reasons, this function does not work on tainted values.
948
949=head1 UTF-8 vs. utf8 vs. UTF8
950
951 ....We now view strings not as sequences of bytes, but as sequences
952 of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit
953 computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed.
954
955That has historically been Perl's notion of UTF-8, as that is how UTF-8 was
956first conceived by Ken Thompson when he invented it. However, thanks to
957later revisions to the applicable standards, official UTF-8 is now rather
958stricter than that. For example, its range is much narrower (0 .. 0x10_FFFF
959to cover only 21 bits instead of 32 or 64 bits) and some sequences
960are not allowed, like those used in surrogate pairs, the 32 non-character
961code points 0xFDD0 .. 0xFDEF, the last two code points in I<any> plane
962(0xI<XX>_FFFE and 0xI<XX>_FFFF), all non-shortest encodings, etc.
963
964The former default in which Perl would always use a loose interpretation of
965UTF-8 has now been overruled:
966
967 From: Larry Wall <larry@wall.org>
968 Date: December 04, 2004 11:51:58 JST
969 To: perl-unicode@perl.org
970 Subject: Re: Make Encode.pm support the real UTF-8
971 Message-Id: <20041204025158.GA28754@wall.org>
972
973 On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote:
974 : I've no problem with 'utf8' being perl's unrestricted uft8 encoding,
975 : but "UTF-8" is the name of the standard and should give the
976 : corresponding behaviour.
977
978 For what it's worth, that's how I've always kept them straight in my
979 head.
980
981 Also for what it's worth, Perl 6 will mostly default to strict but
982 make it easy to switch back to lax.
983
984 Larry
985
986Got that? As of Perl 5.8.7, B<"UTF-8"> means UTF-8 in its current
987sense, which is conservative and strict and security-conscious, whereas
988B<"utf8"> means UTF-8 in its former sense, which was liberal and loose and
989lax. C<Encode> version 2.10 or later thus groks this subtle but critically
990important distinction between C<"UTF-8"> and C<"utf8">.
991
992 encode("utf8", "\x{FFFF_FFFF}", 1); # okay
993 encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks
994
995In the C<Encode> module, C<"UTF-8"> actually resolves to the canonical name
996C<"utf-8-strict">. That hyphen between the C<"UTF"> and the C<"8"> is
997critical; without it, C<Encode> goes "liberal" and (perhaps overly-)permissive:
998
999 find_encoding("UTF-8")->name # is 'utf-8-strict'
1000 find_encoding("utf-8")->name # ditto. names are case insensitive
1001 find_encoding("utf_8")->name # ditto. "_" are treated as "-"
1002 find_encoding("UTF8")->name # is 'utf8'.
1003
1004Perl's internal UTF8 flag is called "UTF8", without a hyphen. It indicates
1005whether a string is internally encoded as "utf8", also without a hyphen.
1006
1007=head1 SEE ALSO
1008
1009L<Encode::Encoding>,
1010L<Encode::Supported>,
1011L<Encode::PerlIO>,
1012L<encoding>,
1013L<perlebcdic>,
1014L<perlfunc/open>,
1015L<perlunicode>, L<perluniintro>, L<perlunifaq>, L<perlunitut>,
1016L<utf8>,
1017the Perl Unicode Mailing List L<http://lists.perl.org/list/perl-unicode.html>
1018
1019=head1 MAINTAINER
1020
1021This project was originated by the late Nick Ing-Simmons and later
1022maintained by Dan Kogai I<< <dankogai@cpan.org> >>. See AUTHORS
1023for a full list of people involved. For any questions, send mail to
1024I<< <perl-unicode@perl.org> >> so that we can all share.
1025
1026While Dan Kogai retains the copyright as a maintainer, credit
1027should go to all those involved. See AUTHORS for a list of those
1028who submitted code to the project.
1029
1030=head1 COPYRIGHT
1031
1032Copyright 2002-2013 Dan Kogai I<< <dankogai@cpan.org> >>.
1033
1034This library is free software; you can redistribute it and/or modify
1035it under the same terms as Perl itself.
1036
1037=cut