Filename | /usr/lib/x86_64-linux-gnu/perl/5.20/Encode/Alias.pm |
Statements | Executed 0 statements in 145µs |
Line | State ments |
Time on line |
Calls | Time in subs |
Code |
---|---|---|---|---|---|
1 | package Encode::Alias; | ||||
2 | use strict; | ||||
3 | use warnings; | ||||
4 | no warnings 'redefine'; | ||||
5 | our $VERSION = do { my @r = ( q$Revision: 2.18 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; | ||||
6 | use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG}; | ||||
7 | |||||
8 | use Exporter 'import'; | ||||
9 | |||||
10 | # Public, encouraged API is exported by default | ||||
11 | |||||
12 | our @EXPORT = | ||||
13 | qw ( | ||||
14 | define_alias | ||||
15 | find_alias | ||||
16 | ); | ||||
17 | |||||
18 | our @Alias; # ordered matching list | ||||
19 | our %Alias; # cached known aliases | ||||
20 | |||||
21 | sub find_alias { | ||||
22 | require Encode; | ||||
23 | my $class = shift; | ||||
24 | my $find = shift; | ||||
25 | unless ( exists $Alias{$find} ) { | ||||
26 | $Alias{$find} = undef; # Recursion guard | ||||
27 | for ( my $i = 0 ; $i < @Alias ; $i += 2 ) { | ||||
28 | my $alias = $Alias[$i]; | ||||
29 | my $val = $Alias[ $i + 1 ]; | ||||
30 | my $new; | ||||
31 | if ( ref($alias) eq 'Regexp' && $find =~ $alias ) { | ||||
32 | DEBUG and warn "eval $val"; | ||||
33 | 1 | 145µs | $new = eval $val; # spent 3µs executing statements in string eval | ||
34 | DEBUG and $@ and warn "$val, $@"; | ||||
35 | } | ||||
36 | elsif ( ref($alias) eq 'CODE' ) { | ||||
37 | DEBUG and warn "$alias", "->", "($find)"; | ||||
38 | $new = $alias->($find); | ||||
39 | } | ||||
40 | elsif ( lc($find) eq lc($alias) ) { | ||||
41 | $new = $val; | ||||
42 | } | ||||
43 | if ( defined($new) ) { | ||||
44 | next if $new eq $find; # avoid (direct) recursion on bugs | ||||
45 | DEBUG and warn "$alias, $new"; | ||||
46 | my $enc = | ||||
47 | ( ref($new) ) ? $new : Encode::find_encoding($new); | ||||
48 | if ($enc) { | ||||
49 | $Alias{$find} = $enc; | ||||
50 | last; | ||||
51 | } | ||||
52 | } | ||||
53 | } | ||||
54 | |||||
55 | # case insensitive search when canonical is not in all lowercase | ||||
56 | # RT ticket #7835 | ||||
57 | unless ( $Alias{$find} ) { | ||||
58 | my $lcfind = lc($find); | ||||
59 | for my $name ( keys %Encode::Encoding, keys %Encode::ExtModule ) | ||||
60 | { | ||||
61 | $lcfind eq lc($name) or next; | ||||
62 | $Alias{$find} = Encode::find_encoding($name); | ||||
63 | DEBUG and warn "$find => $name"; | ||||
64 | } | ||||
65 | } | ||||
66 | } | ||||
67 | if (DEBUG) { | ||||
68 | my $name; | ||||
69 | if ( my $e = $Alias{$find} ) { | ||||
70 | $name = $e->name; | ||||
71 | } | ||||
72 | else { | ||||
73 | $name = ""; | ||||
74 | } | ||||
75 | warn "find_alias($class, $find)->name = $name"; | ||||
76 | } | ||||
77 | return $Alias{$find}; | ||||
78 | } | ||||
79 | |||||
80 | sub define_alias { | ||||
81 | while (@_) { | ||||
82 | my ( $alias, $name ) = splice( @_, 0, 2 ); | ||||
83 | unshift( @Alias, $alias => $name ); # newer one has precedence | ||||
84 | if ( ref($alias) ) { | ||||
85 | |||||
86 | # clear %Alias cache to allow overrides | ||||
87 | my @a = keys %Alias; | ||||
88 | for my $k (@a) { | ||||
89 | if ( ref($alias) eq 'Regexp' && $k =~ $alias ) { | ||||
90 | DEBUG and warn "delete \$Alias\{$k\}"; | ||||
91 | delete $Alias{$k}; | ||||
92 | } | ||||
93 | elsif ( ref($alias) eq 'CODE' && $alias->($k) ) { | ||||
94 | DEBUG and warn "delete \$Alias\{$k\}"; | ||||
95 | delete $Alias{$k}; | ||||
96 | } | ||||
97 | } | ||||
98 | } | ||||
99 | else { | ||||
100 | DEBUG and warn "delete \$Alias\{$alias\}"; | ||||
101 | delete $Alias{$alias}; | ||||
102 | } | ||||
103 | } | ||||
104 | } | ||||
105 | |||||
106 | # Allow latin-1 style names as well | ||||
107 | # 0 1 2 3 4 5 6 7 8 9 10 | ||||
108 | our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 ); | ||||
109 | |||||
110 | # Allow winlatin1 style names as well | ||||
111 | our %Winlatin2cp = ( | ||||
112 | 'latin1' => 1252, | ||||
113 | 'latin2' => 1250, | ||||
114 | 'cyrillic' => 1251, | ||||
115 | 'greek' => 1253, | ||||
116 | 'turkish' => 1254, | ||||
117 | 'hebrew' => 1255, | ||||
118 | 'arabic' => 1256, | ||||
119 | 'baltic' => 1257, | ||||
120 | 'vietnamese' => 1258, | ||||
121 | ); | ||||
122 | |||||
123 | init_aliases(); | ||||
124 | |||||
125 | sub undef_aliases { | ||||
126 | @Alias = (); | ||||
127 | %Alias = (); | ||||
128 | } | ||||
129 | |||||
130 | sub init_aliases { | ||||
131 | require Encode; | ||||
132 | undef_aliases(); | ||||
133 | |||||
134 | # Try all-lower-case version should all else fails | ||||
135 | define_alias( qr/^(.*)$/ => '"\L$1"' ); | ||||
136 | |||||
137 | # UTF/UCS stuff | ||||
138 | define_alias( qr/^(unicode-1-1-)?UTF-?7$/i => '"UTF-7"' ); | ||||
139 | define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' ); | ||||
140 | define_alias( | ||||
141 | qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"', | ||||
142 | qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")', | ||||
143 | qr/^iso-10646-1$/i => '"UCS-2BE"' | ||||
144 | ); | ||||
145 | define_alias( | ||||
146 | qr/^UTF-?(16|32)-?BE$/i => '"UTF-$1BE"', | ||||
147 | qr/^UTF-?(16|32)-?LE$/i => '"UTF-$1LE"', | ||||
148 | qr/^UTF-?(16|32)$/i => '"UTF-$1"', | ||||
149 | ); | ||||
150 | |||||
151 | # ASCII | ||||
152 | define_alias( qr/^(?:US-?)ascii$/i => '"ascii"' ); | ||||
153 | define_alias( 'C' => 'ascii' ); | ||||
154 | define_alias( qr/\b(?:ISO[-_]?)?646(?:[-_]?US)?$/i => '"ascii"' ); | ||||
155 | |||||
156 | # Allow variants of iso-8859-1 etc. | ||||
157 | define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' ); | ||||
158 | |||||
159 | # At least HP-UX has these. | ||||
160 | define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' ); | ||||
161 | |||||
162 | # More HP stuff. | ||||
163 | define_alias( | ||||
164 | qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => | ||||
165 | '"${1}8"' ); | ||||
166 | |||||
167 | # The Official name of ASCII. | ||||
168 | define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' ); | ||||
169 | |||||
170 | # This is a font issue, not an encoding issue. | ||||
171 | # (The currency symbol of the Latin 1 upper half | ||||
172 | # has been redefined as the euro symbol.) | ||||
173 | define_alias( qr/^(.+)\@euro$/i => '"$1"' ); | ||||
174 | |||||
175 | define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i => | ||||
176 | 'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef' | ||||
177 | ); | ||||
178 | |||||
179 | define_alias( | ||||
180 | qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish| | ||||
181 | hebrew|arabic|baltic|vietnamese)$/ix => | ||||
182 | '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' | ||||
183 | ); | ||||
184 | |||||
185 | # Common names for non-latin preferred MIME names | ||||
186 | define_alias( | ||||
187 | 'ascii' => 'US-ascii', | ||||
188 | 'cyrillic' => 'iso-8859-5', | ||||
189 | 'arabic' => 'iso-8859-6', | ||||
190 | 'greek' => 'iso-8859-7', | ||||
191 | 'hebrew' => 'iso-8859-8', | ||||
192 | 'thai' => 'iso-8859-11', | ||||
193 | ); | ||||
194 | # RT #20781 | ||||
195 | define_alias(qr/\btis-?620\b/i => '"iso-8859-11"'); | ||||
196 | |||||
197 | # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN. | ||||
198 | # And Microsoft has their own naming (again, surprisingly). | ||||
199 | # And windows-* is registered in IANA! | ||||
200 | define_alias( | ||||
201 | qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"' ); | ||||
202 | |||||
203 | # Sometimes seen with a leading zero. | ||||
204 | # define_alias( qr/\bcp037\b/i => '"cp37"'); | ||||
205 | |||||
206 | # Mac Mappings | ||||
207 | # predefined in *.ucm; unneeded | ||||
208 | # define_alias( qr/\bmacIcelandic$/i => '"macIceland"'); | ||||
209 | define_alias( qr/^(?:x[_-])?mac[_-](.*)$/i => '"mac$1"' ); | ||||
210 | # http://rt.cpan.org/Ticket/Display.html?id=36326 | ||||
211 | define_alias( qr/^macintosh$/i => '"MacRoman"' ); | ||||
212 | # https://rt.cpan.org/Ticket/Display.html?id=78125 | ||||
213 | define_alias( qr/^macce$/i => '"MacCentralEurRoman"' ); | ||||
214 | # Ououououou. gone. They are different! | ||||
215 | # define_alias( qr/\bmacRomanian$/i => '"macRumanian"'); | ||||
216 | |||||
217 | # Standardize on the dashed versions. | ||||
218 | define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' ); | ||||
219 | |||||
220 | unless ($Encode::ON_EBCDIC) { | ||||
221 | |||||
222 | # for Encode::CN | ||||
223 | define_alias( qr/\beuc.*cn$/i => '"euc-cn"' ); | ||||
224 | define_alias( qr/\bcn.*euc$/i => '"euc-cn"' ); | ||||
225 | |||||
226 | # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' ) | ||||
227 | # CP936 doesn't have vendor-addon for GBK, so they're identical. | ||||
228 | define_alias( qr/^gbk$/i => '"cp936"' ); | ||||
229 | |||||
230 | # This fixes gb2312 vs. euc-cn confusion, practically | ||||
231 | define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' ); | ||||
232 | |||||
233 | # for Encode::JP | ||||
234 | define_alias( qr/\bjis$/i => '"7bit-jis"' ); | ||||
235 | define_alias( qr/\beuc.*jp$/i => '"euc-jp"' ); | ||||
236 | define_alias( qr/\bjp.*euc$/i => '"euc-jp"' ); | ||||
237 | define_alias( qr/\bujis$/i => '"euc-jp"' ); | ||||
238 | define_alias( qr/\bshift.*jis$/i => '"shiftjis"' ); | ||||
239 | define_alias( qr/\bsjis$/i => '"shiftjis"' ); | ||||
240 | define_alias( qr/\bwindows-31j$/i => '"cp932"' ); | ||||
241 | |||||
242 | # for Encode::KR | ||||
243 | define_alias( qr/\beuc.*kr$/i => '"euc-kr"' ); | ||||
244 | define_alias( qr/\bkr.*euc$/i => '"euc-kr"' ); | ||||
245 | |||||
246 | # This fixes ksc5601 vs. euc-kr confusion, practically | ||||
247 | define_alias( qr/(?:x-)?uhc$/i => '"cp949"' ); | ||||
248 | define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' ); | ||||
249 | define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' ); | ||||
250 | |||||
251 | # for Encode::TW | ||||
252 | define_alias( qr/\bbig-?5$/i => '"big5-eten"' ); | ||||
253 | define_alias( qr/\bbig5-?et(?:en)?$/i => '"big5-eten"' ); | ||||
254 | define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' ); | ||||
255 | define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' ); | ||||
256 | define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' ); | ||||
257 | } | ||||
258 | |||||
259 | # utf8 is blessed :) | ||||
260 | define_alias( qr/\bUTF-8$/i => '"utf-8-strict"' ); | ||||
261 | |||||
262 | # At last, Map white space and _ to '-' | ||||
263 | define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' ); | ||||
264 | } | ||||
265 | |||||
266 | 1; | ||||
267 | __END__ | ||||
268 | |||||
269 | # TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8 | ||||
270 | # TODO: HP-UX '15' encodings japanese15 korean15 roi15 | ||||
271 | # TODO: Cyrillic encoding ISO-IR-111 (useful?) | ||||
272 | # TODO: Armenian encoding ARMSCII-8 | ||||
273 | # TODO: Hebrew encoding ISO-8859-8-1 | ||||
274 | # TODO: Thai encoding TCVN | ||||
275 | # TODO: Vietnamese encodings VPS | ||||
276 | # TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese | ||||
277 | # ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic | ||||
278 | # Farsi Georgian Gujarati Gurmukhi Hebrew Japanese | ||||
279 | # Kannada Khmer Korean Laotian Malayalam Mongolian | ||||
280 | # Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese | ||||
281 | |||||
282 | =head1 NAME | ||||
283 | |||||
284 | Encode::Alias - alias definitions to encodings | ||||
285 | |||||
286 | =head1 SYNOPSIS | ||||
287 | |||||
288 | use Encode; | ||||
289 | use Encode::Alias; | ||||
290 | define_alias( "newName" => ENCODING); | ||||
291 | define_alias( qr/.../ => ENCODING); | ||||
292 | define_alias( sub { return ENCODING if ...; } ); | ||||
293 | |||||
294 | =head1 DESCRIPTION | ||||
295 | |||||
296 | Allows newName to be used as an alias for ENCODING. ENCODING may be | ||||
297 | either the name of an encoding or an encoding object (as described | ||||
298 | in L<Encode>). | ||||
299 | |||||
300 | Currently the first argument to define_alias() can be specified in the | ||||
301 | following ways: | ||||
302 | |||||
303 | =over 4 | ||||
304 | |||||
305 | =item As a simple string. | ||||
306 | |||||
307 | =item As a qr// compiled regular expression, e.g.: | ||||
308 | |||||
309 | define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' ); | ||||
310 | |||||
311 | In this case, if I<ENCODING> is not a reference, it is C<eval>-ed | ||||
312 | in order to allow C<$1> etc. to be substituted. The example is one | ||||
313 | way to alias names as used in X11 fonts to the MIME names for the | ||||
314 | iso-8859-* family. Note the double quotes inside the single quotes. | ||||
315 | |||||
316 | (or, you don't have to do this yourself because this example is predefined) | ||||
317 | |||||
318 | If you are using a regex here, you have to use the quotes as shown or | ||||
319 | it won't work. Also note that regex handling is tricky even for the | ||||
320 | experienced. Use this feature with caution. | ||||
321 | |||||
322 | =item As a code reference, e.g.: | ||||
323 | |||||
324 | define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } ); | ||||
325 | |||||
326 | The same effect as the example above in a different way. The coderef | ||||
327 | takes the alias name as an argument and returns a canonical name on | ||||
328 | success or undef if not. Note the second argument is ignored if provided. | ||||
329 | Use this with even more caution than the regex version. | ||||
330 | |||||
331 | =back | ||||
332 | |||||
333 | =head3 Changes in code reference aliasing | ||||
334 | |||||
335 | As of Encode 1.87, the older form | ||||
336 | |||||
337 | define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } ); | ||||
338 | |||||
339 | no longer works. | ||||
340 | |||||
341 | Encode up to 1.86 internally used "local $_" to implement this older | ||||
342 | form. But consider the code below; | ||||
343 | |||||
344 | use Encode; | ||||
345 | $_ = "eeeee" ; | ||||
346 | while (/(e)/g) { | ||||
347 | my $utf = decode('aliased-encoding-name', $1); | ||||
348 | print "position:",pos,"\n"; | ||||
349 | } | ||||
350 | |||||
351 | Prior to Encode 1.86 this fails because of "local $_". | ||||
352 | |||||
353 | =head2 Alias overloading | ||||
354 | |||||
355 | You can override predefined aliases by simply applying define_alias(). | ||||
356 | The new alias is always evaluated first, and when necessary, | ||||
357 | define_alias() flushes the internal cache to make the new definition | ||||
358 | available. | ||||
359 | |||||
360 | # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a | ||||
361 | # superset of SHIFT_JIS | ||||
362 | |||||
363 | define_alias( qr/shift.*jis$/i => '"cp932"' ); | ||||
364 | define_alias( qr/sjis$/i => '"cp932"' ); | ||||
365 | |||||
366 | If you want to zap all predefined aliases, you can use | ||||
367 | |||||
368 | Encode::Alias->undef_aliases; | ||||
369 | |||||
370 | to do so. And | ||||
371 | |||||
372 | Encode::Alias->init_aliases; | ||||
373 | |||||
374 | gets the factory settings back. | ||||
375 | |||||
376 | Note that define_alias() will not be able to override the canonical name | ||||
377 | of encodings. Encodings are first looked up by canonical name before | ||||
378 | potential aliases are tried. | ||||
379 | |||||
380 | =head1 SEE ALSO | ||||
381 | |||||
382 | L<Encode>, L<Encode::Supported> | ||||
383 | |||||
384 | =cut | ||||
385 |