.\" Source file: /usr/local/lp/sonarperl/man/man3/utf8.3
.\" Automatically generated by Pod::Man 4.07 (Pod::Simple 3.32)
.\"
.\" Standard preamble:
.\" ========================================================================
.de Sp \" Vertical space (when we can't use .PP)
.if t .sp .5v
.if n .sp
..
.de Vb \" Begin verbatim text
.ft CW
.nf
.ne \\$1
..
.de Ve \" End verbatim text
.ft R
.fi
..
.\" Set up some character translations and predefined strings.  \*(-- will
.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
.\" double quote, and \*(R" will give a right double quote.  \*(C+ will
.\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
.\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
.\" nothing in troff, for use with C<>.
.tr \(*W-
.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
.ie n \{\
.    ds -- \(*W-
.    ds PI pi
.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
.    ds L" ""
.    ds R" ""
.    ds C` ""
.    ds C' ""
'br\}
.el\{\
.    ds -- \|\(em\|
.    ds PI \(*p
.    ds L" ``
.    ds R" ''
.    ds C`
.    ds C'
'br\}
.\"
.\" Escape single quotes in literal strings from groff's Unicode transform.
.ie \n(.g .ds Aq \(aq
.el       .ds Aq '
.\"
.\" If the F register is >0, we'll generate index entries on stderr for
.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
.\" entries marked with X<> in POD.  Of course, you'll have to process the
.\" output yourself in some meaningful fashion.
.\"
.\" Avoid warning from groff about undefined register 'F'.
.de IX
..
.if !\nF .nr F 0
.if \nF>0 \{\
.    de IX
.    tm Index:\\$1\t\\n%\t"\\$2"
..
.    if !\nF==2 \{\
.        nr % 0
.        nr F 2
.    \}
.\}
.\"
.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
.\" Fear.  Run.  Save yourself.  No user-serviceable parts.
.    \" fudge factors for nroff and troff
.if n \{\
.    ds #H 0
.    ds #V .8m
.    ds #F .3m
.    ds #[ \f1
.    ds #] \fP
.\}
.if t \{\
.    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
.    ds #V .6m
.    ds #F 0
.    ds #[ \&
.    ds #] \&
.\}
.    \" simple accents for nroff and troff
.if n \{\
.    ds ' \&
.    ds ` \&
.    ds ^ \&
.    ds , \&
.    ds ~ ~
.    ds /
.\}
.if t \{\
.    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
.    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
.    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
.    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
.    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
.    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
.\}
.    \" troff and (daisy-wheel) nroff accents
.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
.ds ae a\h'-(\w'a'u*4/10)'e
.ds Ae A\h'-(\w'A'u*4/10)'E
.    \" corrections for vroff
.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
.    \" for low resolution devices (crt and lpr)
.if \n(.H>23 .if \n(.V>19 \
\{\
.    ds : e
.    ds 8 ss
.    ds o a
.    ds d- d\h'-1'\(ga
.    ds D- D\h'-1'\(hy
.    ds th \o'bp'
.    ds Th \o'LP'
.    ds ae ae
.    ds Ae AE
.\}
.rm #[ #] #H #V #F C
.\" ========================================================================
.\"
.IX Title "utf8 3"
.TH utf8 3 "2016-07-14" "perl v5.24.1" "Perl Programmers Reference Guide"
.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents.
.if n .ad l
.nh
.SH "NAME"
utf8 \- Perl pragma to enable/disable UTF\-8 (or UTF\-EBCDIC) in source code
.SH "SYNOPSIS"
.IX Header "SYNOPSIS"
.Vb 2
\& use utf8;
\& no utf8;
\&
\& # Convert the internal representation of a Perl scalar to/from UTF\-8.
\&
\& $num_octets = utf8::upgrade($string);
\& $success    = utf8::downgrade($string[, $fail_ok]);
\&
\& # Change each character of a Perl scalar to/from a series of
\& # characters that represent the UTF\-8 bytes of each original character.
\&
\& utf8::encode($string);  # "\ex{100}"  becomes "\exc4\ex80"
\& utf8::decode($string);  # "\exc4\ex80" becomes "\ex{100}"
\&
\& # Convert a code point from the platform native character set to
\& # Unicode, and vice\-versa.
\& $unicode = utf8::native_to_unicode(ord(\*(AqA\*(Aq)); # returns 65 on both
\&                                               # ASCII and EBCDIC
\&                                               # platforms
\& $native = utf8::unicode_to_native(65);        # returns 65 on ASCII
\&                                               # platforms; 193 on
\&                                               # EBCDIC
\&
\& $flag = utf8::is_utf8($string); # since Perl 5.8.1
\& $flag = utf8::valid($string);
.Ve
.SH "DESCRIPTION"
.IX Header "DESCRIPTION"
The \f(CW\*(C`use utf8\*(C'\fR pragma tells the Perl parser to allow \s-1UTF\-8\s0 in the
program text in the current lexical scope.  The \f(CW\*(C`no utf8\*(C'\fR pragma tells Perl
to switch back to treating the source text as literal bytes in the current
lexical scope.  (On \s-1EBCDIC\s0 platforms, technically it is allowing UTF-EBCDIC,
and not \s-1UTF\-8,\s0 but this distinction is academic, so in this document the term
\&\s-1UTF\-8\s0 is used to mean both.)
.PP
\&\fBDo not use this pragma for anything other than telling Perl that your
script is written in \s-1UTF\-8.\s0\fR The utility functions described below are
directly usable without \f(CW\*(C`use utf8;\*(C'\fR.
.PP
Because it is not possible to reliably tell \s-1UTF\-8\s0 from native 8 bit
encodings, you need either a Byte Order Mark at the beginning of your
source code, or \f(CW\*(C`use utf8;\*(C'\fR, to instruct perl.
.PP
When \s-1UTF\-8\s0 becomes the standard source format, this pragma will
effectively become a no-op.
.PP
See also the effects of the \f(CW\*(C`\-C\*(C'\fR switch and its cousin, the
\&\f(CW\*(C`PERL_UNICODE\*(C'\fR environment variable, in perlrun.
.PP
Enabling the \f(CW\*(C`utf8\*(C'\fR pragma has the following effect:
.IP "\(bu" 4
Bytes in the source text that are not in the \s-1ASCII\s0 character set will be
treated as being part of a literal \s-1UTF\-8\s0 sequence.  This includes most
literals such as identifier names, string constants, and constant
regular expression patterns.
.PP
Note that if you have non-ASCII, non\-UTF\-8 bytes in your script (for example
embedded Latin\-1 in your string literals), \f(CW\*(C`use utf8\*(C'\fR will be unhappy.  If
you want to have such bytes under \f(CW\*(C`use utf8\*(C'\fR, you can disable this pragma
until the end of the block (or file, if at top level) by \f(CW\*(C`no utf8;\*(C'\fR.
.SS "Utility functions"
.IX Subsection "Utility functions"
The following functions are defined in the \f(CW\*(C`utf8::\*(C'\fR package by the
Perl core.  You do not need to say \f(CW\*(C`use utf8\*(C'\fR to use these and in fact
you should not say that unless you really want to have \s-1UTF\-8\s0 source code.
.IP "\(bu" 4
\&\f(CW\*(C`$num_octets = utf8::upgrade($string)\*(C'\fR
.Sp
(Since Perl v5.8.0)
Converts in-place the internal representation of the string from an octet
sequence in the native encoding (Latin\-1 or \s-1EBCDIC\s0) to \s-1UTF\-8.\s0 The
logical character sequence itself is unchanged.  If \fI\f(CI$string\fI\fR is already
stored as \s-1UTF\-8,\s0 then this is a no-op. Returns the
number of octets necessary to represent the string as \s-1UTF\-8. \s0 Can be
used to make sure that the \s-1UTF\-8\s0 flag is on, so that \f(CW\*(C`\ew\*(C'\fR or \f(CW\*(C`lc()\*(C'\fR
work as Unicode on strings containing non-ASCII characters whose code points
are below 256.
.Sp
\&\fBNote that this function does not handle arbitrary encodings\fR;
use Encode instead.
.IP "\(bu" 4
\&\f(CW\*(C`$success = utf8::downgrade($string[, $fail_ok])\*(C'\fR
.Sp
(Since Perl v5.8.0)
Converts in-place the internal representation of the string from
\&\s-1UTF\-8\s0 to the equivalent octet sequence in the native encoding (Latin\-1
or \s-1EBCDIC\s0). The logical character sequence itself is unchanged. If
\&\fI\f(CI$string\fI\fR is already stored as native 8 bit, then this is a no-op.  Can
be used to
make sure that the \s-1UTF\-8\s0 flag is off, e.g. when you want to make sure
that the \fIsubstr()\fR or \fIlength()\fR function works with the usually faster
byte algorithm.
.Sp
Fails if the original \s-1UTF\-8\s0 sequence cannot be represented in the
native 8 bit encoding. On failure dies or, if the value of \fI\f(CI$fail_ok\fI\fR is
true, returns false.
.Sp
Returns true on success.
.Sp
\&\fBNote that this function does not handle arbitrary encodings\fR;
use Encode instead.
.IP "\(bu" 4
\&\f(CW\*(C`utf8::encode($string)\*(C'\fR
.Sp
(Since Perl v5.8.0)
Converts in-place the character sequence to the corresponding octet
sequence in \s-1UTF\-8.\s0 That is, every (possibly wide) character gets
replaced with a sequence of one or more characters that represent the
individual \s-1UTF\-8\s0 bytes of the character.  The \s-1UTF\-8\s0 flag is turned off.
Returns nothing.
.Sp
.Vb 4
\& my $a = "\ex{100}"; # $a contains one character, with ord 0x100
\& utf8::encode($a);  # $a contains two characters, with ords (on
\&                    # ASCII platforms) 0xc4 and 0x80.  On EBCDIC
\&                    # 1047, this would instead be 0x8C and 0x41.
.Ve
.Sp
\&\fBNote that this function does not handle arbitrary encodings\fR;
use Encode instead.
.IP "\(bu" 4
\&\f(CW\*(C`$success = utf8::decode($string)\*(C'\fR
.Sp
(Since Perl v5.8.0)
Attempts to convert in-place the octet sequence encoded as \s-1UTF\-8\s0 to the
corresponding character sequence. That is, it replaces each sequence of
characters in the string whose ords represent a valid \s-1UTF\-8\s0 byte
sequence, with the corresponding single character.  The \s-1UTF\-8\s0 flag is
turned on only if the source string contains multiple-byte \s-1UTF\-8\s0
characters.  If \fI\f(CI$string\fI\fR is invalid as \s-1UTF\-8,\s0 returns false;
otherwise returns true.
.Sp
.Vb 6
\& my $a = "\exc4\ex80"; # $a contains two characters, with ords
\&                     # 0xc4 and 0x80
\& utf8::decode($a);   # On ASCII platforms, $a contains one char,
\&                     # with ord 0x100.   Since these bytes aren\*(Aqt
\&                     # legal UTF\-EBCDIC, on EBCDIC platforms, $a is
\&                     # unchanged and the function returns FALSE.
.Ve
.Sp
\&\fBNote that this function does not handle arbitrary encodings\fR;
use Encode instead.
.IP "\(bu" 4
\&\f(CW\*(C`$unicode = utf8::native_to_unicode($code_point)\*(C'\fR
.Sp
(Since Perl v5.8.0)
This takes an unsigned integer (which represents the ordinal number of a
character (or a code point) on the platform the program is being run on) and
returns its Unicode equivalent value.  Since \s-1ASCII\s0 platforms natively use the
Unicode code points, this function returns its input on them.  On \s-1EBCDIC\s0
platforms it converts from \s-1EBCDIC\s0 to Unicode.
.Sp
A meaningless value will currently be returned if the input is not an unsigned
integer.
.Sp
Since Perl v5.22.0, calls to this function are optimized out on \s-1ASCII\s0
platforms, so there is no performance hit in using it there.
.IP "\(bu" 4
\&\f(CW\*(C`$native = utf8::unicode_to_native($code_point)\*(C'\fR
.Sp
(Since Perl v5.8.0)
This is the inverse of \f(CW\*(C`utf8::native_to_unicode()\*(C'\fR, converting the other
direction.  Again, on \s-1ASCII\s0 platforms, this returns its input, but on \s-1EBCDIC\s0
platforms it will find the native platform code point, given any Unicode one.
.Sp
A meaningless value will currently be returned if the input is not an unsigned
integer.
.Sp
Since Perl v5.22.0, calls to this function are optimized out on \s-1ASCII\s0
platforms, so there is no performance hit in using it there.
.IP "\(bu" 4
\&\f(CW\*(C`$flag = utf8::is_utf8($string)\*(C'\fR
.Sp
(Since Perl 5.8.1)  Test whether \fI\f(CI$string\fI\fR is marked internally as encoded in
\&\s-1UTF\-8. \s0 Functionally the same as \f(CW\*(C`Encode::is_utf8()\*(C'\fR.
.IP "\(bu" 4
\&\f(CW\*(C`$flag = utf8::valid($string)\*(C'\fR
.Sp
[\s-1INTERNAL\s0] Test whether \fI\f(CI$string\fI\fR is in a consistent state regarding
\&\s-1UTF\-8. \s0 Will return true if it is well-formed \s-1UTF\-8\s0 and has the \s-1UTF\-8\s0 flag
on \fBor\fR if \fI\f(CI$string\fI\fR is held as bytes (both these states are 'consistent').
The main reason for this routine is to allow Perl's test suite to check
that operations have left strings in a consistent state.  You most
probably want to use \f(CW\*(C`utf8::is_utf8()\*(C'\fR instead.
.PP
\&\f(CW\*(C`utf8::encode\*(C'\fR is like \f(CW\*(C`utf8::upgrade\*(C'\fR, but the \s-1UTF\-8\s0 flag is
cleared.  See perlunicode, and the C \s-1API\s0
functions \f(CW\*(C`sv_utf8_upgrade\*(C'\fR,
\&\f(CW\*(C`sv_utf8_downgrade\*(C'\fR, \f(CW\*(C`sv_utf8_encode\*(C'\fR,
and \f(CW\*(C`sv_utf8_decode\*(C'\fR (all documented in perlapi), which are wrapped by the Perl functions
\&\f(CW\*(C`utf8::upgrade\*(C'\fR, \f(CW\*(C`utf8::downgrade\*(C'\fR, \f(CW\*(C`utf8::encode\*(C'\fR and
\&\f(CW\*(C`utf8::decode\*(C'\fR.  Also, the functions \f(CW\*(C`utf8::is_utf8\*(C'\fR, \f(CW\*(C`utf8::valid\*(C'\fR,
\&\f(CW\*(C`utf8::encode\*(C'\fR, \f(CW\*(C`utf8::decode\*(C'\fR, \f(CW\*(C`utf8::upgrade\*(C'\fR, and \f(CW\*(C`utf8::downgrade\*(C'\fR are
actually internal, and thus always available, without a \f(CW\*(C`require utf8\*(C'\fR
statement.
.SH "BUGS"
.IX Header "BUGS"
Some filesystems may not support \s-1UTF\-8\s0 file names, or they may be supported
incompatibly with Perl.  Therefore \s-1UTF\-8\s0 names that are visible to the
filesystem, such as module names, may not work.
.SH "SEE ALSO"
.IX Header "SEE ALSO"
perlunitut, perluniintro, perlrun, bytes, perlunicode