diff options
author | Stuart Henderson <sthen@cvs.openbsd.org> | 2013-03-25 20:09:46 +0000 |
---|---|---|
committer | Stuart Henderson <sthen@cvs.openbsd.org> | 2013-03-25 20:09:46 +0000 |
commit | 6eda9412f8dc717200dec22c57afdd2a46edfa8e (patch) | |
tree | c835d11c3e3f90de6a8561320ec54f077bfd358f /gnu/usr.bin/perl/pod | |
parent | 19adbb7d6dbd1516758cad91978165088a19e917 (diff) |
import perl 5.16.3 from CPAN - worked on by Andrew Fresh and myself
Diffstat (limited to 'gnu/usr.bin/perl/pod')
70 files changed, 19633 insertions, 9072 deletions
diff --git a/gnu/usr.bin/perl/pod/Makefile.SH b/gnu/usr.bin/perl/pod/Makefile.SH index 1d53de7f9dc..331e15c9f8f 100644 --- a/gnu/usr.bin/perl/pod/Makefile.SH +++ b/gnu/usr.bin/perl/pod/Makefile.SH @@ -54,57 +54,53 @@ LDLIBPTH = $ldlibpth ## In the following dollars and backticks do not need the extra backslash. $spitshell >>Makefile <<'!NO!SUBS!' -CONVERTERS = pod2html pod2latex pod2man pod2text \ - pod2usage podchecker podselect - HTMLROOT = / # Change this to fix cross-references in HTML -POD2HTML = pod2html \ - --htmlroot=$(HTMLROOT) \ - --podroot=.. --podpath=pod:lib:ext:vms \ - --libpods=perlfunc:perlguts:perlvar:perlrun:perlop +POD2HTML_ARGS = --htmlroot=$(HTMLROOT) --podroot=.. --podpath=pod:lib:ext:vms +POD2HTML = ../ext/Pod-Html/pod2html +POD2MAN = ../cpan/podlators/pod2man +POD2LATEX = ../cpan/Pod-LaTeX/pod2latex +PODCHECKER = ../cpan/Pod-Parser/podchecker PERL = ../miniperl PERLILIB = $(PERL) -I../lib REALPERL = ../perl -all: $(CONVERTERS) man - -converters: $(CONVERTERS) +all: man -man: pod2man $(MAN) +man: $(POD2MAN) $(MAN) -html: pod2html $(HTML) +html: $(POD2HTML) $(HTML) -tex: pod2latex $(TEX) +tex: $(POD2LATEX) $(TEX) toc perltoc.pod: buildtoc - $(PERLILIB) buildtoc --build-toc + $(PERLILIB) buildtoc .SUFFIXES: .pm .pod .SUFFIXES: .man -.pm.man: pod2man - $(REALPERL) -I../lib pod2man $*.pm >$*.man +.pm.man: $(POD2MAN) + $(REALPERL) -I../lib $(POD2MAN) $*.pm >$*.man -.pod.man: pod2man - $(REALPERL) -I../lib pod2man $*.pod >$*.man +.pod.man: $(POD2MAN) + $(REALPERL) -I../lib $(POD2MAN) $*.pod >$*.man .SUFFIXES: .html -.pm.html: pod2html - $(PERL) -I../lib $(POD2HTML) --infile=$*.pm --outfile=$*.html +.pm.html: $(POD2HTML) + $(PERL) -I../lib $(POD2HTML) $(POD2HTML_ARGS) --infile=$*.pm --outfile=$*.html -.pod.html: pod2html - $(PERL) -I../lib $(POD2HTML) --infile=$*.pod --outfile=$*.html +.pod.html: $(POD2HTML) + $(PERL) -I../lib $(POD2HTML) $(POD2HTML_ARGS) --infile=$*.pod --outfile=$*.html .SUFFIXES: .tex -.pm.tex: pod2latex - $(PERL) -I../lib 
pod2latex $*.pm +.pm.tex: $(POD2LATEX) + $(PERL) -I../lib $(POD2LATEX) $*.pm -.pod.tex: pod2latex - $(PERL) -I../lib pod2latex $*.pod +.pod.tex: $(POD2LATEX) + $(PERL) -I../lib $(POD2LATEX) $*.pod clean: rm -f $(MAN) @@ -114,37 +110,13 @@ clean: rm -f *.aux *.log *.exe realclean: clean - rm -f $(CONVERTERS) distclean: realclean veryclean: distclean -rm -f *~ *.orig -check: podchecker +check: $(PODCHECKER) @echo "checking..."; \ - $(PERL) -I../lib podchecker $(POD) - -# Dependencies. -pod2latex: pod2latex.PL ../lib/Config.pm - $(LDLIBPTH) $(PERL) -I../lib pod2latex.PL - -pod2html: pod2html.PL ../lib/Config.pm - $(LDLIBPTH) $(PERL) -I ../lib pod2html.PL - -pod2man: pod2man.PL ../lib/Config.pm - $(LDLIBPTH) $(PERL) -I ../lib pod2man.PL - -pod2text: pod2text.PL ../lib/Config.pm - $(LDLIBPTH) $(PERL) -I ../lib pod2text.PL - -pod2usage: pod2usage.PL ../lib/Config.pm - $(PERL) -I ../lib pod2usage.PL - -podchecker: podchecker.PL ../lib/Config.pm - $(PERL) -I ../lib podchecker.PL - -podselect: podselect.PL ../lib/Config.pm - $(PERL) -I ../lib podselect.PL - + $(PERL) -I../lib $(PODCHECKER) $(POD) !NO!SUBS! diff --git a/gnu/usr.bin/perl/pod/perl5004delta.pod b/gnu/usr.bin/perl/pod/perl5004delta.pod index c83f3e6afd4..fc5ae62bb85 100644 --- a/gnu/usr.bin/perl/pod/perl5004delta.pod +++ b/gnu/usr.bin/perl/pod/perl5004delta.pod @@ -753,7 +753,7 @@ and above) or the Borland C++ compiler (versions 5.02 and above). The resulting perl can be used under Windows 95 (if it is installed in the same directory locations as it got installed in Windows NT). This port includes support for perl extension -building tools like L<MakeMaker> and L<h2xs>, so that many extensions +building tools like L<ExtUtils::MakeMaker> and L<h2xs>, so that many extensions available on the Comprehensive Perl Archive Network (CPAN) can now be readily built under Windows NT. 
See http://www.perl.com/ for more information on CPAN and F<README.win32> in the perl distribution for more diff --git a/gnu/usr.bin/perl/pod/perl5005delta.pod b/gnu/usr.bin/perl/pod/perl5005delta.pod index cabdf9eb7cb..62661254a21 100644 --- a/gnu/usr.bin/perl/pod/perl5005delta.pod +++ b/gnu/usr.bin/perl/pod/perl5005delta.pod @@ -523,7 +523,7 @@ encapsulation of Perl. GCC and EGCS are now supported on Win32. See F<README.win32>, aka L<perlwin32>. VMS configuration system has been rewritten. See F<README.vms> (installed -as L<README_vms> on some systems). +as F<README_vms> on some systems). The hints files for most Unix platforms have seen incremental improvements. @@ -722,7 +722,7 @@ imported with the C<use subs> pragma). To silently interpret it as the Perl operator, use the C<CORE::> prefix on the operator (e.g. C<CORE::log($x)>) or by declaring the subroutine -to be an object method (see L<attrs>). +to be an object method (see L</attrs>). =item Bad index while coercing array into hash diff --git a/gnu/usr.bin/perl/pod/perl5100delta.pod b/gnu/usr.bin/perl/pod/perl5100delta.pod index e93c316954a..4e5c6d3a2b6 100644 --- a/gnu/usr.bin/perl/pod/perl5100delta.pod +++ b/gnu/usr.bin/perl/pod/perl5100delta.pod @@ -844,7 +844,7 @@ of C<CPANPLUS>. =item * C<Archive::Extract> is a generic archive extraction mechanism -for F<.tar> (plain, gziped or bzipped) or F<.zip> files. +for F<.tar> (plain, gzipped or bzipped) or F<.zip> files. =item * diff --git a/gnu/usr.bin/perl/pod/perl5101delta.pod b/gnu/usr.bin/perl/pod/perl5101delta.pod index c6cdef977ae..415ab6be245 100644 --- a/gnu/usr.bin/perl/pod/perl5101delta.pod +++ b/gnu/usr.bin/perl/pod/perl5101delta.pod @@ -1751,7 +1751,8 @@ analysed by the Perl porting team. If the bug you are reporting has security implications, which make it inappropriate to send to a publicly archived mailing list, then please send it to perl5-security-report@perl.org. 
This points to a closed subscription -unarchived mailing list, which includes all the core committers, who be able +unarchived mailing list, which includes +all the core committers, who will be able to help assess the impact of issues, figure out a resolution, and help co-ordinate the release of patches to mitigate or fix the problem across all platforms on which Perl is supported. Please only use this address for diff --git a/gnu/usr.bin/perl/pod/perl5120delta.pod b/gnu/usr.bin/perl/pod/perl5120delta.pod index 5d5b401e784..f8a1810c861 100644 --- a/gnu/usr.bin/perl/pod/perl5120delta.pod +++ b/gnu/usr.bin/perl/pod/perl5120delta.pod @@ -199,9 +199,9 @@ See the documentation for C<ExtUtils::MakeMaker> or C<Module::Build> for more on how to specify C<configure_requires> when creating a distribution for CPAN. -=head2 C<each> is now more flexible +=head2 C<each>, C<keys>, C<values> are now more flexible -The C<each> function can now operate on arrays. +The C<each>, C<keys>, and C<values> functions can now operate on arrays. =head2 C<when> as a statement modifier @@ -287,7 +287,7 @@ those installed in C<ARCHLIB> and C<PRIVLIB>. =head2 REGEXPs are now first class -Internally, Perl now treates compiled regular expressions (such as +Internally, Perl now treats compiled regular expressions (such as those created with C<qr//>) as first class entities. Perl modules which serialize, deserialize or otherwise have deep interaction with Perl's internal data structures need to be updated for this change. Most @@ -456,7 +456,6 @@ have been removed from this distribution. C<Module::CoreList> no longer contains the C<%:patchlevel> hash. - =item * C<length undef> now returns undef. @@ -497,7 +496,8 @@ longer be used as an attribute. =item * Perl's command-line switch "-P", which was deprecated in version 5.10.0, has -now been removed. +now been removed. The CPAN module C<< Filter::cpp >> can be used as an +alternative.
=back @@ -528,7 +528,6 @@ The following items are now deprecated: C<suidperl> is no longer part of Perl. It used to provide a mechanism to emulate setuid permission bits on systems that don't support it properly. - =item Use of C<:=> to mean an empty attribute list An accident of Perl's parser meant that these constructions were all @@ -560,7 +559,6 @@ The method C<< UNIVERSAL->import() >> is now deprecated. Attempting to pass import arguments to a C<use UNIVERSAL> statement will result in a deprecation warning. - =item Use of "goto" to jump into a construct Using C<goto> to jump from an outer scope into an inner scope is now @@ -588,7 +586,7 @@ on CPAN which require these should add them to their prerequisites. The core versions of these modules warnings will issue a deprecation warning. If you ship a packaged version of Perl, either alone or as part of a -larger system, then you should carefully consider the reprecussions of +larger system, then you should carefully consider the repercussions of core module deprecations. You may want to consider shipping your default build of Perl with packages for some or all deprecated modules which install into C<vendor> or C<site> perl library directories. This will @@ -1432,7 +1430,6 @@ the beginnings of a document on Perl porting policies. =over - =item * The various large F<Changes*> files (which listed every change made @@ -1446,7 +1443,6 @@ F<Porting/patching.pod> has been deleted, as it mainly described interacting with the old Perforce-based repository, which is now obsolete. Information still relevant has been moved to L<perlrepository>. - =item * The syntax C<unless (EXPR) BLOCK else BLOCK> is now documented as valid, @@ -1454,7 +1450,6 @@ as is the syntax C<unless (EXPR) BLOCK elsif (EXPR) BLOCK ... else BLOCK>, although actually using the latter may not be the best idea for the readability of your source code. - =item * Documented -X overloading. 
@@ -1514,7 +1509,7 @@ The documentation for C<$1> in perlvar.pod has been clarified. =item * -C<\N{U+I<wide hex char>}> is now documented. +C<\N{U+I<code point>}> is now documented. =back @@ -1665,8 +1660,8 @@ C<\N{...}> now compiles better, always forces UTF-8 internal representation Perl's developers have fixed several problems with the recognition of C<\N{...}> constructs. As part of this, perl will store any scalar -or regex containing C<\N{I<name>}> or C<\N{U+I<wide hex char>}> in its -definition in UTF-8 format. (This was true previously for all occurences +or regex containing C<\N{I<name>}> or C<\N{U+I<code point>}> in its +definition in UTF-8 format. (This was true previously for all occurrences of C<\N{I<name>}> that did not use a custom translator, but now it's always true.) @@ -2105,7 +2100,6 @@ passed a position that is not within the scalar's string buffer. This could be caused by buggy XS code, and at this point recovery is not possible. - =item * The fatal error C<Malformed UTF-8 returned by \N> is now produced if the @@ -3175,7 +3169,8 @@ analyzed by the Perl porting team. If the bug you are reporting has security implications, which make it inappropriate to send to a publicly archived mailing list, then please send it to perl5-security-report@perl.org. This points to a closed subscription -unarchived mailing list, which includes all the core committers, who be able +unarchived mailing list, which includes +all the core committers, who will be able to help assess the impact of issues, figure out a resolution, and help co-ordinate the release of patches to mitigate or fix the problem across all platforms on which Perl is supported. 
Please only use this address for diff --git a/gnu/usr.bin/perl/pod/perl5121delta.pod b/gnu/usr.bin/perl/pod/perl5121delta.pod index 2c8a08fb289..a999f144848 100644 --- a/gnu/usr.bin/perl/pod/perl5121delta.pod +++ b/gnu/usr.bin/perl/pod/perl5121delta.pod @@ -2,7 +2,7 @@ =head1 NAME -perldelta - what is new for perl v5.12.1 +perl5121delta - what is new for perl v5.12.1 =head1 DESCRIPTION @@ -60,7 +60,7 @@ We made a small fix to the L<CPANPLUS> test suite to fix an occasional spurious =item * -We upgraded L<Safe> to version 2.27 to wrap coderefs retured by C<reval()> and C<rdo()>. +We upgraded L<Safe> to version 2.27 to wrap coderefs returned by C<reval()> and C<rdo()>. =back @@ -210,12 +210,10 @@ XSUB.h now correctly redefines fgets under PERL_IMPLICIT_SYS See also: L<http://rt.cpan.org/Public/Bug/Display.html?id=55049> - =item * utf8::is_utf8 now respects GMAGIC (e.g. $1) - =item * XS code using C<fputc()> or C<fputs()>: on Windows could cause an error @@ -257,7 +255,7 @@ See also: L<http://rt.perl.org/rt3/Public/Bug/Display.html?id=74290> =item * We fixed a regression in case-insensitive matching of folded characters -in regular expressions introduced in Perl 5.12.0. +in regular expressions introduced in Perl 5.10.1. See also: L<http://rt.perl.org/rt3/Public/Bug/Display.html?id=72998> @@ -390,7 +388,8 @@ analysed by the Perl porting team. If the bug you are reporting has security implications, which make it inappropriate to send to a publicly archived mailing list, then please send it to perl5-security-report@perl.org. This points to a closed subscription -unarchived mailing list, which includes all the core committers, who be able +unarchived mailing list, which includes +all the core committers, who will be able to help assess the impact of issues, figure out a resolution, and help co-ordinate the release of patches to mitigate or fix the problem across all platforms on which Perl is supported. 
Please only use this address for diff --git a/gnu/usr.bin/perl/pod/perl5122delta.pod b/gnu/usr.bin/perl/pod/perl5122delta.pod index 5e7418a9598..f441a385812 100644 --- a/gnu/usr.bin/perl/pod/perl5122delta.pod +++ b/gnu/usr.bin/perl/pod/perl5122delta.pod @@ -2,7 +2,7 @@ =head1 NAME -perldelta - what is new for perl v5.12.2 +perl5122delta - what is new for perl v5.12.2 =head1 DESCRIPTION @@ -58,8 +58,8 @@ overriding C<caller()> incorrectly. =item C<CPANPLUS> A patch to F<cpanp-run-perl> has been backported from CPANPLUS C<0.9004>. This -resolves L<[perl #55964]|http://rt.perl.org/rt3/Ticket/Display.html?id=55964> -and L<[perl #57106]|http://rt.perl.org/rt3/Ticket/Display.html?id=57106>, both +resolves L<RT #55964|http://rt.cpan.org/Public/Bug/Display.html?id=55964> +and L<RT #57106|http://rt.cpan.org/Public/Bug/Display.html?id=57106>, both of which related to failures to install distributions that use C<Module::Install::DSL>. @@ -89,12 +89,12 @@ name is empty; C<abs2rel()> properly handles Unix-style input. =over -=item * +=item * F<perlbug> now always gives the reporter a chance to change the email address it guesses for them. -=item * +=item * F<perlbug> should no longer warn about uninitialized values when using the C<-d> and C<-v> options. @@ -226,7 +226,7 @@ now supports get/set magic and thus tied buffers correctly. =item * -The C<pp_getc>, C<pp_tell>, and C<pp_eof> opcodes now make room on the +The C<pp_getc>, C<pp_tell>, and C<pp_eof> opcodes now make room on the stack for their return values in cases where no argument was passed in. =item * @@ -241,7 +241,7 @@ See L<[perl #75680]|http://rt.perl.org/rt3/Public/Bug/Display.html?id=75680> =head2 AIX -=over +=over =item * @@ -258,7 +258,7 @@ suite. 
When building Perl with the mingw64 x64 cross-compiler C<incpath>, C<libpth>, C<ldflags>, C<lddlflags> and C<ldflags_nolargefiles> values -in F<Config.pm> and F<Config_heavy.pl> were not previously not being set +in F<Config.pm> and F<Config_heavy.pl> were not previously being set correctly because, with that compiler, the include and lib directories are not immediately below C<$(CCHOME)>. @@ -325,7 +325,8 @@ analysed by the Perl porting team. If the bug you are reporting has security implications, which make it inappropriate to send to a publicly archived mailing list, then please send it to perl5-security-report@perl.org. This points to a closed subscription -unarchived mailing list, which includes all the core committers, who be able +unarchived mailing list, which includes +all the core committers, who will be able to help assess the impact of issues, figure out a resolution, and help co-ordinate the release of patches to mitigate or fix the problem across all platforms on which Perl is supported. Please only use this address for diff --git a/gnu/usr.bin/perl/pod/perl5123delta.pod b/gnu/usr.bin/perl/pod/perl5123delta.pod new file mode 100644 index 00000000000..580af240169 --- /dev/null +++ b/gnu/usr.bin/perl/pod/perl5123delta.pod @@ -0,0 +1,118 @@ +=encoding utf8 + +=head1 NAME + +perl5123delta - what is new for perl v5.12.3 + +=head1 DESCRIPTION + +This document describes differences between the 5.12.2 release and +the 5.12.3 release. + +If you are upgrading from an earlier release such as 5.12.1, first read +L<perl5122delta>, which describes differences between 5.12.1 and +5.12.2. The major changes made in 5.12.0 are described in L<perl5120delta>. + +=head1 Incompatible Changes + + There are no changes intentionally incompatible with 5.12.2. If any + exist, they are bugs and reports are welcome. 
+ +=head1 Core Enhancements + +=head2 C<keys>, C<values> work on arrays + +You can now use the C<keys>, C<values>, C<each> builtin functions on arrays +(previously you could only use them on hashes). See L<perlfunc> for details. +This is actually a change introduced in perl 5.12.0, but it was missed from +that release's perldelta. + +=head1 Bug Fixes + +"no VERSION" will now correctly deparse with B::Deparse, as will certain +constant expressions. + +Module::Build should more reliably pass its tests under cygwin. + +Lvalue subroutines are again able to return copy-on-write scalars. This +had been broken since version 5.10.0. + +=head1 Platform Specific Notes + +=over 4 + +=item Solaris + +A separate DTrace is now built for miniperl, which means that perl can be +compiled with -Dusedtrace on Solaris again. + +=item VMS + +A number of regressions on VMS have been fixed. In addition to minor cleanup +of questionable expressions in F<vms.c>, file permissions should no longer be +garbled by the PerlIO layer, and spurious record boundaries should no longer be +introduced by the PerlIO layer during output. + +For more details and discussion on the latter, see: + + http://www.nntp.perl.org/group/perl.vmsperl/2010/11/msg15419.html + +=item VOS + +A few very small changes were made to the build process on VOS to better +support the platform. Longer-than-32-character filenames are now supported on +OpenVOS, and build properly without IPv6 support. + +=back + +=head1 Acknowledgements + +Perl 5.12.3 represents approximately four months of development since +Perl 5.12.2 and contains approximately 2500 lines of changes across +54 files from 16 authors. + +Perl continues to flourish into its third decade thanks to a vibrant +community of users and developers. The following people are known to +have contributed the improvements that became Perl 5.12.3: + +Craig A.
Berry, David Golden, David Leadbeater, Father Chrysostomos, Florian +Ragwitz, Jesse Vincent, Karl Williamson, Nick Johnston, Nicolas Kaiser, Paul +Green, Rafael Garcia-Suarez, Rainer Tammer, Ricardo Signes, Steffen Mueller, +Zsbán Ambrus, Ævar Arnfjörð Bjarmason + +=head1 Reporting Bugs + +If you find what you think is a bug, you might check the articles +recently posted to the comp.lang.perl.misc newsgroup and the perl +bug database at http://rt.perl.org/perlbug/ . There may also be +information at http://www.perl.org/ , the Perl Home Page. + +If you believe you have an unreported bug, please run the B<perlbug> +program included with your release. Be sure to trim your bug down +to a tiny but sufficient test case. Your bug report, along with the +output of C<perl -V>, will be sent off to perlbug@perl.org to be +analysed by the Perl porting team. + +If the bug you are reporting has security implications, which make it +inappropriate to send to a publicly archived mailing list, then please send +it to perl5-security-report@perl.org. This points to a closed subscription +unarchived mailing list, which includes +all the core committers, who will be able +to help assess the impact of issues, figure out a resolution, and help +co-ordinate the release of patches to mitigate or fix the problem across all +platforms on which Perl is supported. Please only use this address for +security issues in the Perl core, not for modules independently +distributed on CPAN. + +=head1 SEE ALSO + +The F<Changes> file for an explanation of how to view exhaustive details +on what changed. + +The F<INSTALL> file for how to build Perl. + +The F<README> file for general stuff. + +The F<Artistic> and F<Copying> files for copyright information. 
+ +=cut diff --git a/gnu/usr.bin/perl/pod/perl5124delta.pod b/gnu/usr.bin/perl/pod/perl5124delta.pod new file mode 100644 index 00000000000..bd3a1db43be --- /dev/null +++ b/gnu/usr.bin/perl/pod/perl5124delta.pod @@ -0,0 +1,108 @@ +=encoding utf8 + +=head1 NAME + +perl5124delta - what is new for perl v5.12.4 + +=head1 DESCRIPTION + +This document describes differences between the 5.12.3 release and +the 5.12.4 release. + +If you are upgrading from an earlier release such as 5.12.2, first read +L<perl5123delta>, which describes differences between 5.12.2 +and 5.12.3. The major changes made in 5.12.0 are described in L<perl5120delta>. + +=head1 Incompatible Changes + +There are no changes intentionally incompatible with 5.12.3. If any +exist, they are bugs and reports are welcome. + +=head1 Selected Bug Fixes + +When strict "refs" mode is off, C<%{...}> in rvalue context returns +C<undef> if its argument is undefined. An optimisation introduced in Perl +5.12.0 to make C<keys %{...}> faster when used as a boolean did not take +this into account, causing C<keys %{+undef}> (and C<keys %$foo> when +C<$foo> is undefined) to be an error, which it should be so in strict +mode only [perl #81750]. + +C<lc>, C<uc>, C<lcfirst>, and C<ucfirst> no longer return untainted strings +when the argument is tainted. This has been broken since perl 5.8.9 +[perl #87336]. + +Fixed a case where it was possible that a freed buffer may have been read +from when parsing a here document. + +=head1 Modules and Pragmata + +L<Module::CoreList> has been upgraded from version 2.43 to 2.50. + +=head1 Testing + +The F<cpan/CGI/t/http.t> test script has been fixed to work when the +environment has HTTPS_* environment variables, such as HTTPS_PROXY. + +=head1 Documentation + +Updated the documentation for rand() in L<perlfunc> to note that it is not +cryptographically secure. + +=head1 Platform Specific Notes + +=over 4 + +=item Linux + +Support Ubuntu 11.04's new multi-arch library layout. 
+ +=back + +=head1 Acknowledgements + +Perl 5.12.4 represents approximately 5 months of development since +Perl 5.12.3 and contains approximately 200 lines of changes across +11 files from 8 authors. + +Perl continues to flourish into its third decade thanks to a vibrant +community of users and developers. The following people are known to +have contributed the improvements that became Perl 5.12.4: + +Andy Dougherty, David Golden, David Leadbeater, Father Chrysostomos, +Florian Ragwitz, Jesse Vincent, Leon Brocard, Zsbán Ambrus. + +=head1 Reporting Bugs + +If you find what you think is a bug, you might check the articles +recently posted to the comp.lang.perl.misc newsgroup and the perl +bug database at http://rt.perl.org/perlbug/ . There may also be +information at http://www.perl.org/ , the Perl Home Page. + +If you believe you have an unreported bug, please run the B<perlbug> +program included with your release. Be sure to trim your bug down +to a tiny but sufficient test case. Your bug report, along with the +output of C<perl -V>, will be sent off to perlbug@perl.org to be +analysed by the Perl porting team. + +If the bug you are reporting has security implications, which make it +inappropriate to send to a publicly archived mailing list, then please send +it to perl5-security-report@perl.org. This points to a closed subscription +unarchived mailing list, which includes all the core committers, who will be able +to help assess the impact of issues, figure out a resolution, and help +co-ordinate the release of patches to mitigate or fix the problem across all +platforms on which Perl is supported. Please only use this address for +security issues in the Perl core, not for modules independently +distributed on CPAN. + +=head1 SEE ALSO + +The F<Changes> file for an explanation of how to view exhaustive details +on what changed. + +The F<INSTALL> file for how to build Perl. + +The F<README> file for general stuff.
+ +The F<Artistic> and F<Copying> files for copyright information. + +=cut diff --git a/gnu/usr.bin/perl/pod/perl5140delta.pod b/gnu/usr.bin/perl/pod/perl5140delta.pod new file mode 100644 index 00000000000..74c82a8e141 --- /dev/null +++ b/gnu/usr.bin/perl/pod/perl5140delta.pod @@ -0,0 +1,4590 @@ +=encoding utf8 + +=head1 NAME + +perl5140delta - what is new for perl v5.14.0 + +=head1 DESCRIPTION + +This document describes differences between the 5.12.0 release and +the 5.14.0 release. + +If you are upgrading from an earlier release such as 5.10.0, first read +L<perl5120delta>, which describes differences between 5.10.0 and +5.12.0. + +Some of the bug fixes in this release have been backported to subsequent +releases of 5.12.x. Those are indicated with the 5.12.x version in +parentheses. + +=head1 Notice + +As described in L<perlpolicy>, the release of Perl 5.14.0 marks the +official end of support for Perl 5.10. Users of Perl 5.10 or earlier +should consider upgrading to a more recent release of Perl. + +=head1 Core Enhancements + +=head2 Unicode + +=head3 Unicode Version 6.0 is now supported (mostly) + +Perl comes with the Unicode 6.0 data base updated with +L<Corrigendum #8|http://www.unicode.org/versions/corrigendum8.html>, +with one exception noted below. +See L<http://unicode.org/versions/Unicode6.0.0/> for details on the new +release. Perl does not support any Unicode provisional properties, +including the new ones for this release. + +Unicode 6.0 has chosen to use the name C<BELL> for the character at U+1F514, +which is a symbol that looks like a bell, and is used in Japanese cell +phones. This conflicts with the long-standing Perl usage of having +C<BELL> mean the ASCII C<BEL> character, U+0007. In Perl 5.14, +C<\N{BELL}> continues to mean U+0007, but its use generates a +deprecation warning message unless such warnings are turned off. 
The +new name for U+0007 in Perl is C<ALERT>, which corresponds nicely +with the existing shorthand sequence for it, C<"\a">. C<\N{BEL}> +means U+0007, with no warning given. The character at U+1F514 has no +name in 5.14, but can be referred to by C<\N{U+1F514}>. +In Perl 5.16, C<\N{BELL}> will refer to U+1F514; all code +that uses C<\N{BELL}> should be converted to use C<\N{ALERT}>, +C<\N{BEL}>, or C<"\a"> before upgrading. + +=head3 Full functionality for C<use feature 'unicode_strings'> + +This release provides full functionality for C<use feature +'unicode_strings'>. Under its scope, all string operations executed and +regular expressions compiled (even if executed outside its scope) have +Unicode semantics. See L<feature/"the 'unicode_strings' feature">. +However, see L</Inverted bracketed character classes and multi-character folds>, +below. + +This feature avoids most forms of the "Unicode Bug" (see +L<perlunicode/The "Unicode Bug"> for details). If there is any +possibility that your code will process Unicode strings, you are +I<strongly> encouraged to use this subpragma to avoid nasty surprises. + +=head3 C<\N{I<NAME>}> and C<charnames> enhancements + +=over + +=item * + +C<\N{I<NAME>}> and C<charnames::vianame> now know about the abbreviated +character names listed by Unicode, such as NBSP, SHY, LRO, ZWJ, etc.; all +customary abbreviations for the C0 and C1 control characters (such as +ACK, BEL, CAN, etc.); and a few new variants of some C1 full names that +are in common usage. + +=item * + +Unicode has several I<named character sequences>, in which particular sequences +of code points are given names. C<\N{I<NAME>}> now recognizes these. + +=item * + +C<\N{I<NAME>}>, C<charnames::vianame>, and C<charnames::viacode> +now know about every character in Unicode. In earlier releases of +Perl, they didn't know about the Hangul syllables nor several +CJK (Chinese/Japanese/Korean) characters. 
+ +=item * + +It is now possible to override Perl's abbreviations with your own custom aliases. + +=item * + +You can now create a custom alias of the ordinal of a +character, known by C<\N{I<NAME>}>, C<charnames::vianame()>, and +C<charnames::viacode()>. Previously, aliases had to be to official +Unicode character names. This made it impossible to create an alias for +unnamed code points, such as those reserved for private +use. + +=item * + +The new function charnames::string_vianame() is a run-time version +of C<\N{I<NAME>}>, returning the string of characters whose Unicode +name is its parameter. It can handle Unicode named character +sequences, whereas the pre-existing charnames::vianame() cannot, +as the latter returns a single code point. + +=back + +See L<charnames> for details on all these changes. + +=head3 New warnings categories for problematic (non-)Unicode code points. + +Three new warnings subcategories of "utf8" have been added. These +allow you to turn off some "utf8" warnings, while allowing +other warnings to remain on. The three categories are: +C<surrogate> when UTF-16 surrogates are encountered; +C<nonchar> when Unicode non-character code points are encountered; +and C<non_unicode> when code points above the legal Unicode +maximum of 0x10FFFF are encountered. + +=head3 Any unsigned value can be encoded as a character + +With this release, Perl is adopting a model that any unsigned value +can be treated as a code point and encoded internally (as utf8) +without warnings, not just the code points that are legal in Unicode. +However, unless utf8 or the corresponding sub-category (see previous +item) of lexical warnings have been explicitly turned off, outputting +or executing a Unicode-defined operation such as upper-casing +on such a code point generates a warning. Attempting to input these +using strict rules (such as with the C<:encoding(UTF-8)> layer) +will continue to fail.
Prior to this release, handling was +inconsistent and in places, incorrect. + +Unicode non-characters, some of which previously were erroneously +considered illegal in places by Perl, contrary to the Unicode Standard, +are now always legal internally. Inputting or outputting them +works the same as with the non-legal Unicode code points, because the Unicode +Standard says they are (only) illegal for "open interchange". + +=head3 Unicode database files not installed + +The Unicode database files are no longer installed with Perl. This +doesn't affect any functionality in Perl and saves significant disk +space. If you need these files, you can download them from +L<http://www.unicode.org/Public/zipped/6.0.0/>. + +=head2 Regular Expressions + +=head3 C<(?^...)> construct signifies default modifiers + +An ASCII caret C<"^"> immediately following a C<"(?"> in a regular +expression now means that the subexpression does not inherit surrounding +modifiers such as C</i>, but reverts to the Perl defaults. Any modifiers +following the caret override the defaults. + +Stringification of regular expressions now uses this notation. +For example, C<qr/hlagh/i> would previously be stringified as +C<(?i-xsm:hlagh)>, but now it's stringified as C<(?^i:hlagh)>. + +The main purpose of this change is to allow tests that rely on the +stringification I<not> to have to change whenever new modifiers are added. +See L<perlre/Extended Patterns>. + +This change is likely to break code that compares stringified regular +expressions with fixed strings containing C<?-xism>. + +=head3 C</d>, C</l>, C</u>, and C</a> modifiers + +Four new regular expression modifiers have been added. These are mutually +exclusive: one only can be turned on at a time. + +=over + +=item * + +The C</l> modifier says to compile the regular expression as if it were +in the scope of C<use locale>, even if it is not. 
+ +=item * + +The C</u> modifier says to compile the regular expression as if it were +in the scope of a C<use feature 'unicode_strings'> pragma. + +=item * + +The C</d> (default) modifier is used to override any C<use locale> and +C<use feature 'unicode_strings'> pragmas in effect at the time +of compiling the regular expression. + +=item * + +The C</a> regular expression modifier restricts C<\s>, C<\d> and C<\w> and +the POSIX (C<[[:posix:]]>) character classes to the ASCII range. Their +complements and C<\b> and C<\B> are correspondingly +affected. Otherwise, C</a> behaves like the C</u> modifier, in that +case-insensitive matching uses Unicode semantics. + +If the C</a> modifier is repeated, then additionally in case-insensitive +matching, no ASCII character can match a non-ASCII character. +For example, + + "k" =~ /\N{KELVIN SIGN}/ai + "\xDF" =~ /ss/ai + +match but + + "k" =~ /\N{KELVIN SIGN}/aai + "\xDF" =~ /ss/aai + +do not match. + +=back + +See L<perlre/Modifiers> for more detail. + +=head3 Non-destructive substitution + +The substitution (C<s///>) and transliteration +(C<y///>) operators now support an C</r> option that +copies the input variable, carries out the substitution on +the copy, and returns the result. The original remains unmodified. + + my $old = "cat"; + my $new = $old =~ s/cat/dog/r; + # $old is "cat" and $new is "dog" + +This is particularly useful with C<map>. See L<perlop> for more examples. + +=head3 Re-entrant regular expression engine + +It is now safe to use regular expressions within C<(?{...})> and +C<(??{...})> code blocks inside regular expressions. + +These blocks are still experimental, however, and still have problems with +lexical (C<my>) variables and abnormal exiting. + +=head3 C<use re '/flags'> + +The C<re> pragma now has the ability to turn on regular expression flags +till the end of the lexical scope: + + use re "/x"; + "foo" =~ / (.+) /; # /x implied + +See L<re/"'/flags' mode"> for details. 
+ +=head3 \o{...} for octals + +There is a new octal escape sequence, C<"\o">, in doublequote-like +contexts. This construct allows large octal ordinals beyond the +current max of 0777 to be represented. It also allows you to specify a +character in octal which can safely be concatenated with other regex +snippets and which won't be confused with being a backreference to +a regex capture group. See L<perlre/Capture groups>. + +=head3 Add C<\p{Titlecase}> as a synonym for C<\p{Title}> + +This synonym is added for symmetry with the Unicode property names +C<\p{Uppercase}> and C<\p{Lowercase}>. + +=head3 Regular expression debugging output improvement + +Regular expression debugging output (turned on by C<use re 'debug'>) now +uses hexadecimal when escaping non-ASCII characters, instead of octal. + +=head3 Return value of C<delete $+{...}> + +Custom regular expression engines can now determine the return value of +C<delete> on an entry of C<%+> or C<%->. + +=head2 Syntactical Enhancements + +=head3 Array and hash container functions accept references + +B<Warning:> This feature is considered experimental, as the exact behaviour +may change in a future version of Perl. 
+ +All builtin functions that operate directly on array or hash +containers now also accept unblessed hard references to arrays +or hashes: + + |----------------------------+---------------------------| + | Traditional syntax | Terse syntax | + |----------------------------+---------------------------| + | push @$arrayref, @stuff | push $arrayref, @stuff | + | unshift @$arrayref, @stuff | unshift $arrayref, @stuff | + | pop @$arrayref | pop $arrayref | + | shift @$arrayref | shift $arrayref | + | splice @$arrayref, 0, 2 | splice $arrayref, 0, 2 | + | keys %$hashref | keys $hashref | + | keys @$arrayref | keys $arrayref | + | values %$hashref | values $hashref | + | values @$arrayref | values $arrayref | + | ($k,$v) = each %$hashref | ($k,$v) = each $hashref | + | ($k,$v) = each @$arrayref | ($k,$v) = each $arrayref | + |----------------------------+---------------------------| + +This allows these builtin functions to act on long dereferencing chains +or on the return value of subroutines without needing to wrap them in +C<@{}> or C<%{}>: + + push @{$obj->tags}, $new_tag; # old way + push $obj->tags, $new_tag; # new way + + for ( keys %{$hoh->{genres}{artists}} ) {...} # old way + for ( keys $hoh->{genres}{artists} ) {...} # new way + +=head3 Single term prototype + +The C<+> prototype is a special alternative to C<$> that acts like +C<\[@%]> when given a literal array or hash variable, but will otherwise +force scalar context on the argument. See L<perlsub/Prototypes>. + +=head3 C<package> block syntax + +A package declaration can now contain a code block, in which case the +declaration is in scope inside that block only. So C<package Foo { ... }> +is precisely equivalent to C<{ package Foo; ... }>. It also works with +a version number in the declaration, as in C<package Foo 1.2 { ... }>, +which is its most attractive feature. See L<perlfunc>. 
+ +=head3 Statement labels can appear in more places + +Statement labels can now occur before any type of statement or declaration, +such as C<package>. + +=head3 Stacked labels + +Multiple statement labels can now appear before a single statement. + +=head3 Uppercase X/B allowed in hexadecimal/binary literals + +Literals may now use either upper case C<0X...> or C<0B...> prefixes, +in addition to the already supported C<0x...> and C<0b...> +syntax [perl #76296]. + +C, Ruby, Python, and PHP already support this syntax, and it makes +Perl more internally consistent: a round-trip with C<eval sprintf +"%#X", 0x10> now returns C<16>, just like C<eval sprintf "%#x", 0x10>. + +=head3 Overridable tie functions + +C<tie>, C<tied> and C<untie> can now be overridden [perl #75902]. + +=head2 Exception Handling + +To make them more reliable and consistent, several changes have been made +to how C<die>, C<warn>, and C<$@> behave. + +=over + +=item * + +When an exception is thrown inside an C<eval>, the exception is no +longer at risk of being clobbered by destructor code running during unwinding. +Previously, the exception was written into C<$@> +early in the throwing process, and would be overwritten if C<eval> was +used internally in the destructor for an object that had to be freed +while exiting from the outer C<eval>. Now the exception is written +into C<$@> last thing before exiting the outer C<eval>, so the code +running immediately thereafter can rely on the value in C<$@> correctly +corresponding to that C<eval>. (C<$@> is still also set before exiting the +C<eval>, for the sake of destructors that rely on this.) + +Likewise, a C<local $@> inside an C<eval> no longer clobbers any +exception thrown in its scope. Previously, the restoration of C<$@> upon +unwinding would overwrite any exception being thrown. Now the exception +gets to the C<eval> anyway. So C<local $@> is safe before a C<die>. 
+ +Exceptions thrown from object destructors no longer modify the C<$@> +of the surrounding context. (If the surrounding context was exception +unwinding, this used to be another way to clobber the exception being +thrown.) Previously such an exception was +sometimes emitted as a warning, and then either was +string-appended to the surrounding C<$@> or completely replaced the +surrounding C<$@>, depending on whether that exception and the surrounding +C<$@> were strings or objects. Now, an exception in this situation is +always emitted as a warning, leaving the surrounding C<$@> untouched. +In addition to object destructors, this also affects any function call +run by XS code using the C<G_KEEPERR> flag. + +=item * + +Warnings for C<warn> can now be objects in the same way as exceptions +for C<die>. If an object-based warning gets the default handling +of writing to standard error, it is stringified as before with the +filename and line number appended. But a C<$SIG{__WARN__}> handler now +receives an object-based warning as an object, where previously it +was passed the result of stringifying the object. + +=back + +=head2 Other Enhancements + +=head3 Assignment to C<$0> sets the legacy process name with prctl() on Linux + +On Linux the legacy process name is now set with L<prctl(2)>, in +addition to altering the POSIX name via C<argv[0]>, as Perl has done +since version 4.000. Now system utilities that read the legacy process +name such as I<ps>, I<top>, and I<killall> recognize the name you set when +assigning to C<$0>. The string you supply is truncated at 16 bytes; +this limitation is imposed by Linux. + +=head3 srand() now returns the seed + +This allows programs that need to have repeatable results not to have to come +up with their own seed-generating mechanism. Instead, they can use srand() +and stash the return value for future use. One example is a test program with +too many combinations to test comprehensively in the time available for +each run. 
It can test a random subset each time and, should there be a failure, +log the seed used for that run so this can later be used to produce the same results. + +=head3 printf-like functions understand post-1980 size modifiers + +Perl's printf and sprintf operators, and Perl's internal printf replacement +function, now understand the C90 size modifiers "hh" (C<char>), "z" +(C<size_t>), and "t" (C<ptrdiff_t>). Also, when compiled with a C99 +compiler, Perl now understands the size modifier "j" (C<intmax_t>) +(but this is not portable). + +So, for example, on any modern machine, C<sprintf("%hhd", 257)> returns "1". + +=head3 New global variable C<${^GLOBAL_PHASE}> + +A new global variable, C<${^GLOBAL_PHASE}>, has been added to allow +introspection of the current phase of the Perl interpreter. It's explained in +detail in L<perlvar/"${^GLOBAL_PHASE}"> and in +L<perlmod/"BEGIN, UNITCHECK, CHECK, INIT and END">. + +=head3 C<-d:-foo> calls C<Devel::foo::unimport> + +The syntax B<-d:foo> was extended in 5.6.1 to make B<-d:foo=bar> +equivalent to B<-MDevel::foo=bar>, which expands +internally to C<use Devel::foo 'bar'>. +Perl now allows prefixing the module name with B<->, with the same +semantics as B<-M>; that is: + +=over 4 + +=item C<-d:-foo> + +Equivalent to B<-M-Devel::foo>: expands to +C<no Devel::foo> and calls C<< Devel::foo->unimport() >> +if that method exists. + +=item C<-d:-foo=bar> + +Equivalent to B<-M-Devel::foo=bar>: expands to C<no Devel::foo 'bar'>, +and calls C<< Devel::foo->unimport("bar") >> if that method exists. + +=back + +This is particularly useful for suppressing the default actions of a +C<Devel::*> module's C<import> method whilst still loading it for debugging. 
+ +=head3 Filehandle method calls load L<IO::File> on demand + +When a method call on a filehandle would die because the method cannot +be resolved and L<IO::File> has not been loaded, Perl now loads L<IO::File> +via C<require> and attempts method resolution again: + + open my $fh, ">", $file; + $fh->binmode(":raw"); # loads IO::File and succeeds + +This also works for globs like C<STDOUT>, C<STDERR>, and C<STDIN>: + + STDOUT->autoflush(1); + +Because this on-demand load happens only if method resolution fails, the +legacy approach of manually loading an L<IO::File> parent class for partial +method support still works as expected: + + use IO::Handle; + open my $fh, ">", $file; + $fh->autoflush(1); # IO::File not loaded + +=head3 Improved IPv6 support + +The C<Socket> module provides new affordances for IPv6, +including implementations of the C<Socket::getaddrinfo()> and +C<Socket::getnameinfo()> functions, along with related constants and a +handful of new functions. See L<Socket>. + +=head3 DTrace probes now include package name + +The C<DTrace> probes now include an additional argument, C<arg3>, which contains +the package the subroutine being entered or left was compiled in. + +For example, using the following DTrace script: + + perl$target:::sub-entry + { + printf("%s::%s\n", copyinstr(arg0), copyinstr(arg3)); + } + +and then running: + + $ perl -e 'sub test { }; test' + +C<DTrace> will print: + + main::test + +=head2 New C APIs + +See L</Internal Changes>. + +=head1 Security + +=head2 User-defined regular expression properties + +L<perlunicode/"User-Defined Character Properties"> documented that you can +create custom properties by defining subroutines whose names begin with +"In" or "Is". However, Perl did not actually enforce that naming +restriction, so C<\p{foo::bar}> could call foo::bar() if it existed. The documented +convention is now enforced. + +Also, Perl no longer allows tainted regular expressions to invoke a +user-defined property. 
It simply dies instead [perl #82616]. + +=head1 Incompatible Changes + +Perl 5.14.0 is not binary-compatible with any previous stable release. + +In addition to the sections that follow, see L</C API Changes>. + +=head2 Regular Expressions and String Escapes + +=head3 Inverted bracketed character classes and multi-character folds + +Some characters match a sequence of two or three characters in C</i> +regular expression matching under Unicode rules. One example is +C<LATIN SMALL LETTER SHARP S> which matches the sequence C<ss>. + + 'ss' =~ /\A[\N{LATIN SMALL LETTER SHARP S}]\z/i # Matches + +This, however, can lead to very counter-intuitive results, especially +when inverted. Because of this, Perl 5.14 does not use multi-character C</i> +matching in inverted character classes. + + 'ss' =~ /\A[^\N{LATIN SMALL LETTER SHARP S}]+\z/i # ??? + +This should match any sequences of characters that aren't the C<SHARP S> +nor what C<SHARP S> matches under C</i>. C<"s"> isn't C<SHARP S>, but +Unicode says that C<"ss"> is what C<SHARP S> matches under C</i>. So +which one "wins"? Do you fail the match because the string has C<ss> or +accept it because it has an C<s> followed by another C<s>? + +Earlier releases of Perl did allow this multi-character matching, +but due to bugs, it mostly did not work. + +=head3 \400-\777 + +In certain circumstances, C<\400>-C<\777> in regexes have behaved +differently than they behave in all other doublequote-like contexts. +Since 5.10.1, Perl has issued a deprecation warning when this happens. +Now, these literals behave the same in all doublequote-like contexts, +namely to be equivalent to C<\x{100}>-C<\x{1FF}>, with no deprecation +warning. + +Use of C<\400>-C<\777> in the command-line option B<-0> retain their +conventional meaning. They slurp whole input files; previously, this +was documented only for B<-0777>. + +Because of various ambiguities, you should use the new +C<\o{...}> construct to represent characters in octal instead. 
+ +=head3 Most C<\p{}> properties are now immune to case-insensitive matching + +For most Unicode properties, it doesn't make sense to have them match +differently under C</i> case-insensitive matching. Doing so can lead +to unexpected results and potential security holes. For example + + m/\p{ASCII_Hex_Digit}+/i + +could previously match non-ASCII characters because of the Unicode +matching rules (although there were several bugs with this). Now +matching under C</i> gives the same results as non-C</i> matching except +for those few properties where people have come to expect differences, +namely the ones where casing is an integral part of their meaning, such +as C<m/\p{Uppercase}/i> and C<m/\p{Lowercase}/i>, both of which match +the same code points as matched by C<m/\p{Cased}/i>. +Details are in L<perlrecharclass/Unicode Properties>. + +User-defined property handlers that need to match differently under C</i> +must be changed to read the new boolean parameter passed to them, which +is non-zero if case-insensitive matching is in effect and 0 otherwise. +See L<perlunicode/User-Defined Character Properties>. + +=head3 \p{} implies Unicode semantics + +Specifying a Unicode property in the pattern indicates +that the pattern is meant for matching according to Unicode rules, the way +C<\N{I<NAME>}> does. + +=head3 Regular expressions retain their localeness when interpolated + +Regular expressions compiled under C<use locale> now retain this when +interpolated into a new regular expression compiled outside a +C<use locale>, and vice-versa. + +Previously, one regular expression interpolated into another inherited +the localeness of the surrounding regex, losing whatever state it +originally had. This is considered a bug fix, but may trip up code that +has come to rely on the incorrect behaviour. + +=head3 Stringification of regexes has changed + +Default regular expression modifiers are now notated using +C<(?^...)>. Code relying on the old stringification will fail. 
+This is so that when new modifiers are added, such code won't +have to keep changing each time this happens, because the stringification +will automatically incorporate the new modifiers. + +Code that needs to work properly with both old- and new-style regexes +can avoid the whole issue by using (for perls since 5.9.5; see L<re>): + + use re qw(regexp_pattern); + my ($pat, $mods) = regexp_pattern($re_ref); + +If the actual stringification is important or older Perls need to be +supported, you can use something like the following: + + # Accept both old and new-style stringification + my $modifiers = (qr/foobar/ =~ /\Q(?^/) ? "^" : "-xism"; + +And then use C<$modifiers> instead of C<-xism>. + +=head3 Run-time code blocks in regular expressions inherit pragmata + +Code blocks in regular expressions (C<(?{...})> and C<(??{...})>) previously +did not inherit pragmata (strict, warnings, etc.) if the regular expression +was compiled at run time as happens in cases like these two: + + use re "eval"; + $foo =~ $bar; # when $bar contains (?{...}) + $foo =~ /$bar(?{ $finished = 1 })/; + +This bug has now been fixed, but code that relied on the buggy behaviour +may need to be fixed to account for the correct behaviour. + +=head2 Stashes and Package Variables + +=head3 Localised tied hashes and arrays are no longer tied + +In the following: + + tie @a, ...; + { + local @a; + # here, @a is now a new, untied array + } + # here, @a refers again to the old, tied array + +Earlier versions of Perl incorrectly tied the new local array. This has +now been fixed. This fix could however potentially cause a change in +behaviour of some code. + +=head3 Stashes are now always defined + +C<defined %Foo::> now always returns true, even when no symbols have yet been +defined in that package. + +This is a side-effect of removing a special-case kludge in the tokeniser, +added for 5.10.0, to hide side-effects of changes to the internal storage of +hashes.
The fix drastically reduces hashes' memory overhead. + +Calling defined on a stash has been deprecated since 5.6.0, warned on +lexicals since 5.6.0, and warned for stashes and other package +variables since 5.12.0. C<defined %hash> has always exposed an +implementation detail: emptying a hash by deleting all entries from it does +not make C<defined %hash> false. Hence C<defined %hash> is not valid code to +determine whether an arbitrary hash is empty. Instead, use the behaviour +of an empty C<%hash> always returning false in scalar context. + +=head3 Clearing stashes + +Stash list assignment C<%foo:: = ()> used to make the stash temporarily +anonymous while it was being emptied. Consequently, any of its +subroutines referenced elsewhere would become anonymous, showing up as +"(unknown)" in C<caller>. They now retain their package names such that +C<caller> returns the original sub name if there is still a reference +to its typeglob and "foo::__ANON__" otherwise [perl #79208]. + +=head3 Dereferencing typeglobs + +If you assign a typeglob to a scalar variable: + + $glob = *foo; + +the glob that is copied to C<$glob> is marked with a special flag +indicating that the glob is just a copy. This allows subsequent +assignments to C<$glob> to overwrite the glob. The original glob, +however, is immutable. + +Some Perl operators did not distinguish between these two types of globs. +This would result in strange behaviour in edge cases: C<untie $scalar> +would not untie the scalar if the last thing assigned to it was a glob +(because it treated it as C<untie *$scalar>, which unties a handle). +Assignment to a glob slot (such as C<*$glob = \@some_array>) would simply +assign C<\@some_array> to C<$glob>. + +To fix this, the C<*{}> operator (including its C<*foo> and C<*$foo> forms) +has been modified to make a new immutable glob if its operand is a glob +copy. 
This allows operators that make a distinction between globs and +scalars to be modified to treat only immutable globs as globs. (C<tie>, +C<tied> and C<untie> have been left as they are for compatibility's sake, +but will warn. See L</Deprecations>.) + +This causes an incompatible change in code that assigns a glob to the +return value of C<*{}> when that operator was passed a glob copy. Take the +following code, for instance: + + $glob = *foo; + *$glob = *bar; + +The C<*$glob> on the second line returns a new immutable glob. That new +glob is made an alias to C<*bar>. Then it is discarded. So the second +assignment has no effect. + +See L<http://rt.perl.org/rt3/Public/Bug/Display.html?id=77810> for +more detail. + +=head3 Magic variables outside the main package + +In previous versions of Perl, magic variables like C<$!>, C<%SIG>, etc. would +"leak" into other packages. So C<%foo::SIG> could be used to access signals, +C<${"foo::!"}> (with strict mode off) to access C's C<errno>, etc. + +This was a bug, or an "unintentional" feature, which caused various ill effects, +such as signal handlers being wiped when modules were loaded, etc. + +This has been fixed (or the feature has been removed, depending on how you see +it). + +=head3 local($_) strips all magic from $_ + +local() on scalar variables gives them a new value but keeps all +their magic intact. This has proven problematic for the default +scalar variable $_, where L<perlsub> recommends that any subroutine +that assigns to $_ should first localize it. This would throw an +exception if $_ is aliased to a read-only variable, and could in general have +various unintentional side-effects. + +Therefore, as an exception to the general rule, local($_) will not +only assign a new value to $_, but also remove all existing magic from +it as well. 
+ +=head3 Parsing of package and variable names + +Parsing the names of packages and package variables has changed: +multiple adjacent pairs of colons, as in C<foo::::bar>, are now all +treated as package separators. + +Regardless of this change, the exact parsing of package separators has +never been guaranteed and is subject to change in future Perl versions. + +=head2 Changes to Syntax or to Perl Operators + +=head3 C<given> return values + +C<given> blocks now return the last evaluated +expression, or an empty list if the block was exited by C<break>. Thus you +can now write: + + my $type = do { + given ($num) { + break when undef; + "integer" when /^[+-]?[0-9]+$/; + "float" when /^[+-]?[0-9]+(?:\.[0-9]+)?$/; + "unknown"; + } + }; + +See L<perlsyn/Return value> for details. + +=head3 Change in parsing of certain prototypes + +Functions declared with the following prototypes now behave correctly as unary +functions: + + * + \$ \% \@ \* \& + \[...] + ;$ ;* + ;\$ ;\% etc. + ;\[...] + +Due to this bug fix [perl #75904], functions +using the C<(*)>, C<(;$)> and C<(;*)> prototypes +are parsed with higher precedence than before. So +in the following example: + + sub foo(;$); + foo $a < $b; + +the second line is now parsed correctly as C<< foo($a) < $b >>, rather than +C<< foo($a < $b) >>. This happens when one of these operators is used in +an unparenthesised argument: + + < > <= >= lt gt le ge + == != <=> eq ne cmp ~~ + & + | ^ + && + || // + .. ... + ?: + = += -= *= etc. + , => + +=head3 Smart-matching against array slices + +Previously, the following code resulted in a successful match: + + my @a = qw(a y0 z); + my @b = qw(a x0 z); + @a[0 .. $#b] ~~ @b; + +This odd behaviour has now been fixed [perl #77468]. + +=head3 Negation treats strings differently from before + +The unary negation operator, C<->, now treats strings that look like numbers +as numbers [perl #57706]. 
+ +=head3 Negative zero + +Negative zero (-0.0), when converted to a string, now becomes "0" on all +platforms. It used to become "-0" on some, but "0" on others. + +If you still need to determine whether a zero is negative, use +C<sprintf("%g", $zero) =~ /^-/> or the L<Data::Float> module on CPAN. + +=head3 C<:=> is now a syntax error + +Previously C<my $pi := 4> was exactly equivalent to C<my $pi : = 4>, +with the C<:> being treated as the start of an attribute list, ending before +the C<=>. The use of C<:=> to mean C<: => was deprecated in 5.12.0, and is +now a syntax error. This allows future use of C<:=> as a new token. + +Outside the core's tests for it, we find no Perl 5 code on CPAN +using this construction, so we believe that this change will have +little impact on real-world codebases. + +If it is absolutely necessary to have empty attribute lists (for example, +because of a code generator), simply avoid the error by adding a space before +the C<=>. + +=head3 Change in the parsing of identifiers + +Characters outside the Unicode "XIDStart" set are no longer allowed at the +beginning of an identifier. This means that certain accents and marks +that normally follow an alphabetic character may no longer be the first +character of an identifier. + +=head2 Threads and Processes + +=head3 Directory handles not copied to threads + +On systems other than Windows that do not have +a C<fchdir> function, newly-created threads no +longer inherit directory handles from their parent threads. Such programs +would usually have crashed anyway [perl #75154]. + +=head3 C<close> on shared pipes + +To avoid deadlocks, the C<close> function no longer waits for the +child process to exit if the underlying file descriptor is still +in use by another thread. It returns true in such cases. + +=head3 fork() emulation will not wait for signalled children + +On Windows parent processes would not terminate until all forked +children had terminated first. 
However, C<kill("KILL", ...)> is +inherently unstable on pseudo-processes, and C<kill("TERM", ...)> +might not get delivered if the child is blocked in a system call. + +To avoid the deadlock and still provide a safe mechanism to terminate +the hosting process, Perl now no longer waits for children that +have been sent a SIGTERM signal. It is up to the parent process to +waitpid() for these children if child-cleanup processing must be +allowed to finish. However, it is also then the responsibility of the +parent to avoid the deadlock by making sure the child process +can't be blocked on I/O. + +See L<perlfork> for more information about the fork() emulation on +Windows. + +=head2 Configuration + +=head3 Naming fixes in Policy_sh.SH may invalidate Policy.sh + +Several long-standing typos and naming confusions in F<Policy_sh.SH> have +been fixed, standardizing on the variable names used in F<config.sh>. + +This will change the behaviour of F<Policy.sh> if you happen to have been +accidentally relying on its incorrect behaviour. + +=head3 Perl source code is read in text mode on Windows + +Perl scripts used to be read in binary mode on Windows for the benefit +of the L<ByteLoader> module (which is no longer part of core Perl). This +had the side-effect of breaking various operations on the C<DATA> filehandle, +including seek()/tell(), and even simply reading from C<DATA> after filehandles +have been flushed by a call to system(), backticks, fork() etc. + +The default build options for Windows have been changed to read Perl source +code on Windows in text mode now. L<ByteLoader> will (hopefully) be updated on +CPAN to automatically handle this situation [perl #28106]. + +=head1 Deprecations + +See also L</Deprecated C APIs>. + +=head2 Omitting a space between a regular expression and subsequent word + +Omitting the space between a regular expression operator or +its modifiers and the following word is deprecated. 
For +example, C<< m/foo/sand $bar >> is for now still parsed +as C<< m/foo/s and $bar >>, but will now issue a warning. + +=head2 C<\cI<X>> + +The backslash-c construct was designed as a way of specifying +non-printable characters, but there were no restrictions (on ASCII +platforms) on what the character following the C<c> could be. Now, +a deprecation warning is raised if that character isn't an ASCII character. +Also, a deprecation warning is raised for C<"\c{"> (which is the same +as simply saying C<";">). + +=head2 C<"\b{"> and C<"\B{"> + +In regular expressions, a literal C<"{"> immediately following a C<"\b"> +(not in a bracketed character class) or a C<"\B"> is now deprecated +to allow for its future use by Perl itself. + +=head2 Perl 4-era .pl libraries + +Perl bundles a handful of library files that predate Perl 5. +This bundling is now deprecated for most of these files, which are now +available from CPAN. The affected files now warn when run, if they were +installed as part of the core. + +This is a mandatory warning, not obeying B<-X> or lexical warning bits. +The warning is modelled on that supplied by F<deprecate.pm> for +deprecated-in-core F<.pm> libraries. It points to the specific CPAN +distribution that contains the F<.pl> libraries. The CPAN versions, of +course, do not generate the warning. + +=head2 List assignment to C<$[> + +Assignment to C<$[> was deprecated and started to give warnings in +Perl version 5.12.0. This version of Perl (5.14) now also emits a warning +when assigning to C<$[> in list context. This fixes an oversight in 5.12.0. + +=head2 Use of qw(...) as parentheses + +Historically the parser fooled itself into thinking that C<qw(...)> literals +were always enclosed in parentheses, and as a result you could sometimes omit +parentheses around them: + + for $x qw(a b c) { ... } + +The parser no longer lies to itself in this way. Wrap the list literal in +parentheses like this: + + for $x (qw(a b c)) { ...
} + +This is being deprecated because the parentheses in C<for $i (1,2,3) { ... }> +are not part of expression syntax. They are part of the statement +syntax, with the C<for> statement wanting literal parentheses. +The synthetic parentheses that a C<qw> expression acquired were only +intended to be treated as part of expression syntax. + +Note that this does not change the behaviour of cases like: + + use POSIX qw(setlocale localeconv); + our @EXPORT = qw(foo bar baz); + +where parentheses were never required around the expression. + +=head2 C<\N{BELL}> + +This is because Unicode is using that name for a different character. +See L</Unicode Version 6.0 is now supported (mostly)> for more +explanation. + +=head2 C<?PATTERN?> + +C<?PATTERN?> (without the initial C<m>) has been deprecated and now produces +a warning. This is to allow future use of C<?> in new operators. +The match-once functionality is still available as C<m?PATTERN?>. + +=head2 Tie functions on scalars holding typeglobs + +Calling a tie function (C<tie>, C<tied>, C<untie>) with a scalar argument +acts on a filehandle if the scalar happens to hold a typeglob. + +This is a long-standing bug that will be removed in Perl 5.16, as +there is currently no way to tie the scalar itself when it holds +a typeglob, and no way to untie a scalar that has had a typeglob +assigned to it. + +Now there is a deprecation warning whenever a tie +function is used on a handle without an explicit C<*>. + +=head2 User-defined case-mapping + +This feature is being deprecated due to its many issues, as documented in +L<perlunicode/User-Defined Case Mappings (for serious hackers only)>. +This feature will be removed in Perl 5.16. Instead use the CPAN module +L<Unicode::Casing>, which provides improved functionality. + +=head2 Deprecated modules + +The following module will be removed from the core distribution in a +future release, and should be installed from CPAN instead. 
Distributions +on CPAN that require this should add it to their prerequisites. The +core version of this module now issues a deprecation warning. + +If you ship a packaged version of Perl, either alone or as part of a +larger system, then you should carefully consider the repercussions of +core module deprecations. You may want to consider shipping your default +build of Perl with a package for the deprecated module that +installs into C<vendor> or C<site> Perl library directories. This will +inhibit the deprecation warnings. + +Alternatively, you may want to consider patching F<lib/deprecate.pm> +to provide deprecation warnings specific to your packaging system +or distribution of Perl, consistent with how your packaging system +or distribution manages a staged transition from a release where the +installation of a single package provides the given functionality, to +a later release where the system administrator needs to know to install +multiple packages to get that same functionality. + +You can silence these deprecation warnings by installing the module +in question from CPAN. To install the latest version of it by role +rather than by name, just install C<Task::Deprecations::5_14>. + +=over + +=item L<Devel::DProf> + +We strongly recommend that you install and use L<Devel::NYTProf> instead +of L<Devel::DProf>, as L<Devel::NYTProf> offers significantly +improved profiling and reporting. + +=back + +=head1 Performance Enhancements + +=head2 "Safe signals" optimisation + +Signal dispatch has been moved from the runloop into control ops. +This should give a few percent speed increase, and eliminates nearly +all the speed penalty caused by the introduction of "safe signals" +in 5.8.0. Signals should still be dispatched within the same +statement as they were previously. If this does I<not> happen, or +if you find it possible to create uninterruptible loops, this is a +bug, and reports are encouraged of how to recreate such issues.
+ +=head2 Optimisation of shift() and pop() calls without arguments + +Two fewer OPs are used for shift() and pop() calls with no argument (with +implicit C<@_>). This change makes shift() 5% faster than C<shift @_> +on non-threaded perls, and 25% faster on threaded ones. + +=head2 Optimisation of regexp engine string comparison work + +The C<foldEQ_utf8> API function for case-insensitive comparison of strings (which +is used heavily by the regexp engine) was substantially refactored and +optimised -- and its documentation much improved as a free bonus. + +=head2 Regular expression compilation speed-up + +Compiling regular expressions has been made faster when upgrading +the regex to utf8 is necessary but this isn't known when the compilation begins. + +=head2 String appending is 100 times faster + +When doing a lot of string appending, perls built to use the system's +C<malloc> could end up allocating a lot more memory than needed in an +inefficient way. + +C<sv_grow>, the function used to allocate more memory if necessary +when appending to a string, has been taught to round up the memory +it requests to a certain geometric progression, making it much faster on +certain platforms and configurations. On Win32, it's now about 100 times +faster. + +=head2 Eliminate C<PL_*> accessor functions under ithreads + +When C<MULTIPLICITY> was first developed, and interpreter state moved into +an interpreter struct, thread- and interpreter-local C<PL_*> variables +were defined as macros that called accessor functions (returning the +address of the value) outside the Perl core. The intent was to allow +members within the interpreter struct to change size without breaking +binary compatibility, so that bug fixes could be merged to a maintenance +branch that necessitated such a size change. This mechanism was redundant +and penalised well-behaved code. It has been removed.
+ +=head2 Freeing weak references + +When there are many weak references to an object, freeing that object +can under some circumstances take O(I<N*N>) time to free, where +I<N> is the number of references. The circumstances in which this can happen +have been reduced [perl #75254] + +=head2 Lexical array and hash assignments + +An earlier optimisation to speed up C<my @array = ...> and +C<my %hash = ...> assignments caused a bug and was disabled in Perl 5.12.0. + +Now we have found another way to speed up these assignments [perl #82110]. + +=head2 C<@_> uses less memory + +Previously, C<@_> was allocated for every subroutine at compile time with +enough space for four entries. Now this allocation is done on demand when +the subroutine is called [perl #72416]. + +=head2 Size optimisations to SV and HV structures + +C<xhv_fill> has been eliminated from C<struct xpvhv>, saving 1 IV per hash and +on some systems will cause C<struct xpvhv> to become cache-aligned. To avoid +this memory saving causing a slowdown elsewhere, boolean use of C<HvFILL> +now calls C<HvTOTALKEYS> instead (which is equivalent), so while the fill +data when actually required are now calculated on demand, cases when +this needs to be done should be rare. + +The order of structure elements in SV bodies has changed. Effectively, +the NV slot has swapped location with STASH and MAGIC. As all access to +SV members is via macros, this should be completely transparent. This +change allows the space saving for PVHVs documented above, and may reduce +the memory allocation needed for PVIVs on some architectures. + +C<XPV>, C<XPVIV>, and C<XPVNV> now allocate only the parts of the C<SV> body +they actually use, saving some space. + +Scalars containing regular expressions now allocate only the part of the C<SV> +body they actually use, saving some space. 
+ +=head2 Memory consumption improvements to Exporter + +The C<@EXPORT_FAIL> AV is no longer created unless needed, hence neither is +the typeglob backing it. This saves about 200 bytes for every package that +uses Exporter but doesn't use this functionality. + +=head2 Memory savings for weak references + +For weak references, the common case of just a single weak reference +per referent has been optimised to reduce the storage required. In this +case it saves the equivalent of one small Perl array per referent. + +=head2 C<%+> and C<%-> use less memory + +The bulk of the C<Tie::Hash::NamedCapture> module used to be in the Perl +core. It has now been moved to an XS module to reduce overhead for +programs that do not use C<%+> or C<%->. + +=head2 Multiple small improvements to threads + +The internal structures of threading now make fewer API calls and fewer +allocations, resulting in noticeably smaller object code. Additionally, +many thread context checks have been deferred so they're done only +as needed (although this is only possible for non-debugging builds). + +=head2 Adjacent pairs of nextstate opcodes are now optimized away + +Previously, in code such as + + use constant DEBUG => 0; + + sub GAK { + warn if DEBUG; + print "stuff\n"; + } + +the ops for C<warn if DEBUG> would be folded to a C<null> op (C<ex-const>), but +the C<nextstate> op would remain, resulting in a runtime op dispatch of +C<nextstate>, C<nextstate>, etc. + +The execution of a sequence of C<nextstate> ops is indistinguishable from just +the last C<nextstate> op so the peephole optimizer now eliminates the first of +a pair of C<nextstate> ops except when the first carries a label, since labels +must not be eliminated by the optimizer, and label usage isn't conclusively known +at compile time. + +=head1 Modules and Pragmata + +=head2 New Modules and Pragmata + +=over 4 + +=item * + +L<CPAN::Meta::YAML> 0.003 has been added as a dual-life module. 
It supports a +subset of YAML sufficient for reading and writing F<META.yml> and F<MYMETA.yml> files +included with CPAN distributions or generated by the module installation +toolchain. It should not be used for any other general YAML parsing or +generation task. + +=item * + +L<CPAN::Meta> version 2.110440 has been added as a dual-life module. It +provides a standard library to read, interpret and write CPAN distribution +metadata files (like F<META.json> and F<META.yml>) that describe a +distribution, its contents, and the requirements for building it and +installing it. The latest CPAN distribution metadata specification is +included as L<CPAN::Meta::Spec> and notes on changes in the specification +over time are given in L<CPAN::Meta::History>. + +=item * + +L<HTTP::Tiny> 0.012 has been added as a dual-life module. It is a very +small, simple HTTP/1.1 client designed for simple GET requests and file +mirroring. It has been added so that F<CPAN.pm> and L<CPANPLUS> can +"bootstrap" HTTP access to CPAN using pure Perl without relying on external +binaries like L<curl(1)> or L<wget(1)>. + +=item * + +L<JSON::PP> 2.27105 has been added as a dual-life module to allow CPAN +clients to read F<META.json> files in CPAN distributions. + +=item * + +L<Module::Metadata> 1.000004 has been added as a dual-life module. It gathers +package and POD information from Perl module files. It is a standalone module +based on L<Module::Build::ModuleInfo> for use by other module installation +toolchain components. L<Module::Build::ModuleInfo> has been deprecated in +favor of this module instead. + +=item * + +L<Perl::OSType> 1.002 has been added as a dual-life module. It maps Perl +operating system names (like "dragonfly" or "MSWin32") to more generic types +with standardized names (like "Unix" or "Windows"). It has been refactored +out of L<Module::Build> and L<ExtUtils::CBuilder> and consolidates such mappings into +a single location for easier maintenance.
+ +=item * + +The following modules were added by the L<Unicode::Collate> +upgrade. See below for details. + +L<Unicode::Collate::CJK::Big5> + +L<Unicode::Collate::CJK::GB2312> + +L<Unicode::Collate::CJK::JISX0208> + +L<Unicode::Collate::CJK::Korean> + +L<Unicode::Collate::CJK::Pinyin> + +L<Unicode::Collate::CJK::Stroke> + +=item * + +L<Version::Requirements> version 0.101020 has been added as a dual-life +module. It provides a standard library to model and manipulate module +prerequisites and version constraints defined in L<CPAN::Meta::Spec>. + +=back + +=head2 Updated Modules and Pragma + +=over 4 + +=item * + +L<attributes> has been upgraded from version 0.12 to 0.14. + +=item * + +L<Archive::Extract> has been upgraded from version 0.38 to 0.48. + +Updates since 0.38 include: a safe print method that guards +L<Archive::Extract> from changes to C<$\>; a fix to the tests when run in core +Perl; support for TZ files; a modification for the lzma +logic to favour L<IO::Uncompress::Unlzma>; and a fix +for an issue with NetBSD-current and its new L<unzip(1)> +executable. + +=item * + +L<Archive::Tar> has been upgraded from version 1.54 to 1.76. + +Important changes since 1.54 include the following: + +=over + +=item * + +Compatibility with busybox implementations of L<tar(1)>. + +=item * + +A fix so that write() and create_archive() +close only filehandles they themselves opened. + +=item * + +A bug was fixed regarding the exit code of extract_archive. + +=item * + +The L<ptar(1)> utility has a new option to allow safe creation of +tarballs without world-writable files on Windows, allowing those +archives to be uploaded to CPAN. + +=item * + +A new L<ptargrep(1)> utility for using regular expressions against +the contents of files in a tar archive. + +=item * + +L<pax> extended headers are now skipped. + +=back + +=item * + +L<Attribute::Handlers> has been upgraded from version 0.87 to 0.89. + +=item * + +L<autodie> has been upgraded from version 2.06_01 to 2.1001.
+ +=item * + +L<AutoLoader> has been upgraded from version 5.70 to 5.71. + +=item * + +The L<B> module has been upgraded from version 1.23 to 1.29. + +It no longer crashes when taking apart a C<y///> containing characters +outside the octet range or compiled in a C<use utf8> scope. + +The size of the shared object has been reduced by about 40%, with no +reduction in functionality. + +=item * + +L<B::Concise> has been upgraded from version 0.78 to 0.83. + +L<B::Concise> marks rv2sv(), rv2av(), and rv2hv() ops with the new +C<OPpDEREF> flag as "DREFed". + +It no longer produces mangled output with the B<-tree> option +[perl #80632]. + +=item * + +L<B::Debug> has been upgraded from version 1.12 to 1.16. + +=item * + +L<B::Deparse> has been upgraded from version 0.96 to 1.03. + +The deparsing of a C<nextstate> op has changed when it has both a +change of package relative to the previous nextstate, or a change of +C<%^H> or other state and a label. The label was previously emitted +first, but is now emitted last (5.12.1). + +The C<no 5.13.2> or similar form is now correctly handled by L<B::Deparse> +(5.12.3). + +L<B::Deparse> now properly handles the code that applies a conditional +pattern match against implicit C<$_> as it was fixed in [perl #20444]. + +Deparsing of C<our> followed by a variable with funny characters +(as permitted under the C<use utf8> pragma) has also been fixed [perl #33752]. + +=item * + +L<B::Lint> has been upgraded from version 1.11_01 to 1.13. + +=item * + +L<base> has been upgraded from version 2.15 to 2.16. + +=item * + +L<Benchmark> has been upgraded from version 1.11 to 1.12. + +=item * + +L<bignum> has been upgraded from version 0.23 to 0.27. + +=item * + +L<Carp> has been upgraded from version 1.15 to 1.20. + +L<Carp> now detects incomplete L<caller()|perlfunc/"caller EXPR"> +overrides and avoids using bogus C<@DB::args>. To provide backtraces, +Carp relies on particular behaviour of the caller() builtin. 
+L<Carp> now detects if other code has overridden this with an +incomplete implementation, and modifies its backtrace accordingly. +Previously incomplete overrides would cause incorrect values in +backtraces (best case), or obscure fatal errors (worst case). + +This fixes certain cases of "Bizarre copy of ARRAY" caused by modules +overriding caller() incorrectly (5.12.2). + +It now also avoids using regular expressions that cause Perl to +load its Unicode tables, so as to avoid the "BEGIN not safe after +errors" error that ensues if there has been a syntax error +[perl #82854]. + +=item * + +L<CGI> has been upgraded from version 3.48 to 3.52. + +This provides the following security fixes: the MIME boundary in +multipart_init() is now random and the handling of +newlines embedded in header values has been improved. + +=item * + +L<Compress::Raw::Bzip2> has been upgraded from version 2.024 to 2.033. + +It has been updated to use L<bzip2(1)> 1.0.6. + +=item * + +L<Compress::Raw::Zlib> has been upgraded from version 2.024 to 2.033. + +=item * + +L<constant> has been upgraded from version 1.20 to 1.21. + +Unicode constants work once more. They have been broken since Perl 5.10.0 +[CPAN RT #67525]. + +=item * + +L<CPAN> has been upgraded from version 1.94_56 to 1.9600. + +Major highlights: + +=over 4 + +=item * much less configuration dialog hassle + +=item * support for F<META/MYMETA.json> + +=item * support for L<local::lib> + +=item * support for L<HTTP::Tiny> to reduce the dependency on FTP sites + +=item * automatic mirror selection + +=item * iron out all known bugs in configure_requires + +=item * support for distributions compressed with L<bzip2(1)> + +=item * allow F<Foo/Bar.pm> on the command line to mean C<Foo::Bar> + +=back + +=item * + +L<CPANPLUS> has been upgraded from version 0.90 to 0.9103.
+ +A change to F<cpanp-run-perl> +resolves L<RT #55964|http://rt.cpan.org/Public/Bug/Display.html?id=55964> +and L<RT #57106|http://rt.cpan.org/Public/Bug/Display.html?id=57106>, both +of which related to failures to install distributions that use +C<Module::Install::DSL> (5.12.2). + +A dependency on L<Config> was not recognised as a +core module dependency. This has been fixed. + +L<CPANPLUS> now includes support for F<META.json> and F<MYMETA.json>. + +=item * + +L<CPANPLUS::Dist::Build> has been upgraded from version 0.46 to 0.54. + +=item * + +L<Data::Dumper> has been upgraded from version 2.125 to 2.130_02. + +The indentation used to be off when C<$Data::Dumper::Terse> was set. This +has been fixed [perl #73604]. + +This upgrade also fixes a crash when using custom sort functions that might +cause the stack to change [perl #74170]. + +L<Dumpxs> no longer crashes with globs returned by C<*$io_ref> +[perl #72332]. + +=item * + +L<DB_File> has been upgraded from version 1.820 to 1.821. + +=item * + +L<DBM_Filter> has been upgraded from version 0.03 to 0.04. + +=item * + +L<Devel::DProf> has been upgraded from version 20080331.00 to 20110228.00. + +Merely loading L<Devel::DProf> now no longer triggers profiling to start. +Both C<use Devel::DProf> and C<perl -d:DProf ...> behave as before and start +the profiler. + +B<NOTE>: L<Devel::DProf> is deprecated and will be removed from a future +version of Perl. We strongly recommend that you install and use +L<Devel::NYTProf> instead, as it offers significantly improved +profiling and reporting. + +=item * + +L<Devel::Peek> has been upgraded from version 1.04 to 1.07. + +=item * + +L<Devel::SelfStubber> has been upgraded from version 1.03 to 1.05. + +=item * + +L<diagnostics> has been upgraded from version 1.19 to 1.22. + +It now renders pod links slightly better, and has been taught to find +descriptions for messages that share their descriptions with other +messages. 
+ +=item * + +L<Digest::MD5> has been upgraded from version 2.39 to 2.51. + +It is now safe to use this module in combination with threads. + +=item * + +L<Digest::SHA> has been upgraded from version 5.47 to 5.61. + +C<shasum> now more closely mimics L<sha1sum(1)>/L<md5sum(1)>. + +C<addfile> accepts all POSIX filenames. + +New SHA-512/224 and SHA-512/256 transforms (ref. NIST Draft FIPS 180-4 +[February 2011]) + +=item * + +L<DirHandle> has been upgraded from version 1.03 to 1.04. + +=item * + +L<Dumpvalue> has been upgraded from version 1.13 to 1.16. + +=item * + +L<DynaLoader> has been upgraded from version 1.10 to 1.13. + +It fixes a buffer overflow when passed a very long file name. + +It no longer inherits from L<AutoLoader>; hence it no longer +produces weird error messages for unsuccessful method calls on classes that +inherit from L<DynaLoader> [perl #84358]. + +=item * + +L<Encode> has been upgraded from version 2.39 to 2.42. + +Now, all 66 Unicode non-characters are treated the same way U+FFFF has +always been treated: in cases when it was disallowed, all 66 are +disallowed, and in cases where it warned, all 66 warn. + +=item * + +L<Env> has been upgraded from version 1.01 to 1.02. + +=item * + +L<Errno> has been upgraded from version 1.11 to 1.13. + +The implementation of L<Errno> has been refactored to use about 55% less memory. + +On some platforms with unusual header files, like Win32 L<gcc(1)> using C<mingw64> +headers, some constants that weren't actually error numbers have been exposed +by L<Errno>. This has been fixed [perl #77416]. + +=item * + +L<Exporter> has been upgraded from version 5.64_01 to 5.64_03. + +Exporter no longer overrides C<$SIG{__WARN__}> [perl #74472] + +=item * + +L<ExtUtils::CBuilder> has been upgraded from version 0.27 to 0.280203. + +=item * + +L<ExtUtils::Command> has been upgraded from version 1.16 to 1.17. + +=item * + +L<ExtUtils::Constant> has been upgraded from 0.22 to 0.23. 
+ +The L<AUTOLOAD> helper code generated by C<ExtUtils::Constant::ProxySubs> +can now croak() for missing constants, or generate a complete C<AUTOLOAD> +subroutine in XS, allowing simplification of many modules that use it +(L<Fcntl>, L<File::Glob>, L<GDBM_File>, L<I18N::Langinfo>, L<POSIX>, +L<Socket>). + +L<ExtUtils::Constant::ProxySubs> can now optionally push the names of all +constants onto the package's C<@EXPORT_OK>. + +=item * + +L<ExtUtils::Install> has been upgraded from version 1.55 to 1.56. + +=item * + +L<ExtUtils::MakeMaker> has been upgraded from version 6.56 to 6.57_05. + +=item * + +L<ExtUtils::Manifest> has been upgraded from version 1.57 to 1.58. + +=item * + +L<ExtUtils::ParseXS> has been upgraded from version 2.21 to 2.2210. + +=item * + +L<Fcntl> has been upgraded from version 1.06 to 1.11. + +=item * + +L<File::Basename> has been upgraded from version 2.78 to 2.82. + +=item * + +L<File::CheckTree> has been upgraded from version 4.4 to 4.41. + +=item * + +L<File::Copy> has been upgraded from version 2.17 to 2.21. + +=item * + +L<File::DosGlob> has been upgraded from version 1.01 to 1.04. + +It allows patterns containing literal parentheses: they no longer need to +be escaped. On Windows, it no longer +adds an extra F<./> to file names +returned when the pattern is a relative glob with a drive specification, +like F<C:*.pl> [perl #71712]. + +=item * + +L<File::Fetch> has been upgraded from version 0.24 to 0.32. + +L<HTTP::Lite> is now supported for the "http" scheme. + +The L<fetch(1)> utility is supported on FreeBSD, NetBSD, and +Dragonfly BSD for the C<http> and C<ftp> schemes. + +=item * + +L<File::Find> has been upgraded from version 1.15 to 1.19. + +It improves handling of backslashes on Windows, so that paths like +F<C:\dir\/file> are no longer generated [perl #71710]. + +=item * + +L<File::Glob> has been upgraded from version 1.07 to 1.12. + +=item * + +L<File::Spec> has been upgraded from version 3.31 to 3.33. 
+ +Several portability fixes were made in L<File::Spec::VMS>: a colon is now +recognized as a delimiter in native filespecs; caret-escaped delimiters are +recognized for better handling of extended filespecs; catpath() returns +an empty directory rather than the current directory if the input directory +name is empty; and abs2rel() properly handles Unix-style input (5.12.2). + +=item * + +L<File::stat> has been upgraded from 1.02 to 1.05. + +The C<-x> and C<-X> file test operators now work correctly when run +by the superuser. + +=item * + +L<Filter::Simple> has been upgraded from version 0.84 to 0.86. + +=item * + +L<GDBM_File> has been upgraded from 1.10 to 1.14. + +This fixes a memory leak when DBM filters are used. + +=item * + +L<Hash::Util> has been upgraded from 0.07 to 0.11. + +L<Hash::Util> no longer emits spurious "uninitialized" warnings when +recursively locking hashes that have undefined values [perl #74280]. + +=item * + +L<Hash::Util::FieldHash> has been upgraded from version 1.04 to 1.09. + +=item * + +L<I18N::Collate> has been upgraded from version 1.01 to 1.02. + +=item * + +L<I18N::Langinfo> has been upgraded from version 0.03 to 0.08. + +langinfo() now defaults to using C<$_> if there is no argument given, just +as the documentation has always claimed. + +=item * + +L<I18N::LangTags> has been upgraded from version 0.35 to 0.35_01. + +=item * + +L<if> has been upgraded from version 0.05 to 0.0601. + +=item * + +L<IO> has been upgraded from version 1.25_02 to 1.25_04. + +This version of L<IO> includes a new L<IO::Select>, which now allows L<IO::Handle> +objects (and objects in derived classes) to be removed from an L<IO::Select> set +even if the underlying file descriptor is closed or invalid. + +=item * + +L<IPC::Cmd> has been upgraded from version 0.54 to 0.70. + +Resolves an issue with splitting Win32 command lines. An argument +consisting of the single character "0" used to be omitted (CPAN RT #62961). 
+ +=item * + +L<IPC::Open3> has been upgraded from 1.05 to 1.09. + +open3() now produces an error if the C<exec> call fails, allowing this +condition to be distinguished from a child process that exited with a +non-zero status [perl #72016]. + +The internal xclose() routine now knows how to handle file descriptors as +documented, so duplicating C<STDIN> in a child process using its file +descriptor now works [perl #76474]. + +=item * + +L<IPC::SysV> has been upgraded from version 2.01 to 2.03. + +=item * + +L<lib> has been upgraded from version 0.62 to 0.63. + +=item * + +L<Locale::Maketext> has been upgraded from version 1.14 to 1.19. + +L<Locale::Maketext> now supports external caches. + +This upgrade also fixes an infinite loop in +C<Locale::Maketext::Guts::_compile()> when +working with tainted values (CPAN RT #40727). + +C<< ->maketext >> calls now back up and restore C<$@> so error +messages are not suppressed (CPAN RT #34182). + +=item * + +L<Log::Message> has been upgraded from version 0.02 to 0.04. + +=item * + +L<Log::Message::Simple> has been upgraded from version 0.06 to 0.08. + +=item * + +L<Math::BigInt> has been upgraded from version 1.89_01 to 1.994. + +This fixes, among other things, incorrect results when computing binomial +coefficients [perl #77640]. + +It also prevents C<sqrt($int)> from crashing under C<use bigrat>. +[perl #73534]. + +=item * + +L<Math::BigInt::FastCalc> has been upgraded from version 0.19 to 0.28. + +=item * + +L<Math::BigRat> has been upgraded from version 0.24 to 0.26_02. + +=item * + +L<Memoize> has been upgraded from version 1.01_03 to 1.02. + +=item * + +L<MIME::Base64> has been upgraded from 3.08 to 3.13. + +Includes new functions to calculate the length of encoded and decoded +base64 strings. + +Now provides encode_base64url() and decode_base64url() functions to process +the base64 scheme for "URL applications". + +=item * + +L<Module::Build> has been upgraded from version 0.3603 to 0.3800. 
+ +A notable change is the deprecation of several modules. +L<Module::Build::Version> has been deprecated and L<Module::Build> now +relies on the L<version> pragma directly. L<Module::Build::ModuleInfo> has +been deprecated in favor of a standalone copy called L<Module::Metadata>. +L<Module::Build::YAML> has been deprecated in favor of L<CPAN::Meta::YAML>. + +L<Module::Build> now also generates F<META.json> and F<MYMETA.json> files +in accordance with version 2 of the CPAN distribution metadata specification, +L<CPAN::Meta::Spec>. The older format F<META.yml> and F<MYMETA.yml> files are +still generated. + +=item * + +L<Module::CoreList> has been upgraded from version 2.29 to 2.47. + +Besides listing the updated core modules of this release, it also stops listing +the C<Filespec> module. That module never existed in core. The scripts +generating L<Module::CoreList> confused it with L<VMS::Filespec>, which actually +is a core module as of Perl 5.8.7. + +=item * + +L<Module::Load> has been upgraded from version 0.16 to 0.18. + +=item * + +L<Module::Load::Conditional> has been upgraded from version 0.34 to 0.44. + +=item * + +The L<mro> pragma has been upgraded from version 1.02 to 1.07. + +=item * + +L<NDBM_File> has been upgraded from version 1.08 to 1.12. + +This fixes a memory leak when DBM filters are used. + +=item * + +L<Net::Ping> has been upgraded from version 2.36 to 2.38. + +=item * + +L<NEXT> has been upgraded from version 0.64 to 0.65. + +=item * + +L<Object::Accessor> has been upgraded from version 0.36 to 0.38. + +=item * + +L<ODBM_File> has been upgraded from version 1.07 to 1.10. + +This fixes a memory leak when DBM filters are used. + +=item * + +L<Opcode> has been upgraded from version 1.15 to 1.18. + +=item * + +The L<overload> pragma has been upgraded from 1.10 to 1.13. + +C<overload::Method> can now handle subroutines that are themselves blessed +into overloaded classes [perl #71998]. + +The documentation has greatly improved. 
See L</Documentation> below. + +=item * + +L<Params::Check> has been upgraded from version 0.26 to 0.28. + +=item * + +The L<parent> pragma has been upgraded from version 0.223 to 0.225. + +=item * + +L<Parse::CPAN::Meta> has been upgraded from version 1.40 to 1.4401. + +The latest Parse::CPAN::Meta can now read YAML and JSON files using +L<CPAN::Meta::YAML> and L<JSON::PP>, which are now part of the Perl core. + +=item * + +L<PerlIO::encoding> has been upgraded from version 0.12 to 0.14. + +=item * + +L<PerlIO::scalar> has been upgraded from 0.07 to 0.11. + +A read() after a seek() beyond the end of the string no longer thinks it +has data to read [perl #78716]. + +=item * + +L<PerlIO::via> has been upgraded from version 0.09 to 0.11. + +=item * + +L<Pod::Html> has been upgraded from version 1.09 to 1.11. + +=item * + +L<Pod::LaTeX> has been upgraded from version 0.58 to 0.59. + +=item * + +L<Pod::Perldoc> has been upgraded from version 3.15_02 to 3.15_03. + +=item * + +L<Pod::Simple> has been upgraded from version 3.13 to 3.16. + +=item * + +L<POSIX> has been upgraded from 1.19 to 1.24. + +It now includes constants for POSIX signal constants. + +=item * + +The L<re> pragma has been upgraded from version 0.11 to 0.18. + +The C<use re '/flags'> subpragma is new. + +The regmust() function used to crash when called on a regular expression +belonging to a pluggable engine. Now it croaks instead. + +regmust() no longer leaks memory. + +=item * + +L<Safe> has been upgraded from version 2.25 to 2.29. + +Coderefs returned by reval() and rdo() are now wrapped via +wrap_code_refs() (5.12.1). + +This fixes a possible infinite loop when looking for coderefs. + +It adds several C<version::vxs::*> routines to the default share. + +=item * + +L<SDBM_File> has been upgraded from version 1.06 to 1.09. + +=item * + +L<SelfLoader> has been upgraded from 1.17 to 1.18. + +It now works in taint mode [perl #72062]. 
+ +=item * + +The L<sigtrap> pragma has been upgraded from version 1.04 to 1.05. + +It no longer tries to modify read-only arguments when generating a +backtrace [perl #72340]. + +=item * + +L<Socket> has been upgraded from version 1.87 to 1.94. + +See L</Improved IPv6 support> above. + +=item * + +L<Storable> has been upgraded from version 2.22 to 2.27. + +Includes performance improvement for overloaded classes. + +This adds support for serialising code references that contain UTF-8 strings +correctly. The L<Storable> minor version +number changed as a result, meaning that +L<Storable> users who set C<$Storable::accept_future_minor> to a C<FALSE> value +will see errors (see L<Storable/FORWARD COMPATIBILITY> for more details). + +Freezing no longer gets confused if the Perl stack gets reallocated +during freezing [perl #80074]. + +=item * + +L<Sys::Hostname> has been upgraded from version 1.11 to 1.16. + +=item * + +L<Term::ANSIColor> has been upgraded from version 2.02 to 3.00. + +=item * + +L<Term::UI> has been upgraded from version 0.20 to 0.26. + +=item * + +L<Test::Harness> has been upgraded from version 3.17 to 3.23. + +=item * + +L<Test::Simple> has been upgraded from version 0.94 to 0.98. + +Among many other things, subtests without a C<plan> or C<no_plan> now have an +implicit done_testing() added to them. + +=item * + +L<Thread::Semaphore> has been upgraded from version 2.09 to 2.12. + +It provides two new methods that give more control over the decrementing of +semaphores: C<down_nb> and C<down_force>. + +=item * + +L<Thread::Queue> has been upgraded from version 2.11 to 2.12. + +=item * + +The L<threads> pragma has been upgraded from version 1.75 to 1.83. + +=item * + +The L<threads::shared> pragma has been upgraded from version 1.32 to 1.37. + +=item * + +L<Tie::Hash> has been upgraded from version 1.03 to 1.04. + +Calling C<< Tie::Hash->TIEHASH() >> used to loop forever. Now it C<croak>s. 
+ +=item * + +L<Tie::Hash::NamedCapture> has been upgraded from version 0.06 to 0.08. + +=item * + +L<Tie::RefHash> has been upgraded from version 1.38 to 1.39. + +=item * + +L<Time::HiRes> has been upgraded from version 1.9719 to 1.9721_01. + +=item * + +L<Time::Local> has been upgraded from version 1.1901_01 to 1.2000. + +=item * + +L<Time::Piece> has been upgraded from version 1.15_01 to 1.20_01. + +=item * + +L<Unicode::Collate> has been upgraded from version 0.52_01 to 0.73. + +L<Unicode::Collate> has been updated to use Unicode 6.0.0. + +L<Unicode::Collate::Locale> now supports a plethora of new locales: I<ar, be, +bg, de__phonebook, hu, hy, kk, mk, nso, om, tn, vi, hr, ig, ja, ko, ru, sq, +se, sr, to, uk, zh, zh__big5han, zh__gb2312han, zh__pinyin>, and I<zh__stroke>. + +The following modules have been added: + +L<Unicode::Collate::CJK::Big5> for C<zh__big5han> which makes +tailoring of CJK Unified Ideographs in the order of CLDR's big5han ordering. + +L<Unicode::Collate::CJK::GB2312> for C<zh__gb2312han> which makes +tailoring of CJK Unified Ideographs in the order of CLDR's gb2312han ordering. + +L<Unicode::Collate::CJK::JISX0208> which makes tailoring of 6355 kanji +(CJK Unified Ideographs) in the JIS X 0208 order. + +L<Unicode::Collate::CJK::Korean> which makes tailoring of CJK Unified Ideographs +in the order of CLDR's Korean ordering. + +L<Unicode::Collate::CJK::Pinyin> for C<zh__pinyin> which makes +tailoring of CJK Unified Ideographs in the order of CLDR's pinyin ordering. + +L<Unicode::Collate::CJK::Stroke> for C<zh__stroke> which makes +tailoring of CJK Unified Ideographs in the order of CLDR's stroke ordering. + +This also sees the switch from using the pure-Perl version of this +module to the XS version. + +=item * + +L<Unicode::Normalize> has been upgraded from version 1.03 to 1.10. + +=item * + +L<Unicode::UCD> has been upgraded from version 0.27 to 0.32. + +A new function, Unicode::UCD::num(), has been added. 
This function +returns the numeric value of the string passed it or C<undef> if the string +in its entirety has no "safe" numeric value. (For more detail, and for the +definition of "safe", see L<Unicode::UCD/num()>.) + +This upgrade also includes several bug fixes: + +=over 4 + +=item charinfo() + +=over 4 + +=item * + +It is now updated to Unicode Version 6.0.0 with I<Corrigendum #8>, +excepting that, just as with Perl 5.14, the code point at U+1F514 has no name. + +=item * + +Hangul syllable code points have the correct names, and their +decompositions are always output without requiring L<Lingua::KO::Hangul::Util> +to be installed. + +=item * + +CJK (Chinese-Japanese-Korean) code points U+2A700 to U+2B734 +and U+2B740 to U+2B81D are now properly handled. + +=item * + +Numeric values are now output for those CJK code points that have them. + +=item * + +Names output for code points with multiple aliases are now the +corrected ones. + +=back + +=item charscript() + +This now correctly returns "Unknown" instead of C<undef> for the script +of a code point that hasn't been assigned another one. + +=item charblock() + +This now correctly returns "No_Block" instead of C<undef> for the block +of a code point that hasn't been assigned to another one. + +=back + +=item * + +The L<version> pragma has been upgraded from 0.82 to 0.88. + +Because of a bug, now fixed, the is_strict() and is_lax() functions did not +work when exported (5.12.1). + +=item * + +The L<warnings> pragma has been upgraded from version 1.09 to 1.12. + +Calling C<use warnings> without arguments is now significantly more efficient. + +=item * + +The L<warnings::register> pragma has been upgraded from version 1.01 to 1.02. + +It is now possible to register warning categories other than the names of +packages using L<warnings::register>. See L<perllexwarn(1)> for more information. + +=item * + +L<XSLoader> has been upgraded from version 0.10 to 0.13. 
+ +=item * + +L<VMS::DCLsym> has been upgraded from version 1.03 to 1.05. + +Two bugs have been fixed [perl #84086]: + +The symbol table name was lost when tying a hash, due to a thinko in +C<TIEHASH>. The result was that all tied hashes interacted with the +local symbol table. + +Unless a symbol table name had been explicitly specified in the call +to the constructor, querying the special key C<:LOCAL> failed to +identify objects connected to the local symbol table. + +=item * + +The L<Win32> module has been upgraded from version 0.39 to 0.44. + +This release has several new functions: Win32::GetSystemMetrics(), +Win32::GetProductInfo(), Win32::GetOSDisplayName(). + +The names returned by Win32::GetOSName() and Win32::GetOSDisplayName() +have been corrected. + +=item * + +L<XS::Typemap> has been upgraded from version 0.03 to 0.05. + +=back + +=head2 Removed Modules and Pragmata + +As promised in Perl 5.12.0's release notes, the following modules have +been removed from the core distribution, and if needed should be installed +from CPAN instead. + +=over + +=item * + +L<Class::ISA> has been removed from the Perl core. Prior version was 0.36. + +=item * + +L<Pod::Plainer> has been removed from the Perl core. Prior version was 1.02. + +=item * + +L<Switch> has been removed from the Perl core. Prior version was 2.16. + +=back + +The removal of L<Shell> has been deferred until after 5.14, as the +implementation of L<Shell> shipped with 5.12.0 did not correctly issue the +warning that it was to be removed from core. + +=head1 Documentation + +=head2 New Documentation + +=head3 L<perlgpl> + +L<perlgpl> has been updated to contain GPL version 1, as is included in the +F<README> distributed with Perl (5.12.1). + +=head3 Perl 5.12.x delta files + +The perldelta files for Perl 5.12.1 to 5.12.3 have been added from the +maintenance branch: L<perl5121delta>, L<perl5122delta>, L<perl5123delta>. 
+ +=head3 L<perlpodstyle> + +New style guide for POD documentation, +split mostly from the NOTES section of the L<pod2man(1)> manpage. + +=head3 L<perlsource>, L<perlinterp>, L<perlhacktut>, and L<perlhacktips> + +See L</perlhack and perlrepository revamp>, below. + +=head2 Changes to Existing Documentation + +=head3 L<perlmodlib> is now complete + +The L<perlmodlib> manpage that came with Perl 5.12.0 was missing several +modules due to a bug in the script that generates the list. This has been +fixed [perl #74332] (5.12.1). + +=head3 Replace incorrect tr/// table in L<perlebcdic> + +L<perlebcdic> contains a helpful table to use in C<tr///> to convert +between EBCDIC and Latin1/ASCII. The table was the inverse of the one +it describes, though the code that used the table worked correctly for +the specific example given. + +The table has been corrected and the sample code changed to correspond. + +The table has also been changed to hex from octal, and the recipes in the +pod have been altered to print out leading zeros to make all values +the same length. + +=head3 Tricks for user-defined casing + +L<perlunicode> now contains an explanation of how to override, mangle +and otherwise tweak the way Perl handles upper-, lower- and other-case +conversions on Unicode data, and how to provide scoped changes to alter +one's own code's behaviour without stomping on anybody else's. + +=head3 INSTALL explicitly states that Perl requires a C89 compiler + +This was already true, but it's now Officially Stated For The Record +(5.12.2). + +=head3 Explanation of C<\xI<HH>> and C<\oI<OOO>> escapes + +L<perlop> has been updated with more detailed explanation of these two +character escapes. + +=head3 B<-0I<NNN>> switch + +In L<perlrun>, the behaviour of the B<-0NNN> switch for B<-0400> or higher +has been clarified (5.12.2). + +=head3 Maintenance policy + +L<perlpolicy> now contains the policy on what patches are acceptable for +maintenance branches (5.12.1). 
+ +=head3 Deprecation policy + +L<perlpolicy> now contains the policy on compatibility and deprecation +along with definitions of terms like "deprecation" (5.12.2). + +=head3 New descriptions in L<perldiag> + +The following existing diagnostics are now documented: + +=over 4 + +=item * + +L<Ambiguous use of %c resolved as operator %c|perldiag/"Ambiguous use of %c resolved as operator %c"> + +=item * + +L<Ambiguous use of %c{%s} resolved to %c%s|perldiag/"Ambiguous use of %c{%s} resolved to %c%s"> + +=item * + +L<Ambiguous use of %c{%s[...]} resolved to %c%s[...]|perldiag/"Ambiguous use of %c{%s[...]} resolved to %c%s[...]"> + +=item * + +L<Ambiguous use of %c{%s{...}} resolved to %c%s{...}|perldiag/"Ambiguous use of %c{%s{...}} resolved to %c%s{...}"> + +=item * + +L<Ambiguous use of -%s resolved as -&%s()|perldiag/"Ambiguous use of -%s resolved as -&%s()"> + +=item * + +L<Invalid strict version format (%s)|perldiag/"Invalid strict version format (%s)"> + +=item * + +L<Invalid version format (%s)|perldiag/"Invalid version format (%s)"> + +=item * + +L<Invalid version object|perldiag/"Invalid version object"> + +=back + +=head3 L<perlbook> + +L<perlbook> has been expanded to cover many more popular books. + +=head3 C<SvTRUE> macro + +The documentation for the C<SvTRUE> macro in +L<perlapi> was simply wrong in stating that +get-magic is not processed. It has been corrected. + +=head3 op manipulation functions + +Several API functions that process optrees have been newly documented. + +=head3 L<perlvar> revamp + +L<perlvar> reorders the variables and groups them by topic. Each variable +introduced after Perl 5.000 notes the first version in which it is +available. L<perlvar> also has a new section for deprecated variables to +note when they were removed. + +=head3 Array and hash slices in scalar context + +These are now documented in L<perldata>. 
+ +=head3 C<use locale> and formats + +L<perlform> and L<perllocale> have been corrected to state that +C<use locale> affects formats. + +=head3 L<overload> + +L<overload>'s documentation has practically undergone a rewrite. It +is now much more straightforward and clear. + +=head3 perlhack and perlrepository revamp + +The L<perlhack> document is now much shorter, and focuses on the Perl 5 +development process and submitting patches to Perl. The technical content +has been moved to several new documents, L<perlsource>, L<perlinterp>, +L<perlhacktut>, and L<perlhacktips>. This technical content has +been only lightly edited. + +The perlrepository document has been renamed to L<perlgit>. This new +document is just a how-to on using git with the Perl source code. +Any other content that used to be in perlrepository has been moved +to L<perlhack>. + +=head3 Time::Piece examples + +Examples in L<perlfaq4> have been updated to show the use of +L<Time::Piece>. + +=head1 Diagnostics + +The following additions or changes have been made to diagnostic output, +including warnings and fatal error messages. For the complete list of +diagnostic messages, see L<perldiag>. + +=head2 New Diagnostics + +=head3 New Errors + +=over + +=item Closure prototype called + +This error occurs when a subroutine reference passed to an attribute +handler is called, if the subroutine is a closure [perl #68560]. + +=item Insecure user-defined property %s + +Perl detected tainted data when trying to compile a regular +expression that contains a call to a user-defined character property +function, meaning C<\p{IsFoo}> or C<\p{InFoo}>. +See L<perlunicode/User-Defined Character Properties> and L<perlsec>. 
+ +=item panic: gp_free failed to free glob pointer - something is repeatedly re-creating entries + +This new error is triggered if a destructor called on an object in a +typeglob that is being freed creates a new typeglob entry containing an +object with a destructor that creates a new entry containing an object etc. + +=item Parsing code internal error (%s) + +This new fatal error is produced when parsing +code supplied by an extension violates the +parser's API in a detectable way. + +=item refcnt: fd %d%s + +This new error only occurs if an internal consistency check fails when a +pipe is about to be closed. + +=item Regexp modifier "/%c" may not appear twice + +The regular expression pattern has one of the +mutually exclusive modifiers repeated. + +=item Regexp modifiers "/%c" and "/%c" are mutually exclusive + +The regular expression pattern has more than one of the mutually +exclusive modifiers. + +=item Using !~ with %s doesn't make sense + +This error occurs when C<!~> is used with C<s///r> or C<y///r>. + +=back + +=head3 New Warnings + +=over + +=item "\b{" is deprecated; use "\b\{" instead + +=item "\B{" is deprecated; use "\B\{" instead + +Use of an unescaped "{" immediately following a C<\b> or C<\B> is now +deprecated in order to reserve its use for Perl itself in a future release. + +=item Operation "%s" returns its argument for ... + +Performing an operation requiring Unicode semantics (such as case-folding) +on a Unicode surrogate or a non-Unicode character now triggers this +warning. + +=item Use of qw(...) as parentheses is deprecated + +See L</"Use of qw(...) as parentheses">, above, for details. + +=back + +=head2 Changes to Existing Diagnostics + +=over 4 + +=item * + +The "Variable $foo is not imported" warning that precedes a +C<strict 'vars'> error has now been assigned the "misc" category, so that +C<no warnings> will suppress it [perl #73712].
+ +=item * + +warn() and die() now produce "Wide character" warnings when fed a +character outside the byte range if C<STDERR> is a byte-sized handle. + +=item * + +The "Layer does not match this perl" error message has been replaced with +these more helpful messages [perl #73754]: + +=over 4 + +=item * + +PerlIO layer function table size (%d) does not match size expected by this +perl (%d) + +=item * + +PerlIO layer instance size (%d) does not match size expected by this perl +(%d) + +=back + +=item * + +The "Found = in conditional" warning that is emitted when a constant is +assigned to a variable in a condition is now withheld if the constant is +actually a subroutine or one generated by C<use constant>, since the value +of the constant may not be known at the time the program is written +[perl #77762]. + +=item * + +Previously, if none of the gethostbyaddr(), gethostbyname() and +gethostent() functions were implemented on a given platform, they would +all die with the message "Unsupported socket function 'gethostent' called", +with analogous messages for getnet*() and getserv*(). This has been +corrected. + +=item * + +The warning message about unrecognized regular expression escapes passed +through has been changed to include any literal "{" following the +two-character escape. For example, "\q{" is now emitted instead of "\q". + +=back + +=head1 Utility Changes + +=head3 L<perlbug(1)> + +=over 4 + +=item * + +L<perlbug> now looks in the EMAIL environment variable for a return address +if the REPLY-TO and REPLYTO variables are empty. + +=item * + +L<perlbug> did not previously generate a "From:" header, potentially +resulting in dropped mail; it now includes that header. + +=item * + +The user's address is now used as the Return-Path. + +Many systems these days don't have a valid Internet domain name, and +perlbug@perl.org does not accept email with a return-path that does +not resolve. 
So the user's address is now passed to sendmail so it's +less likely to get stuck in a mail queue somewhere [perl #82996]. + +=item * + +L<perlbug> now always gives the reporter a chance to change the email +address it guesses for them (5.12.2). + +=item * + +L<perlbug> should no longer warn about uninitialized values when using the B<-d> +and B<-v> options (5.12.2). + +=back + +=head3 L<perl5db.pl> + +=over + +=item * + +The remote terminal works after forking and spawns new sessions, one +per forked process. + +=back + +=head3 L<ptargrep> + +=over 4 + +=item * + +L<ptargrep> is a new utility to apply pattern matching to the contents of +files in a tar archive. It comes with C<Archive::Tar>. + +=back + +=head1 Configuration and Compilation + +See also L</"Naming fixes in Policy_sh.SH may invalidate Policy.sh">, +above. + +=over 4 + +=item * + +CCINCDIR and CCLIBDIR for the mingw64 cross-compiler are now correctly +under F<$(CCHOME)\mingw\include> and F<\lib> rather than immediately below +F<$(CCHOME)>. + +This means the "incpath", "libpth", "ldflags", "lddlflags" and +"ldflags_nolargefiles" values in F<Config.pm> and F<Config_heavy.pl> are now +set correctly. + +=item * + +C<make test.valgrind> has been adjusted to account for F<cpan/dist/ext> +separation. + +=item * + +On compilers that support it, B<-Wwrite-strings> is now added to cflags by +default. + +=item * + +The L<Encode> module can now (once again) be included in a static Perl +build. The special-case handling for this situation got broken in Perl +5.11.0, and has now been repaired. + +=item * + +The previous default size of a PerlIO buffer (4096 bytes) has been increased +to the larger of 8192 bytes and your local BUFSIZ. Benchmarks show that doubling +this decade-old default increases read and write performance by around +25% to 50% when using the default layers of perlio on top of unix. 
To choose +a non-default size, such as to get back the old value or to obtain an even +larger value, configure with: + + ./Configure -Accflags=-DPERLIOBUF_DEFAULT_BUFSIZ=N + +where N is the desired size in bytes; it should probably be a multiple of +your page size. + +=item * + +An "incompatible operand types" error in ternary expressions when building +with C<clang> has been fixed (5.12.2). + +=item * + +Perl now skips setuid L<File::Copy> tests on partitions it detects mounted +as C<nosuid> (5.12.2). + +=back + +=head1 Platform Support + +=head2 New Platforms + +=over 4 + +=item AIX + +Perl now builds on AIX 4.2 (5.12.1). + +=back + +=head2 Discontinued Platforms + +=over 4 + +=item Apollo DomainOS + +The last vestiges of support for this platform have been excised from +the Perl distribution. It was officially discontinued in version 5.12.0. +It had not worked for years before that. + +=item MacOS Classic + +The last vestiges of support for this platform have been excised from the +Perl distribution. It was officially discontinued in an earlier version. + +=back + +=head2 Platform-Specific Notes + +=head3 AIX + +=over + +=item * + +F<README.aix> has been updated with information about the XL C/C++ V11 compiler +suite (5.12.2). + +=back + +=head3 ARM + +=over + +=item * + +The C<d_u32align> configuration probe on ARM has been fixed (5.12.2). + +=back + +=head3 Cygwin + +=over 4 + +=item * + +L<MakeMaker> has been updated to build manpages on cygwin. + +=item * + +Improved rebase behaviour + +If a DLL is updated on cygwin the old imagebase address is reused. +This solves most rebase errors, especially when updating on core DLL's. +See L<http://www.tishler.net/jason/software/rebase/rebase-2.4.2.README> +for more information. + +=item * + +Support for the standard cygwin dll prefix (needed for FFIs) + +=item * + +Updated build hints file + +=back + +=head3 FreeBSD 7 + +=over + +=item * + +FreeBSD 7 no longer contains F</usr/bin/objformat>. 
At build time, +Perl now skips the F<objformat> check for versions 7 and higher and +assumes ELF (5.12.1). + +=back + +=head3 HP-UX + +=over + +=item * + +Perl now allows B<-Duse64bitint> without promoting to C<use64bitall> on HP-UX +(5.12.1). + +=back + +=head3 IRIX + +=over + +=item * + +Conversion of strings to floating-point numbers is now more accurate on +IRIX systems [perl #32380]. + +=back + +=head3 Mac OS X + +=over + +=item * + +Early versions of Mac OS X (Darwin) had buggy implementations of the +setregid(), setreuid(), setrgid(), and setruid() functions, so Perl +would pretend they did not exist. + +These functions are now recognised on Mac OS 10.5 (Leopard; Darwin 9) and +higher, as they have been fixed [perl #72990]. + +=back + +=head3 MirBSD + +=over + +=item * + +Previously if you built Perl with a shared F<libperl.so> on MirBSD (the +default config), it would work up to the installation; however, once +installed, it would be unable to find F<libperl>. Path handling is now +treated as in the other BSD dialects. + +=back + +=head3 NetBSD + +=over + +=item * + +The NetBSD hints file has been changed to make the system malloc the +default. + +=back + +=head3 OpenBSD + +=over + +=item * + +OpenBSD E<gt> 3.7 has a new malloc implementation which is I<mmap>-based, +and as such can release memory back to the OS; however, Perl's use of +this malloc causes a substantial slowdown, so we now default to using +Perl's malloc instead [perl #75742]. + +=back + +=head3 OpenVOS + +=over + +=item * + +Perl now builds again with OpenVOS (formerly known as Stratus VOS) +[perl #78132] (5.12.3). + +=back + +=head3 Solaris + +=over + +=item * + +DTrace is now supported on Solaris. There used to be build failures, but +these have been fixed [perl #73630] (5.12.3). + +=back + +=head3 VMS + +=over + +=item * + +Extension building on older (pre 7.3-2) VMS systems was broken because +configure.com hit the DCL symbol length limit of 1K.
We now work within +this limit when assembling the list of extensions in the core build (5.12.1). + +=item * + +We fixed configuring and building Perl with B<-Uuseperlio> (5.12.1). + +=item * + +C<PerlIOUnix_open> now honours the default permissions on VMS. + +When C<perlio> became the default and C<unix> became the default bottom layer, +the most common path for creating files from Perl became C<PerlIOUnix_open>, +which has always explicitly used C<0666> as the permission mask. This prevents +inheriting permissions from RMS defaults and ACLs, so to avoid that problem, +we now pass C<0777> to open(). In the VMS CRTL, C<0777> has a special +meaning over and above intersecting with the current umask; specifically, it +allows Unix syscalls to preserve native default permissions (5.12.3). + +=item * + +The shortening of symbols longer than 31 characters in the core C sources +and in extensions is now by default done by the C compiler rather than by +xsubpp (which could only do so for generated symbols in XS code). You can +reenable xsubpp's symbol shortening by configuring with -Uuseshortenedsymbols, +but you'll have some work to do to get the core sources to compile. + +=item * + +Record-oriented files (record format variable or variable with fixed control) +opened for write by the C<perlio> layer will now be line-buffered to prevent the +introduction of spurious line breaks whenever the perlio buffer fills up. + +=item * + +F<git_version.h> is now installed on VMS. This was an oversight in v5.12.0 which +caused some extensions to fail to build (5.12.2). + +=item * + +Several memory leaks in L<stat()|perlfunc/"stat FILEHANDLE"> have been fixed (5.12.2). + +=item * + +A memory leak in Perl_rename() due to a double allocation has been +fixed (5.12.2). + +=item * + +A memory leak in vms_fid_to_name() (used by realpath() and +realname()) has been fixed (5.12.2).
+ +=back + +=head3 Windows + +See also L</"fork() emulation will not wait for signalled children"> and +L</"Perl source code is read in text mode on Windows">, above. + +=over 4 + +=item * + +Fixed build process for SDK2003SP1 compilers. + +=item * + +Compilation with Visual Studio 2010 is now supported. + +=item * + +When using old 32-bit compilers, the define C<_USE_32BIT_TIME_T> is now +set in C<$Config{ccflags}>. This improves portability when compiling +XS extensions using new compilers, but for a Perl compiled with old 32-bit +compilers. + +=item * + +C<$Config{gccversion}> is now set correctly when Perl is built using the +mingw64 compiler from L<http://mingw64.org> [perl #73754]. + +=item * + +When building Perl with the mingw64 x64 cross-compiler C<incpath>, +C<libpth>, C<ldflags>, C<lddlflags> and C<ldflags_nolargefiles> values +in F<Config.pm> and F<Config_heavy.pl> were not previously being set +correctly because, with that compiler, the include and lib directories +are not immediately below C<$(CCHOME)> (5.12.2). + +=item * + +The build process proceeds more smoothly with mingw and dmake when +F<C:\MSYS\bin> is in the PATH, due to a C<Cwd> fix. + +=item * + +Support for building with Visual C++ 2010 is now underway, but is not yet +complete. See F<README.win32> or L<perlwin32> for more details. + +=item * + +The option to use an externally-supplied crypt(), or to build with no +crypt() at all, has been removed. Perl supplies its own crypt() +implementation for Windows, and the political situation that required +this part of the distribution to sometimes be omitted is long gone. + +=back + +=head1 Internal Changes + +=head2 New APIs + +=head3 CLONE_PARAMS structure added to ease correct thread creation + +Modules that create threads should now create C<CLONE_PARAMS> structures +by calling the new function Perl_clone_params_new(), and free them with +Perl_clone_params_del(). 
This will ensure compatibility with any future +changes to the internals of the C<CLONE_PARAMS> structure layout, and that +it is correctly allocated and initialised. + +=head3 New parsing functions + +Several functions have been added for parsing Perl statements and +expressions. These functions are meant to be used by XS code invoked +during Perl parsing, in a recursive-descent manner, to allow modules to +augment the standard Perl syntax. + +=over + +=item * + +L<parse_stmtseq()|perlapi/parse_stmtseq> +parses a sequence of statements, up to closing brace or EOF. + +=item * + +L<parse_fullstmt()|perlapi/parse_fullstmt> +parses a complete Perl statement, including optional label. + +=item * + +L<parse_barestmt()|perlapi/parse_barestmt> +parses a statement without a label. + +=item * + +L<parse_block()|perlapi/parse_block> +parses a code block. + +=item * + +L<parse_label()|perlapi/parse_label> +parses a statement label, separate from statements. + +=item * + +L<C<parse_fullexpr()>|perlapi/parse_fullexpr>, +L<C<parse_listexpr()>|perlapi/parse_listexpr>, +L<C<parse_termexpr()>|perlapi/parse_termexpr>, and +L<C<parse_arithexpr()>|perlapi/parse_arithexpr> +parse expressions at various precedence levels. + +=back + +=head3 Hints hash API + +A new C API for introspecting the hinthash C<%^H> at runtime has been +added. See C<cop_hints_2hv>, C<cop_hints_fetchpvn>, C<cop_hints_fetchpvs>, +C<cop_hints_fetchsv>, and C<hv_copy_hints_hv> in L<perlapi> for details. + +A new, experimental API has been added for accessing the internal +structure that Perl uses for C<%^H>. See the functions beginning with +C<cophh_> in L<perlapi>. + +=head3 C interface to caller() + +The C<caller_cx> function has been added as an XSUB-writer's equivalent of +caller(). See L<perlapi> for details. 
+ +=head3 Custom per-subroutine check hooks + +XS code in an extension module can now annotate a subroutine (whether +implemented in XS or in Perl) so that nominated XS code will be called +at compile time (specifically as part of op checking) to change the op +tree of that subroutine. The compile-time check function (supplied by +the extension module) can implement argument processing that can't be +expressed as a prototype, generate customised compile-time warnings, +perform constant folding for a pure function, inline a subroutine +consisting of sufficiently simple ops, replace the whole call with a +custom op, and so on. This was previously all possible by hooking the +C<entersub> op checker, but the new mechanism makes it easy to tie the +hook to a specific subroutine. See L<perlapi/cv_set_call_checker>. + +To help in writing custom check hooks, several subtasks within standard +C<entersub> op checking have been separated out and exposed in the API. + +=head3 Improved support for custom OPs + +Custom ops can now be registered with the new C<custom_op_register> C +function and the C<XOP> structure. This will make it easier to add new +properties of custom ops in the future. Two new properties have been added +already, C<xop_class> and C<xop_peep>. + +C<xop_class> is one of the OA_*OP constants. It allows L<B> and other +introspection mechanisms to work with custom ops +that aren't BASEOPs. C<xop_peep> is a pointer to +a function that will be called for ops of this +type from C<Perl_rpeep>. + +See L<perlguts/Custom Operators> and L<perlapi/Custom Operators> for more +detail. + +The old C<PL_custom_op_names>/C<PL_custom_op_descs> interface is still +supported but discouraged. + +=head3 Scope hooks + +It is now possible for XS code to hook into Perl's lexical scope +mechanism at compile time, using the new C<Perl_blockhook_register> +function. See L<perlguts/"Compile-time scope hooks">. 
+ +=head3 The recursive part of the peephole optimizer is now hookable + +In addition to C<PL_peepp>, for hooking into the toplevel peephole optimizer, a +C<PL_rpeepp> is now available to hook into the optimizer recursing into +side-chains of the optree. + +=head3 New non-magical variants of existing functions + +The following functions/macros have been added to the API. The C<*_nomg> +macros are equivalent to their non-C<_nomg> variants, except that they ignore +get-magic. Those ending in C<_flags> allow one to specify whether +get-magic is processed. + + sv_2bool_flags + SvTRUE_nomg + sv_2nv_flags + SvNV_nomg + sv_cmp_flags + sv_cmp_locale_flags + sv_eq_flags + sv_collxfrm_flags + +In some of these cases, the non-C<_flags> functions have +been replaced with wrappers around the new functions. + +=head3 pv/pvs/sv versions of existing functions + +Many functions ending with pvn now have equivalent C<pv/pvs/sv> versions. + +=head3 List op-building functions + +List op-building functions have been added to the +API. See L<op_append_elem|perlapi/op_append_elem>, +L<op_append_list|perlapi/op_append_list>, and +L<op_prepend_elem|perlapi/op_prepend_elem> in L<perlapi>. + +=head3 C<LINKLIST> + +The L<LINKLIST|perlapi/LINKLIST> macro, part of op building that +constructs the execution-order op chain, has been added to the API. + +=head3 Localisation functions + +The C<save_freeop>, C<save_op>, C<save_pushi32ptr> and C<save_pushptrptr> +functions have been added to the API. + +=head3 Stash names + +A stash can now have a list of effective names in addition to its usual +name. The first effective name can be accessed via the C<HvENAME> macro, +which is now the recommended name to use in MRO linearisations (C<HvNAME> +being a fallback if there is no C<HvENAME>). + +These names are added and deleted via C<hv_ename_add> and +C<hv_ename_delete>. These two functions are I<not> part of the API. 
+ +=head3 New functions for finding and removing magic + +The L<C<mg_findext()>|perlapi/mg_findext> and +L<C<sv_unmagicext()>|perlapi/sv_unmagicext> +functions have been added to the API. +They allow extension authors to find and remove magic attached to +scalars based on both the magic type and the magic virtual table, similar to how +sv_magicext() attaches magic of a certain type and with a given virtual table +to a scalar. This eliminates the need for extensions to walk the list of +C<MAGIC> pointers of an C<SV> to find the magic that belongs to them. + +=head3 C<find_rundefsv> + +This function returns the SV representing C<$_>, whether it's lexical +or dynamic. + +=head3 C<Perl_croak_no_modify> + +Perl_croak_no_modify() is short-hand for +C<Perl_croak("%s", PL_no_modify)>. + +=head3 C<PERL_STATIC_INLINE> define + +The C<PERL_STATIC_INLINE> define has been added to provide the best-guess +incantation to use for static inline functions, if the C compiler supports +C99-style static inline. If it doesn't, it'll give a plain C<static>. + +C<HAS_STATIC_INLINE> can be used to check if the compiler actually supports +inline functions. + +=head3 New C<pv_escape> option for hexadecimal escapes + +A new option, C<PERL_PV_ESCAPE_NONASCII>, has been added to C<pv_escape> to +dump all characters above ASCII in hexadecimal. Before, one could get all +characters as hexadecimal or the Latin1 non-ASCII as octal. + +=head3 C<lex_start> + +C<lex_start> has been added to the API, but is considered experimental. + +=head3 op_scope() and op_lvalue() + +The op_scope() and op_lvalue() functions have been added to the API, +but are considered experimental. + +=head2 C API Changes + +=head3 C<PERL_POLLUTE> has been removed + +The option to define C<PERL_POLLUTE> to expose older 5.005 symbols for +backwards compatibility has been removed. 
Its use was always discouraged, +and MakeMaker contains a more specific escape hatch: + + perl Makefile.PL POLLUTE=1 + +This can be used for modules that have not been upgraded to 5.6 naming +conventions (and really should be completely obsolete by now). + +=head3 Check API compatibility when loading XS modules + +When Perl's API changes in incompatible ways (which usually happens between +major releases), XS modules compiled for previous versions of Perl will no +longer work. They need to be recompiled against the new Perl. + +The C<XS_APIVERSION_BOOTCHECK> macro has been added to ensure that modules +are recompiled and to prevent users from accidentally loading modules +compiled for old perls into newer perls. That macro, which is called when +loading every newly compiled extension, compares the API version of the +running perl with the version a module has been compiled for and raises an +exception if they don't match. + +=head3 Perl_fetch_cop_label + +The first argument of the C API function C<Perl_fetch_cop_label> has changed +from C<struct refcounted_he *> to C<COP *>, to insulate the user from +implementation details. + +This API function was marked as "may change", and likely isn't in use outside +the core. (Neither an unpacked CPAN nor Google's codesearch finds any other +references to it.) + +=head3 GvCV() and GvGP() are no longer lvalues + +The new GvCV_set() and GvGP_set() macros are now provided to replace +assignment to those two macros. + +This allows a future commit to eliminate some backref magic between GV +and CVs, which will require complete control over assignment to the +C<gp_cv> slot. + +=head3 CvGV() is no longer an lvalue + +Under some circumstances, the CvGV() field of a CV is now +reference-counted. To ensure consistent behaviour, direct assignment to +it, for example C<CvGV(cv) = gv> is now a compile-time error. A new macro, +C<CvGV_set(cv,gv)> has been introduced to run this operation +safely. 
Note that modification of this field is not part of the public +API, regardless of this new macro (and despite its being listed in this section). + +=head3 CvSTASH() is no longer an lvalue + +The CvSTASH() macro can now only be used as an rvalue. CvSTASH_set() +has been added to replace assignment to CvSTASH(). This is to ensure +that backreferences are handled properly. These macros are not part of the +API. + +=head3 Calling conventions for C<newFOROP> and C<newWHILEOP> + +The way the parser handles labels has been cleaned up and refactored. As a +result, the newFOROP() constructor function no longer takes a parameter +stating what label is to go in the state op. + +The newWHILEOP() and newFOROP() functions no longer accept a line +number as a parameter. + +=head3 Flags passed to C<uvuni_to_utf8_flags> and C<utf8n_to_uvuni> + +Some of the flags parameters to uvuni_to_utf8_flags() and +utf8n_to_uvuni() have changed. This is a result of Perl's now allowing +internal storage and manipulation of code points that are problematic +in some situations. Hence, the default actions for these functions have +been complemented to allow these code points. The new flags are +documented in L<perlapi>. Code that requires the problematic code +points to be rejected needs to change to use the new flags. Some flag +names are retained for backward source compatibility, though they do +nothing, as they are now the default. However the flags +C<UNICODE_ALLOW_FDD0>, C<UNICODE_ALLOW_FFFF>, C<UNICODE_ILLEGAL>, and +C<UNICODE_IS_ILLEGAL> have been removed, as they stem from a +fundamentally broken model of how the Unicode non-character code points +should be handled, which is now described in +L<perlunicode/Non-character code points>. See also the Unicode section +under L</Selected Bug Fixes>. + +=head2 Deprecated C APIs + +=over + +=item C<Perl_ptr_table_clear> + +C<Perl_ptr_table_clear> is no longer part of Perl's public API.
Calling it +now generates a deprecation warning, and it will be removed in a future +release. + +=item C<sv_compile_2op> + +The sv_compile_2op() API function is now deprecated. Searches suggest +that nothing on CPAN is using it, so this should have zero impact. + +It attempted to provide an API to compile code down to an optree, but failed +to bind correctly to lexicals in the enclosing scope. It's not possible to +fix this problem within the constraints of its parameters and return value. + +=item C<find_rundefsvoffset> + +The C<find_rundefsvoffset> function has been deprecated. It appeared that +its design was insufficient for reliably getting the lexical C<$_> at +run-time. + +Use the new C<find_rundefsv> function or the C<UNDERBAR> macro +instead. They directly return the right SV +representing C<$_>, whether it's +lexical or dynamic. + +=item C<CALL_FPTR> and C<CPERLscope> + +Those are left from an old implementation of C<MULTIPLICITY> using C++ objects, +which was removed in Perl 5.8. Nowadays these macros do exactly nothing, so +they shouldn't be used anymore. + +For compatibility, they are still defined for external C<XS> code. Only +extensions defining C<PERL_CORE> must be updated now. + +=back + +=head2 Other Internal Changes + +=head3 Stack unwinding + +The protocol for unwinding the C stack at the last stage of a C<die> +has changed how it identifies the target stack frame. This now uses +a separate variable C<PL_restartjmpenv>, where previously it relied on +the C<blk_eval.cur_top_env> pointer in the C<eval> context frame that +has nominally just been discarded. This change means that code running +during various stages of Perl-level unwinding no longer needs to take +care to avoid destroying the ghost frame. + +=head3 Scope stack entries + +The format of entries on the scope stack has been changed, resulting in a +reduction of memory usage of about 10%. 
In particular, the memory used by +the scope stack to record each active lexical variable has been halved. + +=head3 Memory allocation for pointer tables + +Memory allocation for pointer tables has been changed. Previously +C<Perl_ptr_table_store> allocated memory from the same arena system as +C<SV> bodies and C<HE>s, with freed memory remaining bound to those arenas +until interpreter exit. Now it allocates memory from arenas private to the +specific pointer table, and that memory is returned to the system when +C<Perl_ptr_table_free> is called. Additionally, allocation and release are +both less CPU intensive. + +=head3 C<UNDERBAR> + +The C<UNDERBAR> macro now calls C<find_rundefsv>. C<dUNDERBAR> is now a +noop but should still be used to ensure past and future compatibility. + +=head3 String comparison routines renamed + +The C<ibcmp_*> functions have been renamed and are now called C<foldEQ>, +C<foldEQ_locale>, and C<foldEQ_utf8>. The old names are still available as +macros. + +=head3 C<chop> and C<chomp> implementations merged + +The opcode bodies for C<chop> and C<chomp> and for C<schop> and C<schomp> +have been merged. The implementation functions Perl_do_chop() and +Perl_do_chomp(), never part of the public API, have been merged and +moved to a static function in F<pp.c>. This shrinks the Perl binary +slightly, and should not affect any code outside the core (unless it is +relying on the order of side-effects when C<chomp> is passed a I<list> of +values). + +=head1 Selected Bug Fixes + +=head2 I/O + +=over 4 + +=item * + +Perl no longer produces this warning: + + $ perl -we 'open(my $f, ">", \my $x); binmode($f, "scalar")' + Use of uninitialized value in binmode at -e line 1. + +=item * + +Opening a glob reference via C<< open($fh, ">", \*glob) >> no longer +causes the glob to be corrupted when the filehandle is printed to. This would +cause Perl to crash whenever the glob's contents were accessed +[perl #77492]. 
+ +=item * + +PerlIO no longer crashes when called recursively, such as from a signal +handler. Now it just leaks memory [perl #75556]. + +=item * + +Most I/O functions were not warning for unopened handles unless the +"closed" and "unopened" warnings categories were both enabled. Now only +C<use warnings 'unopened'> is necessary to trigger these warnings, as +had always been the intention. + +=item * + +There have been several fixes to PerlIO layers: + +When C<binmode(FH, ":crlf")> pushes the C<:crlf> layer on top of the stack, +it no longer enables crlf layers lower in the stack so as to avoid +unexpected results [perl #38456]. + +Opening a file in C<:raw> mode now does what it advertises to do (first +open the file, then C<binmode> it), instead of simply leaving off the top +layer [perl #80764]. + +The three layers C<:pop>, C<:utf8>, and C<:bytes> didn't allow stacking when +opening a file. For example +this: + + open(FH, ">:pop:perlio", "some.file") or die $!; + +would throw an "Invalid argument" error. This has been fixed in this +release [perl #82484]. + +=back + +=head2 Regular Expression Bug Fixes + +=over + +=item * + +The regular expression engine no longer loops when matching +C<"\N{LATIN SMALL LIGATURE FF}" =~ /f+/i> and similar expressions +[perl #72998] (5.12.1). + +=item * + +The trie runtime code should no longer allocate massive amounts of memory, +fixing #74484. + +=item * + +Syntax errors in C<< (?{...}) >> blocks no longer cause panic messages +[perl #2353]. + +=item * + +A pattern like C<(?:(o){2})?> no longer causes a "panic" error +[perl #39233]. + +=item * + +A fatal error in regular expressions containing C<(.*?)> when processing +UTF-8 data has been fixed [perl #75680] (5.12.2). + +=item * + +An erroneous regular expression engine optimisation that caused regex verbs like +C<*COMMIT> sometimes to be ignored has been removed. 
+ +=item * + +The regular expression bracketed character class C<[\8\9]> was effectively the +same as C<[89\000]>, incorrectly matching a NULL character. It also gave +incorrect warnings that the C<8> and C<9> were ignored. Now C<[\8\9]> is the +same as C<[89]> and gives legitimate warnings that C<\8> and C<\9> are +unrecognized escape sequences, passed-through. + +=item * + +A regular expression match in the right-hand side of a global substitution +(C<s///g>) that is in the same scope will no longer cause match variables +to have the wrong values on subsequent iterations. This can happen when an +array or hash subscript is interpolated in the right-hand side, as in +C<s|(.)|@a{ print($1), /./ }|g> [perl #19078]. + +=item * + +Several cases in which characters in the Latin-1 non-ASCII range (0x80 to +0xFF) used not to match themselves, or used to match both a character class +and its complement, have been fixed. For instance, U+00E2 could match both +C<\w> and C<\W> [perl #78464] [perl #18281] [perl #60156]. + +=item * + +Matching a Unicode character against an alternation containing characters +that happened to match continuation bytes in the former's UTF8 +representation (like C<qq{\x{30ab}} =~ /\xab|\xa9/>) would cause erroneous +warnings [perl #70998]. + +=item * + +The trie optimisation was not taking empty groups into account, preventing +"foo" from matching C</\A(?:(?:)foo|bar|zot)\z/> [perl #78356]. + +=item * + +A pattern containing a C<+> inside a lookahead would sometimes cause an +incorrect match failure in a global match (for example, C</(?=(\S+))/g>) +[perl #68564]. + +=item * + +A regular expression optimisation would sometimes cause a match with a +C<{n,m}> quantifier to fail when it should have matched [perl #79152]. + +=item * + +Case-insensitive matching in regular expressions compiled under +C<use locale> now works much more sanely when the pattern or target +string is internally encoded in UTF8. 
Previously, under these +conditions the localeness was completely lost. Now, code points +above 255 are treated as Unicode, but code points between 0 and 255 +are treated using the current locale rules, regardless of whether +the pattern or the string is encoded in UTF8. The few case-insensitive +matches that cross the 255/256 boundary are not allowed. For +example, 0xFF does not caselessly match the character at 0x178, +LATIN CAPITAL LETTER Y WITH DIAERESIS, because 0xFF may not be LATIN +SMALL LETTER Y in the current locale, and Perl has no way of knowing +if that character even exists in the locale, much less what code +point it is. + +=item * + +The C<(?|...)> regular expression construct no longer crashes if the final +branch has more sets of capturing parentheses than any other branch. This +was fixed in Perl 5.10.1 for the case of a single branch, but that fix did +not take multiple branches into account [perl #84746]. + +=item * + +A bug has been fixed in the implementation of C<{...}> quantifiers in +regular expressions that prevented the code block in +C</((\w+)(?{ print $2 })){2}/> from seeing the C<$2> sometimes +[perl #84294]. + +=back + +=head2 Syntax/Parsing Bugs + +=over + +=item * + +C<when (scalar) {...}> no longer crashes, but produces a syntax error +[perl #74114] (5.12.1). + +=item * + +A label right before a string eval (C<foo: eval $string>) no longer causes +the label to be associated also with the first statement inside the eval +[perl #74290] (5.12.1). + +=item * + +The C<no 5.13.2> form of C<no> no longer tries to turn on features or +pragmata (like L<strict>) [perl #70075] (5.12.2). + +=item * + +C<BEGIN {require 5.12.0}> now behaves as documented, rather than behaving +identically to C<use 5.12.0>. Previously, C<require> in a C<BEGIN> block +was erroneously executing the C<use feature ':5.12.0'> and +C<use strict> behaviour, which only C<use> was documented to +provide [perl #69050]. 
+ +=item * + +A regression introduced in Perl 5.12.0, making +C<< my $x = 3; $x = length(undef) >> result in C<$x> set to C<3> has been +fixed. C<$x> will now be C<undef> [perl #85508] (5.12.2). + +=item * + +When strict "refs" mode is off, C<%{...}> in rvalue context returns +C<undef> if its argument is undefined. An optimisation introduced in Perl +5.12.0 to make C<keys %{...}> faster when used as a boolean did not take +this into account, causing C<keys %{+undef}> (and C<keys %$foo> when +C<$foo> is undefined) to be an error, which it should be in strict +mode only [perl #81750]. + +=item * + +Constant-folding used to cause + + $text =~ ( 1 ? /phoo/ : /bear/) + +to turn into + + $text =~ /phoo/ + +at compile time. Now it correctly matches against C<$_> [perl #20444]. + +=item * + +Parsing Perl code (either with string C<eval> or by loading modules) from +within a C<UNITCHECK> block no longer causes the interpreter to crash +[perl #70614]. + +=item * + +String C<eval>s no longer fail after 2 billion scopes have been +compiled [perl #83364]. + +=item * + +The parser no longer hangs when encountering certain Unicode characters, +such as U+387 [perl #74022]. + +=item * + +Defining a constant with the same name as one of Perl's special blocks +(like C<INIT>) stopped working in 5.12.0, but has now been fixed +[perl #78634]. + +=item * + +A reference to a literal value used as a hash key (C<$hash{\"foo"}>) used +to be stringified, even if the hash was tied [perl #79178]. + +=item * + +A closure containing an C<if> statement followed by a constant or variable +is no longer treated as a constant [perl #63540]. + +=item * + +C<state> can now be used with attributes. It +used to mean the same thing as +C<my> if any attributes were present [perl #68658].
+ +=item * + +Expressions like C<< @$a > 3 >> no longer cause C<$a> to be mentioned in +the "Use of uninitialized value in numeric gt" warning when C<$a> is +undefined (since it is not part of the C<< > >> expression, but the operand +of the C<@>) [perl #72090]. + +=item * + +Accessing an element of a package array with a hard-coded number (as +opposed to an arbitrary expression) would crash if the array did not exist. +Usually the array would be autovivified during compilation, but typeglob +manipulation could remove it, as in these two cases which used to crash: + + *d = *a; print $d[0]; + undef *d; print $d[0]; + +=item * + +The B<-C> command-line option, when used on the shebang line, can now be +followed by other options [perl #72434]. + +=item * + +The C<B> module was returning C<B::OP>s instead of C<B::LOGOP>s for +C<entertry> [perl #80622]. This was due to a bug in the Perl core, +not in C<B> itself. + +=back + +=head2 Stashes, Globs and Method Lookup + +Perl 5.10.0 introduced a new internal mechanism for caching MROs (method +resolution orders, or lists of parent classes; aka "isa" caches) to make +method lookup faster (so C<@ISA> arrays would not have to be searched +repeatedly). Unfortunately, this brought with it quite a few bugs. Almost +all of these have been fixed now, along with a few MRO-related bugs that +existed before 5.10.0: + +=over + +=item * + +The following used to have erratic effects on method resolution, because +the "isa" caches were not reset or otherwise ended up listing the wrong +classes. These have been fixed. 
+ +=over + +=item Aliasing packages by assigning to globs [perl #77358] + +=item Deleting packages by deleting their containing stash elements + +=item Undefining the glob containing a package (C<undef *Foo::>) + +=item Undefining an ISA glob (C<undef *Foo::ISA>) + +=item Deleting an ISA stash element (C<delete $Foo::{ISA}>) + +=item Sharing @ISA arrays between classes (via C<*Foo::ISA = \@Bar::ISA> or +C<*Foo::ISA = *Bar::ISA>) [perl #77238] + +=back + +C<undef *Foo::ISA> would even stop a new C<@Foo::ISA> array from updating +caches. + +=item * + +Typeglob assignments would crash if the glob's stash no longer existed, so +long as the glob assigned to were named C<ISA> or the glob on either side of +the assignment contained a subroutine. + +=item * + +C<PL_isarev>, which is accessible to Perl via C<mro::get_isarev>, is now +updated properly when packages are deleted or removed from the C<@ISA> of +other classes. This allows many packages to be created and deleted without +causing a memory leak [perl #75176]. + +=back + +In addition, various other bugs related to typeglobs and stashes have been +fixed: + +=over + +=item * + +Some work has been done on the internal pointers that link between symbol +tables (stashes), typeglobs, and subroutines. This has the effect that +various edge cases related to deleting stashes or stash entries (for example, +C<%FOO:: = ()>), and complex typeglob or code-reference aliasing, will no +longer crash the interpreter. + +=item * + +Assigning a reference to a glob copy now assigns to a glob slot instead of +overwriting the glob with a scalar [perl #1804] [perl #77508]. + +=item * + +A bug when replacing the glob of a loop variable within the loop has been fixed +[perl #21469]. This +means the following code will no longer crash: + + for $x (...) { + *x = *y; + } + +=item * + +Assigning a glob to a PVLV used to convert it to a plain string. Now it +works correctly, and a PVLV can hold a glob.
This would happen when a +nonexistent hash or array element was passed to a subroutine: + + sub { $_[0] = *foo }->($hash{key}); + # $_[0] would have been the string "*main::foo" + +It also happened when a glob was assigned to, or returned from, an element +of a tied array or hash [perl #36051]. + +=item * + +When trying to report C<Use of uninitialized value $Foo::BAR>, crashes could +occur if the glob holding the global variable in question had been detached +from its original stash by, for example, C<delete $::{"Foo::"}>. This has +been fixed by disabling the reporting of variable names in those +cases. + +=item * + +During the restoration of a localised typeglob on scope exit, any +destructors called as a result would be able to see the typeglob in an +inconsistent state, containing freed entries, which could result in a +crash. This would affect code like this: + + local *@; + eval { die bless [] }; # puts an object in $@ + sub DESTROY { + local $@; # boom + } + +Now the glob entries are cleared before any destructors are called. This +also means that destructors can vivify entries in the glob. So Perl tries +again and, if the entries are re-created too many times, dies with a +"panic: gp_free ..." error message. + +=item * + +If a typeglob is freed while a subroutine attached to it is still +referenced elsewhere, the subroutine is renamed to C<__ANON__> in the same +package, unless the package has been undefined, in which case the C<__ANON__> +package is used. This could cause packages to be sometimes autovivified, +such as if the package had been deleted. Now this no longer occurs. +The C<__ANON__> package is also now used when the original package is +no longer attached to the symbol table. This avoids memory leaks in some +cases [perl #87664]. + +=item * + +Subroutines and package variables inside a package whose name ends with +C<::> can now be accessed with a fully qualified name. 
+ +=back + +=head2 Unicode + +=over + +=item * + +What has become known as "the Unicode Bug" is almost completely resolved in +this release. Under C<use feature 'unicode_strings'> (which is +automatically selected by C<use 5.012> and above), the internal +storage format of a string no longer affects the external semantics +[perl #58182]. + +There are two known exceptions: + +=over + +=item 1 + +The now-deprecated, user-defined case-changing +functions require utf8-encoded strings to operate. The CPAN module +L<Unicode::Casing> has been written to replace this feature without its +drawbacks, and the feature is scheduled to be removed in 5.16. + +=item 2 + +quotemeta() (and its in-line equivalent C<\Q>) can also give different +results depending on whether a string is encoded in UTF-8. See +L<perlunicode/The "Unicode Bug">. + +=back + +=item * + +Handling of Unicode non-character code points has changed. +Previously they were mostly considered illegal, except that in some +places only one of the 66 of them was known. The Unicode Standard +considers them all legal, but forbids their "open interchange". +This is part of the change to allow internal use of any code +point (see L</Core Enhancements>). Together, these changes resolve +[perl #38722], [perl #51918], [perl #51936], and [perl #63446]. + +=item * + +Case-insensitive C<"/i"> regular expression matching of Unicode +characters that match multiple characters now works much more as +intended. For example + + "\N{LATIN SMALL LIGATURE FFI}" =~ /ffi/ui + +and + + "ffi" =~ /\N{LATIN SMALL LIGATURE FFI}/ui + +are both true. Previously, there were many bugs with this feature. +What hasn't been fixed are the places where the pattern contains the +multiple characters, but the characters are split up by other things, +such as in + + "\N{LATIN SMALL LIGATURE FFI}" =~ /(f)(f)i/ui + +or + + "\N{LATIN SMALL LIGATURE FFI}" =~ /ffi*/ui + +or + + "\N{LATIN SMALL LIGATURE FFI}" =~ /[a-f][f-m][g-z]/ui + +None of these match.
+ +Also, this matching doesn't fully conform to the current Unicode +Standard, which asks that the matching be made upon the NFD +(Normalization Form Decomposed) of the text. However, as of this +writing (April 2010), the Unicode Standard is currently in flux about +what they will recommend doing with regard to such scenarios. It may be +that they will throw out the whole concept of multi-character matches. +[perl #71736]. + +=item * + +Naming a deprecated character in C<\N{I<NAME>}> no longer leaks memory. + +=item * + +We fixed a bug that could cause C<\N{I<NAME>}> constructs followed by +a single C<"."> to be parsed incorrectly [perl #74978] (5.12.1). + +=item * + +C<chop> now correctly handles characters above C<"\x{7fffffff}"> +[perl #73246]. + +=item * + +Passing to C<index> an offset beyond the end of the string when the string +is encoded internally in UTF8 no longer causes panics [perl #75898]. + +=item * + +warn() and die() now respect utf8-encoded scalars [perl #45549]. + +=item * + +Sometimes the UTF8 length cache would not be reset on a value +returned by substr, causing C<length(substr($uni_string, ...))> to give +wrong answers. With C<${^UTF8CACHE}> set to -1, it would also produce +a "panic" error message [perl #77692]. + +=back + +=head2 Ties, Overloading and Other Magic + +=over + +=item * + +Overloading now works properly in conjunction with tied +variables. What formerly happened was that most ops checked their +arguments for overloading I<before> checking for magic, so for example +an overloaded object returned by a tied array access would usually be +treated as not overloaded [RT #57012]. + +=item * + +Various instances of magic (like tie methods) being called on tied variables +too many or too few times have been fixed: + +=over + +=item * + +C<< $tied->() >> did not always call FETCH [perl #8438]. + +=item * + +Filetest operators and C<y///> and C<tr///> were calling FETCH too +many times.
+ +=item * + +The C<=> operator used to ignore magic on its right-hand side if the +scalar happened to hold a typeglob (if a typeglob was the last thing +returned from or assigned to a tied scalar) [perl #77498]. + +=item * + +Dereference operators used to ignore magic if the argument was a +reference already (such as from a previous FETCH) [perl #72144]. + +=item * + +C<splice> now calls set-magic (so changes made +by C<splice @ISA> are respected by method calls) [perl #78400]. + +=item * + +In-memory files created by C<< open($fh, ">", \$buffer) >> were not calling +FETCH/STORE at all [perl #43789] (5.12.2). + +=item * + +utf8::is_utf8() now respects get-magic (like C<$1>) (5.12.1). + +=back + +=item * + +Non-commutative binary operators used to swap their operands if the same +tied scalar was used for both operands and returned a different value for +each FETCH. For instance, if C<$t> returned 2 the first time and 3 the +second, then C<$t/$t> would evaluate to 1.5. This has been fixed +[perl #87708]. + +=item * + +String C<eval> now detects taintedness of overloaded or tied +arguments [perl #75716]. + +=item * + +String C<eval> and regular expression matches against objects with string +overloading no longer cause memory corruption or crashes [perl #77084]. + +=item * + +L<readline|perlfunc/"readline EXPR"> now honors C<< <> >> overloading on tied +arguments. + +=item * + +C<< <expr> >> always respects overloading now if the expression is +overloaded. + +Because "S<< <> as >> glob" was parsed differently from +"S<< <> as >> filehandle" from 5.6 onwards, something like C<< <$foo[0]> >> did +not handle overloading, even if C<$foo[0]> was an overloaded object. This +was contrary to the documentation for L<overload>, and meant that C<< <> >> +could not be used as a general overloaded iterator operator. + +=item * + +The fallback behaviour of overloading on binary operators was asymmetric +[perl #71286]. 
+ +=item * + +Magic applied to variables in the main package no longer affects other packages. +See L</Magic variables outside the main package> above [perl #76138]. + +=item * + +Sometimes magic (ties, taintedness, etc.) attached to variables could cause +an object to last longer than it should, or cause a crash if a tied +variable were freed from within a tie method. These have been fixed +[perl #81230]. + +=item * + +DESTROY methods of objects implementing ties are no longer able to crash by +accessing the tied variable through a weak reference [perl #86328]. + +=item * + +Fixed a regression of kill() when a match variable is used for the +process ID to kill [perl #75812]. + +=item * + +C<$AUTOLOAD> used to remain tainted forever if it ever became tainted. Now +it is correctly untainted if an autoloaded method is called and the method +name was not tainted. + +=item * + +C<sprintf> now dies when passed a tainted scalar for the format. It did +already die for arbitrary expressions, but not for simple scalars +[perl #82250]. + +=item * + +C<lc>, C<uc>, C<lcfirst>, and C<ucfirst> no longer return untainted strings +when the argument is tainted. This has been broken since perl 5.8.9 +[perl #87336]. + +=back + +=head2 The Debugger + +=over + +=item * + +The Perl debugger now also works in taint mode [perl #76872]. + +=item * + +Subroutine redefinition works once more in the debugger [perl #48332]. + +=item * + +When B<-d> is used on the shebang (C<#!>) line, the debugger now has access +to the lines of the main program. In the past, this sometimes worked and +sometimes did not, depending on the order in which things happened to be +arranged in memory [perl #71806]. + +=item * + +A possible memory leak when using L<caller()|perlfunc/"caller EXPR"> to set +C<@DB::args> has been fixed (5.12.2). + +=item * + +Perl no longer stomps on C<$DB::single>, C<$DB::trace>, and C<$DB::signal> +if these variables already have values when C<$^P> is assigned to [perl #72422]. 
+ +=item * + +C<#line> directives in string evals were not properly updating the arrays +of lines of code (C<< @{"_< ..."} >>) that the debugger (or any debugging or +profiling module) uses. In threaded builds, they were not being updated at +all. In non-threaded builds, the line number was ignored, so any change to +the existing line number would cause the lines to be misnumbered +[perl #79442]. + +=back + +=head2 Threads + +=over + +=item * + +Perl no longer accidentally clones lexicals in scope within active stack +frames in the parent when creating a child thread [perl #73086]. + +=item * + +Several memory leaks in cloning and freeing threaded Perl interpreters have been +fixed [perl #77352]. + +=item * + +Creating a new thread when directory handles were open used to cause a +crash, because the handles were not cloned, but simply passed to the new +thread, resulting in a double free. + +Now directory handles are cloned properly on Windows +and on systems that have a C<fchdir> function. On other +systems, new threads simply do not inherit directory +handles from their parent threads [perl #75154]. + +=item * + +The typeglob C<*,>, which holds the scalar variable C<$,> (output field +separator), had the wrong reference count in child threads. + +=item * + +[perl #78494] When pipes are shared between threads, the C<close> function +(and any implicit close, such as on thread exit) no longer blocks. + +=item * + +Perl now does a timely cleanup of SVs that are cloned into a new +thread but then discovered to be orphaned (that is, their owners +are I<not> cloned). This eliminates several "scalars leaked" +warnings when joining threads. + +=back + +=head2 Scoping and Subroutines + +=over + +=item * + +Lvalue subroutines are again able to return copy-on-write scalars. This +had been broken since version 5.10.0 [perl #75656] (5.12.3). 
+ +=item * + +C<require> no longer causes C<caller> to return the wrong file name for +the scope that called C<require> and other scopes higher up that had the +same file name [perl #68712]. + +=item * + +C<sort> with a C<($$)>-prototyped comparison routine used to cause the value +of C<@_> to leak out of the sort. Taking a reference to C<@_> within the +sorting routine could cause a crash [perl #72334]. + +=item * + +Match variables (like C<$1>) no longer persist between calls to a sort +subroutine [perl #76026]. + +=item * + +Iterating with C<foreach> over an array returned by an lvalue sub now works +[perl #23790]. + +=item * + +C<$@> is now localised during calls to C<binmode> to prevent action at a +distance [perl #78844]. + +=item * + +Calling a closure prototype (what is passed to an attribute handler for a +closure) now results in a "Closure prototype called" error message instead +of a crash [perl #68560]. + +=item * + +Mentioning a read-only lexical variable from the enclosing scope in a +string C<eval> no longer causes the variable to become writable +[perl #19135]. + +=back + +=head2 Signals + +=over + +=item * + +Within signal handlers, C<$!> is now implicitly localized. + +=item * + +CHLD signals are no longer unblocked after a signal handler is called if +they were blocked before by C<POSIX::sigprocmask> [perl #82040]. + +=item * + +A signal handler called within a signal handler could cause leaks or +double-frees. Now fixed [perl #76248]. + +=back + +=head2 Miscellaneous Memory Leaks + +=over + +=item * + +Several memory leaks when loading XS modules were fixed (5.12.2). + +=item * + +L<substr()|perlfunc/"substr EXPR,OFFSET,LENGTH,REPLACEMENT">, +L<pos()|perlfunc/"pos SCALAR">, L<keys()|perlfunc/"keys HASH">, +and L<vec()|perlfunc/"vec EXPR,OFFSET,BITS"> could, when used in combination +with lvalues, result in leaking the scalar value they operate on, and cause its +destruction to happen too late. This has now been fixed.
+ +=item * + +The postincrement and postdecrement operators, C<++> and C<-->, used to cause +leaks when used on references. This has now been fixed. + +=item * + +Nested C<map> and C<grep> blocks no longer leak memory when processing +large lists [perl #48004]. + +=item * + +C<use I<VERSION>> and C<no I<VERSION>> no longer leak memory [perl #78436] +[perl #69050]. + +=item * + +C<.=> followed by C<< <> >> or C<readline> would leak memory if C<$/> +contained characters beyond the octet range and the scalar assigned to +happened to be encoded as UTF8 internally [perl #72246]. + +=item * + +C<eval 'BEGIN{die}'> no longer leaks memory on non-threaded builds. + +=back + +=head2 Memory Corruption and Crashes + +=over + +=item * + +glob() no longer crashes when C<%File::Glob::> is empty and +C<CORE::GLOBAL::glob> isn't present [perl #75464] (5.12.2). + +=item * + +readline() has been fixed when interrupted by signals so it no longer +returns the "same thing" as before or random memory. + +=item * + +When assigning a list with duplicated keys to a hash, the assignment used to +return garbage and/or freed values: + + @a = %h = (list with some duplicate keys); + +This has now been fixed [perl #31865]. + +=item * + +The mechanism for freeing objects in globs used to leave dangling +pointers to freed SVs, meaning Perl users could see corrupted state +during destruction. + +Perl now frees only the affected slots of the GV, rather than freeing +the GV itself. This makes sure that there are no dangling refs or +corrupted state during destruction. + +=item * + +The interpreter no longer crashes when freeing deeply-nested arrays of +arrays. Hashes have not been fixed yet [perl #44225]. + +=item * + +Concatenating long strings under C<use encoding> no longer causes Perl to +crash [perl #78674]. + +=item * + +Calling C<< ->import >> on a class lacking an import method could corrupt +the stack, resulting in strange behaviour. 
For instance, + + push @a, "foo", $b = bar->import; + +would assign "foo" to C<$b> [perl #63790]. + +=item * + +The C<recv> function could crash when called with the MSG_TRUNC flag +[perl #75082]. + +=item * + +C<formline> no longer crashes when passed a tainted format picture. It also +taints C<$^A> now if its arguments are tainted [perl #79138]. + +=item * + +A bug in how we process filetest operations could cause a segfault. +Filetests don't always expect an op on the stack, so we now use +TOPs only if we're sure that we're not C<stat>ing the C<_> filehandle. +This is indicated by C<OPf_KIDS> (as checked in ck_ftst) [perl #74542] +(5.12.1). + +=item * + +unpack() now handles scalar context correctly for C<%32H> and C<%32u>, +fixing a potential crash. split() would crash because the third item +on the stack wasn't the regular expression it expected. C<unpack("%2H", +...)> would return both the unpacked result and the checksum on the stack, +as would C<unpack("%2u", ...)> [perl #73814] (5.12.2). + +=back + +=head2 Fixes to Various Perl Operators + +=over + +=item * + +The C<&>, C<|>, and C<^> bitwise operators no longer coerce read-only arguments +[perl #20661]. + +=item * + +Stringifying a scalar containing "-0.0" no longer has the effect of turning +false into true [perl #45133]. + +=item * + +Some numeric operators were converting integers to floating point, +resulting in loss of precision on 64-bit platforms [perl #77456]. + +=item * + +sprintf() was ignoring locales when called with constant arguments +[perl #78632]. + +=item * + +Combining the vector (C<%v>) flag and dynamic precision would +cause C<sprintf> to confuse the order of its arguments, making it +treat the string as the precision and vice-versa [perl #83194]. + +=back + +=head2 Bugs Relating to the C API + +=over + +=item * + +The C-level C<lex_stuff_pvn> function would sometimes cause a spurious +syntax error on the last line of the file if it lacked a final semicolon +[perl #74006] (5.12.1). 
+ +=item * + +The C<eval_sv> and C<eval_pv> C functions now set C<$@> correctly when +there is a syntax error and no C<G_KEEPERR> flag, and never set it if the +C<G_KEEPERR> flag is present [perl #3719]. + +=item * + +The XS multicall API no longer causes subroutines to lose reference counts +if called via the multicall interface from within those very subroutines. +This affects modules like L<List::Util>. Calling one of its functions with an +active subroutine as the first argument could cause a crash [perl #78070]. + +=item * + +The C<SvPVbyte> function available to XS modules now calls magic before +downgrading the SV, to avoid warnings about wide characters [perl #72398]. + +=item * + +The ref types in the typemap for XS bindings now support magical variables +[perl #72684]. + +=item * + +C<sv_catsv_flags> no longer calls C<mg_get> on its second argument (the +source string) if the flags passed to it do not include SV_GMAGIC. So it +now matches the documentation. + +=item * + +C<my_strftime> no longer leaks memory. This fixes a memory leak in +C<POSIX::strftime> [perl #73520]. + +=item * + +F<XSUB.h> now correctly redefines fgets under PERL_IMPLICIT_SYS [perl #55049] +(5.12.1). + +=item * + +XS code using fputc() or fputs() on Windows could cause an error +due to their arguments being swapped [perl #72704] (5.12.1). + +=item * + +A possible segfault in the C<T_PTROBJ> default typemap has been fixed +(5.12.2). + +=item * + +A bug that could cause "Unknown error" messages when +C<call_sv(code, G_EVAL)> is called from an XS destructor has been fixed +(5.12.2). + +=back + +=head1 Known Problems + +This is a list of significant unresolved issues which are regressions +from earlier versions of Perl or which affect widely-used CPAN modules. + +=over 4 + +=item * + +C<List::Util::first> misbehaves in the presence of a lexical C<$_> +(typically introduced by C<my $_> or implicitly by C<given>). 
The variable +that gets set for each iteration is the package variable C<$_>, not the +lexical C<$_>. + +A similar issue may occur in other modules that provide functions which +take a block as their first argument, like + + foo { ... $_ ...} list + +See also: L<http://rt.perl.org/rt3/Public/Bug/Display.html?id=67694> + +=item * + +readline() returns an empty string instead of a cached previous value +when it is interrupted by a signal + +=item * + +The changes in prototype handling break L<Switch>. A patch has been sent +upstream and will hopefully appear on CPAN soon. + +=item * + +The upgrade to F<ExtUtils-MakeMaker-6.57_05> has caused +some tests in the F<Module-Install> distribution on CPAN to +fail. (Specifically, F<02_mymeta.t> tests 5 and 21; F<18_all_from.t> +tests 6 and 15; F<19_authors.t> tests 5, 13, 21, and 29; and +F<20_authors_with_special_characters.t> tests 6, 15, and 23 in version +1.00 of that distribution now fail.) + +=item * + +On VMS, C<Time::HiRes> tests will fail due to a bug in the CRTL's +implementation of C<setitimer>: previous timer values would be cleared +if a timer expired but not if the timer was reset before expiring. HP +OpenVMS Engineering have corrected the problem and will release a patch +in due course (Quix case # QXCM1001115136). + +=item * + +On VMS, there were a handful of C<Module::Build> test failures we didn't +get to before the release; please watch CPAN for updates. + +=back + +=head1 Errata + +=head2 keys(), values(), and each() work on arrays + +You can now use the keys(), values(), and each() builtins on arrays; +previously you could use them only on hashes. See L<perlfunc> for details. +This is actually a change introduced in perl 5.12.0, but it was missed from +that release's L<perl5120delta>. + +=head2 split() and C<@_> + +split() no longer modifies C<@_> when called in scalar or void context. +In void context it now produces a "Useless use of split" warning. 
+This was also a perl 5.12.0 change that missed the perldelta. + +=head1 Obituary + +Randy Kobes, creator of http://kobesearch.cpan.org/ and +contributor/maintainer to several core Perl toolchain modules, passed +away on September 18, 2010 after a battle with lung cancer. The community +was richer for his involvement. He will be missed. + +=head1 Acknowledgements + +Perl 5.14.0 represents one year of development since +Perl 5.12.0 and contains nearly 550,000 lines of changes across nearly +3,000 files from 150 authors and committers. + +Perl continues to flourish into its third decade thanks to a vibrant +community of users and developers. The following people are known to +have contributed the improvements that became Perl 5.14.0: + +Aaron Crane, Abhijit Menon-Sen, Abigail, Ævar Arnfjörð Bjarmason, +Alastair Douglas, Alexander Alekseev, Alexander Hartmaier, Alexandr +Ciornii, Alex Davies, Alex Vandiver, Ali Polatel, Allen Smith, Andreas +König, Andrew Rodland, Andy Armstrong, Andy Dougherty, Aristotle +Pagaltzis, Arkturuz, Arvan, A. Sinan Unur, Ben Morrow, Bo Lindbergh, +Boris Ratner, Brad Gilbert, Bram, brian d foy, Brian Phillips, Casey +West, Charles Bailey, Chas. Owens, Chip Salzenberg, Chris 'BinGOs' +Williams, chromatic, Craig A. Berry, Curtis Jewell, Dagfinn Ilmari +Mannsåker, Dan Dascalescu, Dave Rolsky, David Caldwell, David Cantrell, +David Golden, David Leadbeater, David Mitchell, David Wheeler, Eric +Brine, Father Chrysostomos, Fingle Nark, Florian Ragwitz, Frank Wiegand, +Franz Fasching, Gene Sullivan, George Greer, Gerard Goossen, Gisle Aas, +Goro Fuji, Grant McLean, gregor herrmann, H.Merijn Brand, Hongwen Qiu, +Hugo van der Sanden, Ian Goodacre, James E Keenan, James Mastros, Jan +Dubois, Jay Hannah, Jerry D. 
Hedden, Jesse Vincent, Jim Cromie, Jirka +Hruška, John Peacock, Joshua ben Jore, Joshua Pritikin, Karl Williamson, +Kevin Ryde, kmx, Lars Dɪᴇᴄᴋᴏᴡ 迪拉斯, Larwan Berke, Leon Brocard, Leon +Timmermans, Lubomir Rintel, Lukas Mai, Maik Hentsche, Marty Pauley, +Marvin Humphrey, Matt Johnson, Matt S Trout, Max Maischein, Michael +Breen, Michael Fig, Michael G Schwern, Michael Parker, Michael Stevens, +Michael Witten, Mike Kelly, Moritz Lenz, Nicholas Clark, Nick Cleaton, +Nick Johnston, Nicolas Kaiser, Niko Tyni, Noirin Shirley, Nuno Carvalho, +Paul Evans, Paul Green, Paul Johnson, Paul Marquess, Peter J. Holzer, +Peter John Acklam, Peter Martini, Philippe Bruhat (BooK), Piotr Fusik, +Rafael Garcia-Suarez, Rainer Tammer, Reini Urban, Renee Baecker, Ricardo +Signes, Richard Möhn, Richard Soderberg, Rob Hoelz, Robin Barker, Ruslan +Zakirov, Salvador Fandiño, Salvador Ortiz Garcia, Shlomi Fish, Sinan +Unur, Sisyphus, Slaven Rezic, Steffen Müller, Steve Hay, Steven +Schubiger, Steve Peters, Sullivan Beck, Tatsuhiko Miyagawa, Tim Bunce, +Todd Rinaldo, Tom Christiansen, Tom Hukins, Tony Cook, Tye McQueen, +Vadim Konovalov, Vernon Lyon, Vincent Pit, Walt Mankowski, Wolfram +Humann, Yves Orton, Zefram, and Zsbán Ambrus. + +This is woefully incomplete as it's automatically generated from version +control history. In particular, it doesn't include the names of the +(very much appreciated) contributors who reported issues in previous +versions of Perl that helped make Perl 5.14.0 better. For a more complete +list of all of Perl's historical contributors, please see the C<AUTHORS> +file in the Perl 5.14.0 distribution. + +Many of the changes included in this version originated in the CPAN +modules included in Perl's core. We're grateful to the entire CPAN +community for helping Perl to flourish. 
+ +=head1 Reporting Bugs + +If you find what you think is a bug, you might check the articles +recently posted to the comp.lang.perl.misc newsgroup and the Perl +bug database at http://rt.perl.org/perlbug/ . There may also be +information at http://www.perl.org/ , the Perl Home Page. + +If you believe you have an unreported bug, please run the L<perlbug> +program included with your release. Be sure to trim your bug down +to a tiny but sufficient test case. Your bug report, along with the +output of C<perl -V>, will be sent off to perlbug@perl.org to be +analysed by the Perl porting team. + +If the bug you are reporting has security implications, which make it +inappropriate to send to a publicly archived mailing list, then please send +it to perl5-security-report@perl.org. This points to a closed subscription +unarchived mailing list, which includes all the core committers, who are able +to help assess the impact of issues, figure out a resolution, and help +co-ordinate the release of patches to mitigate or fix the problem across all +platforms on which Perl is supported. Please use this address for +security issues in the Perl core I<only>, not for modules independently +distributed on CPAN. + +=head1 SEE ALSO + +The F<Changes> file for an explanation of how to view exhaustive details +on what changed. + +The F<INSTALL> file for how to build Perl. + +The F<README> file for general stuff. + +The F<Artistic> and F<Copying> files for copyright information. + +=cut diff --git a/gnu/usr.bin/perl/pod/perl5141delta.pod b/gnu/usr.bin/perl/pod/perl5141delta.pod new file mode 100644 index 00000000000..db24343c695 --- /dev/null +++ b/gnu/usr.bin/perl/pod/perl5141delta.pod @@ -0,0 +1,361 @@ +=encoding utf8 + +=head1 NAME + +perl5141delta - what is new for perl v5.14.1 + +=head1 DESCRIPTION + +This document describes differences between the 5.14.0 release and +the 5.14.1 release. 
+ +If you are upgrading from an earlier release such as 5.12.0, first read +L<perl5140delta>, which describes differences between 5.12.0 and +5.14.0. + +=head1 Core Enhancements + +No changes since 5.14.0. + +=head1 Security + +No changes since 5.14.0. + +=head1 Incompatible Changes + +There are no changes intentionally incompatible with 5.14.0. If any +exist, they are bugs and reports are welcome. + +=head1 Deprecations + +There have been no deprecations since 5.14.0. + +=head1 Modules and Pragmata + +=head2 New Modules and Pragmata + +None + +=head2 Updated Modules and Pragmata + +=over 4 + +=item * + +L<B::Deparse> has been upgraded from version 1.03 to 1.04, to address two +regressions in Perl 5.14.0: + +Deparsing of the C<glob> operator and its diamond (C<< <> >>) form now +works again. [perl #90898] + +The presence of subroutines named C<::::> or C<::::::> no longer causes +B::Deparse to hang. + +=item * + +L<Pod::Perldoc> has been upgraded from version 3.15_03 to 3.15_04. + +It corrects the search paths on VMS. [perl #90640] + +=back + +=head2 Removed Modules and Pragmata + +None + +=head1 Documentation + +=head2 New Documentation + +None + +=head2 Changes to Existing Documentation + +=head3 L<perlfunc> + +=over + +=item * + +C<given>, C<when> and C<default> are now listed in L<perlfunc>. + +=item * + +Documentation for C<use> now includes a pointer to F<if.pm>. + +=back + +=head3 L<perllol> + +=over + +=item * + +L<perllol> has been expanded with examples using the new C<push $scalar> +syntax introduced in Perl 5.14.0. + +=back + +=head3 L<perlop> + +=over 4 + +=item * + +The explanation of bitwise operators has been expanded to explain how they +work on Unicode strings. + +=item * + +The section on the triple-dot or yada-yada operator has been moved up, as +it used to separate two closely related sections about the comma operator. + +=item * + +More examples for C<m//g> have been added. + +=item * + +The C<<< <<\FOO >>> here-doc syntax has been documented. 
+ +=back + +=head3 L<perlrun> + +=over + +=item * + +L<perlrun> has undergone a significant clean-up. Most notably, the +B<-0x...> form of the B<-0> flag has been clarified, and the final section +on environment variables has been corrected and expanded. + +=back + +=head3 L<POSIX> + +=over + +=item * + +The invocation documentation for C<WIFEXITED>, C<WEXITSTATUS>, +C<WIFSIGNALED>, C<WTERMSIG>, C<WIFSTOPPED>, and C<WSTOPSIG> was corrected. + +=back + + +=head1 Diagnostics + +The following additions or changes have been made to diagnostic output, +including warnings and fatal error messages. For the complete list of +diagnostic messages, see L<perldiag>. + +=head2 New Diagnostics + +None + +=head2 Changes to Existing Diagnostics + +None + +=head1 Utility Changes + +None + +=head1 Configuration and Compilation + +=over 4 + +=item * + +F<regexp.h> has been modified for compatibility with GCC's C<-Werror> +option, as used by some projects that include perl's header files. + +=back + +=head1 Testing + +=over 4 + +=item * + +Some test failures in F<dist/Locale-Maketext/t/09_compile.t> that could +occur depending on the environment have been fixed. [perl #89896] + +=item * + +A watchdog timer for F<t/re/re.t> was lengthened to accommodate SH-4 systems +which were unable to complete the tests before the previous timer ran out. + + +=back + +=head1 Platform Support + +=head2 New Platforms + +None + +=head2 Discontinued Platforms + +None + +=head2 Platform-Specific Notes + +=head3 Solaris + +=over + +=item * + +Documentation listing the Solaris packages required to build Perl on +Solaris 9 and Solaris 10 has been corrected. + +=back + +=head3 Mac OS X + +=over + +=item * + +The F<lib/locale.t> test script has been updated to work on the upcoming +Lion release. + +=item * + +Mac OS X specific compilation instructions have been clarified. 
+ +=back + +=head3 Ubuntu Linux + +=over + +=item * + +The L<ODBM_File> installation process has been updated with the new library +paths on Ubuntu natty. + +=back + +=head1 Internal Changes + +=over + +=item * + +The compiled representation of formats is now stored via the mg_ptr of +their PERL_MAGIC_fm. Previously it was stored in the string buffer, +beyond SvLEN(), the regular end of the string. SvCOMPILED() and +SvCOMPILED_{on,off}() now exist solely for compatibility for XS code. +The first is always 0, the other two are now no-ops. + +=back + +=head1 Bug Fixes + +=over 4 + +=item * + +A bug has been fixed that would cause a "Use of freed value in iteration" +error if the next two hash elements that would be iterated over are +deleted. [perl #85026] + +=item * + +Passing the same constant subroutine to both C<index> and C<formline> no +longer causes one or the other to fail. [perl #89218] + +=item * + +5.14.0 introduced some memory leaks in regular expression character +classes such as C<[\w\s]>, which have now been fixed. + +=item * + +An edge case in regular expression matching could potentially loop. +This happened only under C</i> in bracketed character classes that have +characters with multi-character folds, and the target string to match +against includes the first portion of the fold, followed by another +character that has a multi-character fold that begins with the remaining +portion of the fold, plus some more. + + "s\N{U+DF}" =~ /[\x{DF}foo]/i + +is one such case. C<\xDF> folds to C<"ss">. + +=item * + +Several Unicode case-folding bugs have been fixed. + +=item * + +The new (in 5.14.0) regular expression modifier C</a> when repeated like +C</aa> forbids the characters outside the ASCII range that match +characters inside that range from matching under C</i>. This did not +work under some circumstances, all involving alternation, such as: + + "\N{KELVIN SIGN}" =~ /k|foo/iaa; + +succeeded inappropriately. This is now fixed. 
+ +=item * + +Fixed a case where it was possible that a freed buffer may have been read +from when parsing a here document. + +=back + +=head1 Acknowledgements + +Perl 5.14.1 represents approximately four weeks of development since +Perl 5.14.0 and contains approximately 3500 lines of changes +across 38 files from 17 authors. + +Perl continues to flourish into its third decade thanks to a vibrant +community of users and developers. The following people are known to +have contributed the improvements that became Perl 5.14.1: + +Bo Lindbergh, Claudio Ramirez, Craig A. Berry, David Leadbeater, Father +Chrysostomos, Jesse Vincent, Jim Cromie, Justin Case, Karl Williamson, +Leo Lapworth, Nicholas Clark, Nobuhiro Iwamatsu, smash, Tom Christiansen, +Ton Hospel, Vladimir Timofeev, and Zsbán Ambrus. + + +=head1 Reporting Bugs + +If you find what you think is a bug, you might check the articles +recently posted to the comp.lang.perl.misc newsgroup and the perl +bug database at http://rt.perl.org/perlbug/ . There may also be +information at http://www.perl.org/ , the Perl Home Page. + +If you believe you have an unreported bug, please run the L<perlbug> +program included with your release. Be sure to trim your bug down +to a tiny but sufficient test case. Your bug report, along with the +output of C<perl -V>, will be sent off to perlbug@perl.org to be +analysed by the Perl porting team. + +If the bug you are reporting has security implications, which make it +inappropriate to send to a publicly archived mailing list, then please send +it to perl5-security-report@perl.org. This points to a closed subscription +unarchived mailing list, which includes all the core committers, who will be able +to help assess the impact of issues, figure out a resolution, and help +co-ordinate the release of patches to mitigate or fix the problem across all +platforms on which Perl is supported. 
Please only use this address for +security issues in the Perl core, not for modules independently +distributed on CPAN. + +=head1 SEE ALSO + +The F<Changes> file for an explanation of how to view exhaustive details +on what changed. + +The F<INSTALL> file for how to build Perl. + +The F<README> file for general stuff. + +The F<Artistic> and F<Copying> files for copyright information. + +=cut diff --git a/gnu/usr.bin/perl/pod/perl5142delta.pod b/gnu/usr.bin/perl/pod/perl5142delta.pod new file mode 100644 index 00000000000..3d78ba232bd --- /dev/null +++ b/gnu/usr.bin/perl/pod/perl5142delta.pod @@ -0,0 +1,242 @@ +=encoding utf8 + +=head1 NAME + +perl5142delta - what is new for perl v5.14.2 + +=head1 DESCRIPTION + +This document describes differences between the 5.14.1 release and +the 5.14.2 release. + +If you are upgrading from an earlier release such as 5.14.0, first read +L<perl5141delta>, which describes differences between 5.14.0 and +5.14.1. + +=head1 Core Enhancements + +No changes since 5.14.0. + +=head1 Security + +=head2 C<File::Glob::bsd_glob()> memory error with GLOB_ALTDIRFUNC (CVE-2011-2728). + +Calling C<File::Glob::bsd_glob> with the unsupported flag GLOB_ALTDIRFUNC would +cause an access violation / segfault. A Perl program that accepts a flags value from +an external source could expose itself to denial of service or arbitrary code +execution attacks. There are no known exploits in the wild. The problem has been +corrected by explicitly disabling all unsupported flags and setting unused function +pointers to null. Bug reported by Clément Lecigne. + +=head2 C<Encode> decode_xs n-byte heap-overflow (CVE-2011-2939) + +A bug in C<Encode> could, on certain inputs, cause the heap to overflow. +This problem has been corrected. Bug reported by Robert Zacek. + +=head1 Incompatible Changes + +There are no changes intentionally incompatible with 5.14.0. If any +exist, they are bugs and reports are welcome. 
+ +=head1 Deprecations + +There have been no deprecations since 5.14.0. + +=head1 Modules and Pragmata + +=head2 New Modules and Pragmata + +None + +=head2 Updated Modules and Pragmata + +=over 4 + +=item * + +L<CPAN> has been upgraded from version 1.9600 to version 1.9600_01. + +L<CPAN::Distribution> has been upgraded from version 1.9602 to 1.9602_01. + +Backported bugfixes from CPAN version 1.9800. Ensures proper +detection of C<configure_requires> prerequisites from CPAN Meta files +in the case where C<dynamic_config> is true. [rt.cpan.org #68835] + +Also ensures that C<configure_requires> is only checked in META files, +not MYMETA files, so protect against MYMETA generation that drops +C<configure_requires>. + +=item * + +L<Encode> has been upgraded from version 2.42 to 2.42_01. + +See L</Security>. + +=item * + +L<File::Glob> has been upgraded from version 1.12 to version 1.13. + +See L</Security>. + +=item * + +L<PerlIO::scalar> has been upgraded from version 0.11 to 0.11_01. + +It fixes a problem with C<< open my $fh, ">", \$scalar >> not working if +C<$scalar> is a copy-on-write scalar. + +=back + +=head2 Removed Modules and Pragmata + +None + +=head1 Platform Support + +=head2 New Platforms + +None + +=head2 Discontinued Platforms + +None + +=head2 Platform-Specific Notes + +=over 4 + +=item HP-UX PA-RISC/64 now supports gcc-4.x + +A fix to correct the socketsize now makes the test suite pass on HP-UX +PA-RISC for 64bitall builds. + +=item Building on OS X 10.7 Lion and Xcode 4 works again + +The build system has been updated to work with the build tools under Mac OS X +10.7. + +=back + +=head1 Bug Fixes + +=over 4 + +=item * + +In @INC filters (subroutines returned by subroutines in @INC), $_ used to +misbehave: If returned from a subroutine, it would not be copied, but the +variable itself would be returned; and freeing $_ (e.g., with C<undef *_>) +would cause perl to crash. This has been fixed [perl #91880]. 
+ +=item * + +Perl 5.10.0 introduced some faulty logic that made "U*" in the middle of +a pack template equivalent to "U0" if the input string was empty. This has +been fixed [perl #90160]. + +=item * + +C<caller> no longer leaks memory when called from the DB package if +C<@DB::args> was assigned to after the first call to C<caller>. L<Carp> +was triggering this bug [perl #97010]. + +=item * + +C<utf8::decode> had a nasty bug that would modify copy-on-write scalars' +string buffers in place (i.e., skipping the copy). This could result in +hashes having two elements with the same key [perl #91834]. + +=item * + +Localising a tied variable used to make it read-only if it contained a +copy-on-write string. + +=item * + +Elements of restricted hashes (see the L<fields> pragma) containing +copy-on-write values couldn't be deleted, nor could such hashes be cleared +(C<%hash = ()>). + +=item * + +Locking a hash element that is a glob copy no longer causes subsequent +assignment to it to corrupt the glob. + +=item * + +A panic involving the combination of the regular expression modifiers +C</aa> introduced in 5.14.0 and the C<\b> escape sequence has been +fixed [perl #95964]. + +=back + +=head1 Known Problems + +This is a list of some significant unfixed bugs, which are regressions +from 5.12.0. + +=over 4 + +=item * + +C<PERL_GLOBAL_STRUCT> is broken. + +Since perl 5.14.0, building with C<-DPERL_GLOBAL_STRUCT> hasn't been +possible. This means that perl currently doesn't work on any platforms that +require it to be built this way, including Symbian. + +While C<PERL_GLOBAL_STRUCT> now works again on recent development versions of +perl, it actually working on Symbian again hasn't been verified. + +We'd be very interested in hearing from anyone working with Perl on Symbian. 
+ +=back + +=head1 Acknowledgements + +Perl 5.14.2 represents approximately three months of development since +Perl 5.14.1 and contains approximately 1200 lines of changes +across 61 files from 9 authors. + +Perl continues to flourish into its third decade thanks to a vibrant +community of users and developers. The following people are known to +have contributed the improvements that became Perl 5.14.2: + +Craig A. Berry, David Golden, Father Chrysostomos, Florian Ragwitz, H.Merijn +Brand, Karl Williamson, Nicholas Clark, Pau Amma and Ricardo Signes. + +=head1 Reporting Bugs + +If you find what you think is a bug, you might check the articles +recently posted to the comp.lang.perl.misc newsgroup and the perl +bug database at http://rt.perl.org/perlbug/ . There may also be +information at http://www.perl.org/ , the Perl Home Page. + +If you believe you have an unreported bug, please run the L<perlbug> +program included with your release. Be sure to trim your bug down +to a tiny but sufficient test case. Your bug report, along with the +output of C<perl -V>, will be sent off to perlbug@perl.org to be +analysed by the Perl porting team. + +If the bug you are reporting has security implications, which make it +inappropriate to send to a publicly archived mailing list, then please send +it to perl5-security-report@perl.org. This points to a closed subscription +unarchived mailing list, which includes all the core committers, who will be able +to help assess the impact of issues, figure out a resolution, and help +co-ordinate the release of patches to mitigate or fix the problem across all +platforms on which Perl is supported. Please only use this address for +security issues in the Perl core, not for modules independently +distributed on CPAN. + +=head1 SEE ALSO + +The F<Changes> file for an explanation of how to view exhaustive details +on what changed. + +The F<INSTALL> file for how to build Perl. + +The F<README> file for general stuff. 
+ +The F<Artistic> and F<Copying> files for copyright information. + +=cut diff --git a/gnu/usr.bin/perl/pod/perl5143delta.pod b/gnu/usr.bin/perl/pod/perl5143delta.pod new file mode 100644 index 00000000000..093b6272dcb --- /dev/null +++ b/gnu/usr.bin/perl/pod/perl5143delta.pod @@ -0,0 +1,291 @@ +=encoding utf8 + +=head1 NAME + +perl5143delta - what is new for perl v5.14.3 + +=head1 DESCRIPTION + +This document describes differences between the 5.14.2 release and +the 5.14.3 release. + +If you are upgrading from an earlier release such as 5.12.0, first read +L<perl5140delta>, which describes differences between 5.12.0 and +5.14.0. + +=head1 Core Enhancements + +No changes since 5.14.0. + +=head1 Security + +=head2 C<Digest> unsafe use of eval (CVE-2011-3597) + +The C<Digest-E<gt>new()> function did not properly sanitize input before +using it in an eval() call, which could lead to the injection of arbitrary +Perl code. + +In order to exploit this flaw, the attacker would need to be able to set +the algorithm name used, or be able to execute arbitrary Perl code already. + +This problem has been fixed. + +=head2 Heap buffer overrun in 'x' string repeat operator (CVE-2012-5195) + +Poorly written perl code that allows an attacker to specify the count to +perl's 'x' string repeat operator can already cause a memory exhaustion +denial-of-service attack. A flaw in versions of perl before 5.15.5 can +escalate that into a heap buffer overrun; coupled with versions of glibc +before 2.16, it possibly allows the execution of arbitrary code. + +This problem has been fixed. + +=head1 Incompatible Changes + +There are no changes intentionally incompatible with 5.14.0. If any +exist, they are bugs and reports are welcome. + +=head1 Deprecations + +There have been no deprecations since 5.14.0. 
+ +=head1 Modules and Pragmata + +=head2 New Modules and Pragmata + +None + +=head2 Updated Modules and Pragmata + +=over 4 + +=item * + +L<PerlIO::scalar> was updated to fix a bug in which opening a filehandle to +a glob copy caused assertion failures (under debugging) or hangs or other +erratic behaviour without debugging. + +=item * + +L<ODBM_File> and L<NDBM_File> were updated to allow building on GNU/Hurd. + +=item * + +L<IPC::Open3> has been updated to fix a regression introduced in perl +5.12, which broke C<IPC::Open3::open3($in, $out, $err, '-')>. +[perl #95748] + +=item * + +L<Digest> has been upgraded from version 1.16 to 1.16_01. + +See L</Security>. + +=item * + +L<Module::CoreList> has been updated to version 2.49_04 to add data for +this release. + +=back + +=head2 Removed Modules and Pragmata + +None + +=head1 Documentation + +=head2 New Documentation + +None + +=head2 Changes to Existing Documentation + +=head3 L<perlcheat> + +=over 4 + +=item * + +L<perlcheat> was updated to 5.14. + +=back + +=head1 Configuration and Compilation + +=over 4 + +=item * + +h2ph was updated to correctly search gcc include directories on platforms +such as Debian with multi-architecture support. + +=item * + +In Configure, the test for procselfexe was refactored into a loop. + +=back + +=head1 Platform Support + +=head2 New Platforms + +None + +=head2 Discontinued Platforms + +None + +=head2 Platform-Specific Notes + +=over 4 + +=item FreeBSD + +The FreeBSD hints file was corrected to be compatible with FreeBSD 10.0. + +=item Solaris and NetBSD + +Configure was updated for "procselfexe" support on Solaris and NetBSD. + +=item HP-UX + +README.hpux was updated to note the existence of a broken header in +HP-UX 11.00. + +=item Linux + +libutil is no longer used when compiling on Linux platforms, which avoids +warnings being emitted. 
+ +The system gcc (rather than any other gcc which might be in the compiling +user's path) is now used when searching for libraries such as C<-lm>. + +=item Mac OS X + +The locale tests were updated to reflect the behaviour of locales in +Mountain Lion. + +=item GNU/Hurd + +Various build and test fixes were included for GNU/Hurd. + +LFS support was enabled in GNU/Hurd. + +=item NetBSD + +The NetBSD hints file was corrected to be compatible with NetBSD 6.* + +=back + +=head1 Bug Fixes + +=over 4 + +=item * + +A regression has been fixed that was introduced in 5.14, in C</i> +regular expression matching, in which a match improperly fails if the +pattern is in UTF-8, the target string is not, and a Latin-1 character +precedes a character in the string that should match the pattern. [perl +#101710] + +=item * + +In case-insensitive regular expression pattern matching on UTF-8 encoded +strings, the scan for the start of a match no longer looks only at +the first possible position. The old behaviour caused matches such as +C<"f\x{FB00}" =~ /ff/i> to fail. + +=item * + +The sitecustomize support was made relocatableinc aware, so that +-Dusesitecustomize and -Duserelocatableinc may be used together. + +=item * + +The smartmatch operator (C<~~>) was changed so that the right-hand side +takes precedence during C<Any ~~ Object> operations. + +=item * + +A bug has been fixed in the tainting support, in which an C<index()> +operation on a tainted constant would cause all other constants to become +tainted. [perl #64804] + +=item * + +A regression has been fixed that was introduced in perl 5.12, whereby +tainting errors were not correctly propagated through C<die()>. +[perl #111654] + +=item * + +A regression has been fixed that was introduced in perl 5.14, in which +C</[[:lower:]]/i> and C</[[:upper:]]/i> no longer matched the opposite case. 
+[perl #101970] + +=back + +=head1 Acknowledgements + +Perl 5.14.3 represents approximately 12 months of development since Perl 5.14.2 +and contains approximately 2,300 lines of changes across 64 files from 22 +authors. + +Perl continues to flourish into its third decade thanks to a vibrant community +of users and developers. The following people are known to have contributed the +improvements that became Perl 5.14.3: + +Abigail, Andy Dougherty, Carl Hayter, Chris 'BinGOs' Williams, Dave Rolsky, +David Mitchell, Dominic Hargreaves, Father Chrysostomos, Florian Ragwitz, +H.Merijn Brand, Jilles Tjoelker, Karl Williamson, Leon Timmermans, Michael G +Schwern, Nicholas Clark, Niko Tyni, Pino Toscano, Ricardo Signes, Salvador +Fandiño, Samuel Thibault, Steve Hay, Tony Cook. + +The list above is almost certainly incomplete as it is automatically generated +from version control history. In particular, it does not include the names of +the (very much appreciated) contributors who reported issues to the Perl bug +tracker. + +Many of the changes included in this version originated in the CPAN modules +included in Perl's core. We're grateful to the entire CPAN community for +helping Perl to flourish. + +For a more complete list of all of Perl's historical contributors, please see +the F<AUTHORS> file in the Perl source distribution. + +=head1 Reporting Bugs + +If you find what you think is a bug, you might check the articles +recently posted to the comp.lang.perl.misc newsgroup and the perl +bug database at http://rt.perl.org/perlbug/ . There may also be +information at http://www.perl.org/ , the Perl Home Page. + +If you believe you have an unreported bug, please run the L<perlbug> +program included with your release. Be sure to trim your bug down +to a tiny but sufficient test case. Your bug report, along with the +output of C<perl -V>, will be sent off to perlbug@perl.org to be +analysed by the Perl porting team. 
+ +If the bug you are reporting has security implications, which make it +inappropriate to send to a publicly archived mailing list, then please send +it to perl5-security-report@perl.org. This points to a closed subscription +unarchived mailing list, which includes all the core committers, who will be able +to help assess the impact of issues, figure out a resolution, and help +co-ordinate the release of patches to mitigate or fix the problem across all +platforms on which Perl is supported. Please only use this address for +security issues in the Perl core, not for modules independently +distributed on CPAN. + +=head1 SEE ALSO + +The F<Changes> file for an explanation of how to view exhaustive details +on what changed. + +The F<INSTALL> file for how to build Perl. + +The F<README> file for general stuff. + +The F<Artistic> and F<Copying> files for copyright information. + +=cut diff --git a/gnu/usr.bin/perl/pod/perl5160delta.pod b/gnu/usr.bin/perl/pod/perl5160delta.pod new file mode 100644 index 00000000000..9b67d17a243 --- /dev/null +++ b/gnu/usr.bin/perl/pod/perl5160delta.pod @@ -0,0 +1,4314 @@ +=encoding utf8 + +=head1 NAME + +perl5160delta - what is new for perl v5.16.0 + +=head1 DESCRIPTION + +This document describes differences between the 5.14.0 release and +the 5.16.0 release. + +If you are upgrading from an earlier release such as 5.12.0, first read +L<perl5140delta>, which describes differences between 5.12.0 and +5.14.0. + +Some bug fixes in this release have been backported to later +releases of 5.14.x. Those are indicated with the 5.14.x version in +parentheses. + +=head1 Notice + +With the release of Perl 5.16.0, the 5.12.x series of releases is now out of +its support period. There may be future 5.12.x releases, but only in the +event of a critical security issue. Users of Perl 5.12 or earlier should +consider upgrading to a more recent release of Perl. + +This policy is described in greater detail in +L<perlpolicy|perlpolicy/MAINTENANCE AND SUPPORT>.
+ +=head1 Core Enhancements + +=head2 C<use I<VERSION>> + +As of this release, version declarations like C<use v5.16> now disable +all features before enabling the new feature bundle. This means that +the following holds true: + + use 5.016; + # only 5.16 features enabled here + use 5.014; + # only 5.14 features enabled here (not 5.16) + +C<use v5.12> and higher continue to enable strict, but explicit C<use +strict> and C<no strict> now override the version declaration, even +when they come first: + + no strict; + use 5.012; + # no strict here + +There is a new ":default" feature bundle that represents the set of +features enabled before any version declaration or C<use feature> has +been seen. Version declarations below 5.10 now enable the ":default" +feature set. This does not actually change the behavior of C<use +v5.8>, because features added to the ":default" set are those that were +traditionally enabled by default, before they could be turned off. + +C<< no feature >> now resets to the default feature set. To disable all +features (which is likely to be a pretty special-purpose request, since +it presumably won't match any named set of semantics) you can now +write C<< no feature ':all' >>. + +C<$[> is now disabled under C<use v5.16>. It is part of the default +feature set and can be turned on or off explicitly with C<use feature +'array_base'>. + +=head2 C<__SUB__> + +The new C<__SUB__> token, available under the C<current_sub> feature +(see L<feature>) or C<use v5.16>, returns a reference to the current +subroutine, making it easier to write recursive closures. + +=head2 New and Improved Built-ins + +=head3 More consistent C<eval> + +The C<eval> operator sometimes treats a string argument as a sequence of +characters and sometimes as a sequence of bytes, depending on the +internal encoding. The internal encoding is not supposed to make any +difference, but there is code that relies on this inconsistency. 
+ +The new C<unicode_eval> and C<evalbytes> features (enabled under C<use +5.16.0>) resolve this. The C<unicode_eval> feature causes C<eval +$string> to treat the string always as Unicode. The C<evalbytes> +feature provides a function, itself called C<evalbytes>, which +evaluates its argument always as a string of bytes. + +These features also fix oddities with source filters leaking to outer +dynamic scopes. + +See L<feature> for more detail. + +=head3 C<substr> lvalue revamp + +=for comment Does this belong here, or under Incompatible Changes? + +When C<substr> is called in lvalue or potential lvalue context with two +or three arguments, a special lvalue scalar is returned that modifies +the original string (the first argument) when assigned to. + +Previously, the offsets (the second and third arguments) passed to +C<substr> would be converted immediately to match the string, negative +offsets being translated to positive and offsets beyond the end of the +string being truncated. + +Now, the offsets are recorded without modification in the special +lvalue scalar that is returned, and the original string is not even +looked at by C<substr> itself, but only when the returned lvalue is +read or modified. + +These changes result in an incompatible change: + +If the original string changes length after the call to C<substr> but +before assignment to its return value, negative offsets will remember +their position from the end of the string, affecting code like this: + + my $string = "string"; + my $lvalue = \substr $string, -4, 2; + print $$lvalue, "\n"; # prints "ri" + $string = "bailing twine"; + print $$lvalue, "\n"; # prints "wi"; used to print "il" + +The same thing happens with an omitted third argument. The returned +lvalue will always extend to the end of the string, even if the string +becomes longer.
+ +Since this change also allowed many bugs to be fixed (see +L</The C<substr> operator>), and since the behavior +of negative offsets has never been specified, the +change was deemed acceptable. + +=head3 Return value of C<tied> + +The value returned by C<tied> on a tied variable is now the actual +scalar that holds the object to which the variable is tied. This +lets ties be weakened with C<Scalar::Util::weaken(tied +$tied_variable)>. + +=head2 Unicode Support + +=head3 Supports (I<almost>) Unicode 6.1 + +Besides the addition of whole new scripts, and new characters in +existing scripts, this new version of Unicode, as always, makes some +changes to existing characters. One change that may trip up some +applications is that the General Category of two characters in the +Latin-1 range, PILCROW SIGN and SECTION SIGN, has been changed from +Other_Symbol to Other_Punctuation. The same change has been made for +a character in each of Tibetan, Ethiopic, and Aegean. +The code points U+3248..U+324F (CIRCLED NUMBER TEN ON BLACK SQUARE +through CIRCLED NUMBER EIGHTY ON BLACK SQUARE) have had their General +Category changed from Other_Symbol to Other_Numeric. The Line Break +property has changes for Hebrew and Japanese; and because of +other changes in 6.1, the Perl regular expression construct C<\X> now +works differently for some characters in Thai and Lao. + +New aliases (synonyms) have been defined for many property values; +these, along with the previously existing ones, are all cross-indexed in +L<perluniprops>. 
+ +The return value of C<charnames::viacode()> is affected by other +changes: + + Code point Old Name New Name + U+000A LINE FEED (LF) LINE FEED + U+000C FORM FEED (FF) FORM FEED + U+000D CARRIAGE RETURN (CR) CARRIAGE RETURN + U+0085 NEXT LINE (NEL) NEXT LINE + U+008E SINGLE-SHIFT 2 SINGLE-SHIFT-2 + U+008F SINGLE-SHIFT 3 SINGLE-SHIFT-3 + U+0091 PRIVATE USE 1 PRIVATE USE-1 + U+0092 PRIVATE USE 2 PRIVATE USE-2 + U+2118 SCRIPT CAPITAL P WEIERSTRASS ELLIPTIC FUNCTION + +Perl will accept any of these names as input, but +C<charnames::viacode()> now returns the new name of each pair. The +change for U+2118 is considered by Unicode to be a correction, that is +the original name was a mistake (but again, it will remain forever valid +to use it to refer to U+2118). But most of these changes are the +fallout of the mistake Unicode 6.0 made in naming a character used in +Japanese cell phones to be "BELL", which conflicts with the longstanding +industry use of (and Unicode's recommendation to use) that name +to mean the ASCII control character at U+0007. Therefore, that name +has been deprecated in Perl since v5.14, and any use of it will raise a +warning message (unless turned off). The name "ALERT" is now the +preferred name for this code point, with "BEL" an acceptable short +form. The name for the new cell phone character, at code point U+1F514, +remains undefined in this version of Perl (hence we don't +implement quite all of Unicode 6.1), but starting in v5.18, BELL will mean +this character, and not U+0007. + +Unicode has taken steps to make sure that this sort of mistake does not +happen again. The Standard now includes all generally accepted +names and abbreviations for control characters, whereas previously it +didn't (though there were recommended names for most of them, which Perl +used). This means that most of those recommended names are now +officially in the Standard. 
Unicode did not recommend names for the +four code points listed above between U+008E and U+0092, and in +standardizing them Unicode subtly changed the names that Perl had +previously given them, by replacing the final blank in each name by a +hyphen. Unicode also officially accepts names that Perl had deprecated, +such as FILE SEPARATOR. Now the only deprecated name is BELL. +Finally, Perl now uses the new official names instead of the old +(now considered obsolete) names for the first four code points in the +list above (the ones which have the parentheses in them). + +Now that the names have been placed in the Unicode standard, these kinds +of changes should not happen again, though corrections, such as to +U+2118, are still possible. + +Unicode also added some name abbreviations, which Perl now accepts: +SP for SPACE; +TAB for CHARACTER TABULATION; +NEW LINE, END OF LINE, NL, and EOL for LINE FEED; +LOCKING-SHIFT ONE for SHIFT OUT; +LOCKING-SHIFT ZERO for SHIFT IN; +and ZWNBSP for ZERO WIDTH NO-BREAK SPACE. + +More details on this version of Unicode are provided in +L<http://www.unicode.org/versions/Unicode6.1.0/>. + +=head3 C<use charnames> is no longer needed for C<\N{I<name>}> + +When C<\N{I<name>}> is encountered, the C<charnames> module is now +automatically loaded when needed as if the C<:full> and C<:short> +options had been specified. See L<charnames> for more information. + +=head3 C<\N{...}> can now have Unicode loose name matching + +This is described in the C<charnames> item in +L</Updated Modules and Pragmata> below. + +=head3 Unicode Symbol Names + +Perl now has proper support for Unicode in symbol names. It used to be +that C<*{$foo}> would ignore the internal UTF8 flag and use the bytes of +the underlying representation to look up the symbol. That meant that +C<*{"\x{100}"}> and C<*{"\xc4\x80"}> would return the same thing.
All +these parts of Perl have been fixed to account for Unicode: + +=over + +=item * + +Method names (including those passed to C<use overload>) + +=item * + +Typeglob names (including names of variables, subroutines, and filehandles) + +=item * + +Package names + +=item * + +C<goto> + +=item * + +Symbolic dereferencing + +=item * + +Second argument to C<bless()> and C<tie()> + +=item * + +Return value of C<ref()> + +=item * + +Subroutine prototypes + +=item * + +Attributes + +=item * + +Various warnings and error messages that mention variable names or values, +methods, etc. + +=back + +In addition, a parsing bug has been fixed that prevented C<*{é}> from +implicitly quoting the name, but instead interpreted it as C<*{+é}>, which +would cause a strict violation. + +C<*{"*a::b"}> automatically strips off the * if it is followed by an ASCII +letter. That has been extended to all Unicode identifier characters. + +One-character non-ASCII non-punctuation variables (like C<$é>) are now +subject to "Used only once" warnings. They used to be exempt, as they +were treated as punctuation variables. + +Also, single-character Unicode punctuation variables (like C<$‰>) are now +supported [perl #69032]. + +=head3 Improved ability to mix locales and Unicode, including UTF-8 locales + +An optional parameter has been added to C<use locale> + + use locale ':not_characters'; + +which tells Perl to use all but the C<LC_CTYPE> and C<LC_COLLATE> +portions of the current locale. Instead, the character set is assumed +to be Unicode. This lets locales and Unicode be seamlessly mixed, +including the increasingly frequent UTF-8 locales. When using this +hybrid form of locales, the C<:locale> layer to the L<open> pragma can +be used to interface with the file system, and there are CPAN modules +available for ARGV and environment variable conversions. + +Full details are in L<perllocale>. 
+ +=head3 New function C<fc> and corresponding escape sequence C<\F> for Unicode foldcase + +Unicode foldcase is an extension to lowercase that gives better results +when comparing two strings case-insensitively. It has long been used +internally in regular expression C</i> matching. Now it is available +explicitly through the new C<fc> function call (enabled by +S<C<"use feature 'fc'">>, or C<use v5.16>, or explicitly callable via +C<CORE::fc>) or through the new C<\F> sequence in double-quotish +strings. + +Full details are in L<perlfunc/fc>. + +=head3 The Unicode C<Script_Extensions> property is now supported. + +New in Unicode 6.0, this is an improved C<Script> property. Details +are in L<perlunicode/Scripts>. + +=head2 XS Changes + +=head3 Improved typemaps for Some Builtin Types + +Most XS authors will know there is a longstanding bug in the +OUTPUT typemap for T_AVREF (C<AV*>), T_HVREF (C<HV*>), T_CVREF (C<CV*>), +and T_SVREF (C<SVREF> or C<\$foo>) that requires manually decrementing +the reference count of the return value instead of the typemap taking +care of this. For backwards-compatibility, this cannot be changed in the +default typemaps. But we now provide additional typemaps +C<T_AVREF_REFCOUNT_FIXED>, etc. that do not exhibit this bug. Using +them in your extension is as simple as having one line in your +C<TYPEMAP> section: + + HV* T_HVREF_REFCOUNT_FIXED + +=head3 C<is_utf8_char()> + +The XS-callable function C<is_utf8_char()>, when presented with +malformed UTF-8 input, can read up to 12 bytes beyond the end of the +string. This cannot be fixed without changing its API, and so its +use is now deprecated. Use C<is_utf8_char_buf()> (described just below) +instead. + +=head3 Added C<is_utf8_char_buf()> + +This function is designed to replace the deprecated L</is_utf8_char()> +function. It includes an extra parameter to make sure it doesn't read +past the end of the input buffer. 
+ +=head3 Other C<is_utf8_foo()> functions, as well as C<utf8_to_foo()>, etc. + +Most other XS-callable functions that take UTF-8 encoded input +implicitly assume that the UTF-8 is valid (not malformed) with respect to +buffer length. Do not do things such as change a character's case or +see if it is alphanumeric without first being sure that it is valid +UTF-8. This can be safely done for a whole string by using one of the +functions C<is_utf8_string()>, C<is_utf8_string_loc()>, and +C<is_utf8_string_loclen()>. + +=head3 New Pad API + +Many new functions have been added to the API for manipulating lexical +pads. See L<perlapi/Pad Data Structures> for more information. + +=head2 Changes to Special Variables + +=head3 C<$$> can be assigned to + +C<$$> was made read-only in Perl 5.8.0. But only sometimes: C<local $$> +would make it writable again. Some CPAN modules were using C<local $$> or +XS code to bypass the read-only check, so there is no reason to keep C<$$> +read-only. (This change also allowed a bug to be fixed while maintaining +backward compatibility.) + +=head3 C<$^X> converted to an absolute path on FreeBSD, OS X and Solaris + +C<$^X> is now converted to an absolute path on OS X, FreeBSD (without +needing F</proc> mounted) and Solaris 10 and 11. This augments the +previous approach of using F</proc> on Linux, FreeBSD, and NetBSD +(in all cases, where mounted). + +This makes relocatable perl installations more useful on these platforms. +(See "Relocatable @INC" in F<INSTALL>) + +=head2 Debugger Changes + +=head3 Features inside the debugger + +The current Perl's L<feature> bundle is now enabled for commands entered +in the interactive debugger. + +=head3 New option for the debugger's B<t> command + +The B<t> command in the debugger, which toggles tracing mode, now +accepts a numeric argument that determines how many levels of subroutine +calls to trace. 
+ +=head3 C<enable> and C<disable> + +The debugger now has C<disable> and C<enable> commands for disabling +existing breakpoints and re-enabling them. See L<perldebug>. + +=head3 Breakpoints with file names + +The debugger's "b" command for setting breakpoints now lets a line +number be prefixed with a file name. See +L<perldebug/"b [file]:[line] [condition]">. + +=head2 The C<CORE> Namespace + +=head3 The C<CORE::> prefix + +The C<CORE::> prefix can now be used on keywords enabled by +L<feature.pm|feature>, even outside the scope of C<use feature>. + +=head3 Subroutines in the C<CORE> namespace + +Many Perl keywords are now available as subroutines in the CORE namespace. +This lets them be aliased: + + BEGIN { *entangle = \&CORE::tie } + entangle $variable, $package, @args; + +And for prototypes to be bypassed: + + sub mytie(\[%$*@]$@) { + my ($ref, $pack, @args) = @_; + ... do something ... + goto &CORE::tie; + } + +Some of these cannot be called through references or via C<&foo> syntax, +but must be called as barewords. + +See L<CORE> for details. + +=head2 Other Changes + +=head3 Anonymous handles + +Automatically generated file handles are now named __ANONIO__ when the +variable name cannot be determined, rather than $__ANONIO__. + +=head3 Autoloaded sort Subroutines + +Custom sort subroutines can now be autoloaded [perl #30661]: + + sub AUTOLOAD { ... } + @sorted = sort foo @list; # uses AUTOLOAD + +=head3 C<continue> no longer requires the "switch" feature + +The C<continue> keyword has two meanings. It can introduce a C<continue> +block after a loop, or it can exit the current C<when> block. Up to now, +the latter meaning was valid only with the "switch" feature enabled, and +was a syntax error otherwise. Since the main purpose of feature.pm is to +avoid conflicts with user-defined subroutines, there is no reason for +C<continue> to depend on it. 
+ +=head3 DTrace probes for interpreter phase change + +The C<phase-change> probes will fire when the interpreter's phase +changes, which tracks the C<${^GLOBAL_PHASE}> variable. C<arg0> is +the new phase name; C<arg1> is the old one. This is useful +for limiting your instrumentation to one or more of: compile time, +run time, or destruct time. + +=head3 C<__FILE__()> Syntax + +The C<__FILE__>, C<__LINE__> and C<__PACKAGE__> tokens can now be written +with an empty pair of parentheses after them. This makes them parse the +same way as C<time>, C<fork> and other built-in functions. + +=head3 The C<\$> prototype accepts any scalar lvalue + +The C<\$> and C<\[$]> subroutine prototypes now accept any scalar lvalue +argument. Previously they accepted only scalars beginning with C<$> and +hash and array elements. This change makes them consistent with the way +the built-in C<read> and C<recv> functions (among others) parse their +arguments. This means that one can override the built-in functions with +custom subroutines that parse their arguments the same way. + +=head3 C<_> in subroutine prototypes + +The C<_> character in subroutine prototypes is now allowed before C<@> or +C<%>. + +=head1 Security + +=head2 Use C<is_utf8_char_buf()> and not C<is_utf8_char()> + +The latter function is now deprecated because its API is insufficient to +guarantee that it doesn't read (up to 12 bytes in the worst case) beyond +the end of its input string. See +L<is_utf8_char_buf()|/Added is_utf8_char_buf()>. + +=head2 Malformed UTF-8 input could cause attempts to read beyond the end of the buffer + +Two new XS-accessible functions, C<utf8_to_uvchr_buf()> and +C<utf8_to_uvuni_buf()> are now available to prevent this, and the Perl +core has been converted to use them. +See L</Internal Changes>. + +=head2 C<File::Glob::bsd_glob()> memory error with GLOB_ALTDIRFUNC (CVE-2011-2728). 
+ +Calling C<File::Glob::bsd_glob> with the unsupported flag +GLOB_ALTDIRFUNC would cause an access violation / segfault. A Perl +program that accepts a flags value from an external source could expose +itself to denial of service or arbitrary code execution attacks. There +are no known exploits in the wild. The problem has been corrected by +explicitly disabling all unsupported flags and setting unused function +pointers to null. Bug reported by Clément Lecigne. (5.14.2) + +=head2 Privileges are now set correctly when assigning to C<$(> + +A hypothetical bug (probably unexploitable in practice) due to the +incorrect setting of the effective group ID while setting C<$(> has been +fixed. The bug would have affected only systems that have C<setresgid()> +but not C<setregid()>, but no such systems are known to exist. + +=head1 Deprecations + +=head2 Don't read the Unicode data base files in F<lib/unicore> + +It is now deprecated to directly read the Unicode data base files. +These are stored in the F<lib/unicore> directory. Instead, you should +use the new functions in L<Unicode::UCD>. These provide a stable API, +and give complete information. + +Perl may at some point in the future change or remove these files. The +file which applications were most likely to have used is +F<lib/unicore/ToDigit.pl>. L<Unicode::UCD/prop_invmap()> can be used to +get at its data instead. + +=head2 XS functions C<is_utf8_char()>, C<utf8_to_uvchr()> and +C<utf8_to_uvuni()> + +These functions are deprecated because they could read beyond the end of the +input string. Use the new L<is_utf8_char_buf()|/Added is_utf8_char_buf()>, +C<utf8_to_uvchr_buf()> and C<utf8_to_uvuni_buf()> instead. + +=head1 Future Deprecations + +This section serves as a notice of features that are I<likely> to be +removed or L<deprecated|perlpolicy/deprecated> in the next release of +perl (5.18.0).
If your code depends on these features, you should +contact the Perl 5 Porters via the L<mailing +list|http://lists.perl.org/list/perl5-porters.html> or L<perlbug> to +explain your use case and inform the deprecation process. + +=head2 Core Modules + +These modules may be marked as deprecated I<from the core>. This only +means that they will no longer be installed by default with the core +distribution, but will remain available on the CPAN. + +=over + +=item * + +CPANPLUS + +=item * + +Filter::Simple + +=item * + +PerlIO::mmap + +=item * + +Pod::LaTeX + +=item * + +Pod::Parser + +=item * + +SelfLoader + +=item * + +Text::Soundex + +=item * + +Thread.pm + +=back + +=head2 Platforms with no supporting programmers: + +These platforms will probably have their +special build support removed during the +5.17.0 development series. + +=over + +=item * + +BeOS + +=item * + +djgpp + +=item * + +dgux + +=item * + +EPOC + +=item * + +MPE/iX + +=item * + +Rhapsody + +=item * + +UTS + +=item * + +VM/ESA + +=back + +=head2 Other Future Deprecations + +=over + +=item * + +Swapping of $< and $> + +For more information about this future deprecation, see L<the relevant RT +ticket|https://rt.perl.org/rt3/Ticket/Display.html?id=96212>. + +=item * + +sfio, stdio + +Perl supports being built without PerlIO proper, using a stdio or sfio +wrapper instead. A perl build like this will not support IO layers and +thus Unicode IO, making it rather handicapped. + +PerlIO supports a C<stdio> layer if stdio use is desired, and similarly a +sfio layer could be produced. + +=item * + +Unescaped literal C<< "{" >> in regular expressions. + +Starting with v5.20, it is planned to require a literal C<"{"> to be +escaped, for example by preceding it with a backslash. In v5.18, a +deprecated warning message will be emitted for all such uses. +This affects only patterns that are to match a literal C<"{">. 
Other +uses of this character, such as part of a quantifier or sequence as in +those below, are completely unaffected: + + /foo{3,5}/ + /\p{Alphabetic}/ + /\N{DIGIT ZERO}/ + +Removing this will permit extensions to Perl's pattern syntax and better +error checking for existing syntax. See L<perlre/Quantifiers> for an +example. + +=item * + +Revamping C<< "\Q" >> semantics in double-quotish strings when combined with other escapes. + +There are several bugs and inconsistencies involving combinations +of C<\Q> and escapes like C<\x>, C<\L>, etc., within a C<\Q...\E> pair. +These need to be fixed, and doing so will necessarily change current +behavior. The changes have not yet been settled. + +=back + +=head1 Incompatible Changes + +=head2 Special blocks called in void context + +Special blocks (C<BEGIN>, C<CHECK>, C<INIT>, C<UNITCHECK>, C<END>) are now +called in void context. This avoids wasteful copying of the result of the +last statement [perl #108794]. + +=head2 The C<overloading> pragma and regexp objects + +With C<no overloading>, regular expression objects returned by C<qr//> are +now stringified as "Regexp=REGEXP(0xbe600d)" instead of the regular +expression itself [perl #108780]. + +=head2 Two XS typemap Entries removed + +Two presumably unused XS typemap entries have been removed from the +core typemap: T_DATAUNIT and T_CALLBACK. If you are, against all odds, +a user of these, please see the instructions on how to restore them +in L<perlxstypemap>. + +=head2 Unicode 6.1 has incompatibilities with Unicode 6.0 + +These are detailed in L</Supports (almost) Unicode 6.1> above. +You can compile this version of Perl to use Unicode 6.0. See +L<perlunicode/Hacking Perl to work on earlier Unicode versions (for very serious hackers only)>. + +=head2 Borland compiler + +All support for the Borland compiler has been dropped. The code had not +worked for a long time anyway.
+ +=head2 Certain deprecated Unicode properties are no longer supported by default + +Perl should never have exposed certain Unicode properties that are used +by Unicode internally and not meant to be publicly available. Use of +these has generated deprecated warning messages since Perl 5.12. The +removed properties are Other_Alphabetic, +Other_Default_Ignorable_Code_Point, Other_Grapheme_Extend, +Other_ID_Continue, Other_ID_Start, Other_Lowercase, Other_Math, and +Other_Uppercase. + +Perl may be recompiled to include any or all of them; instructions are +given in +L<perluniprops/Unicode character properties that are NOT accepted by Perl>. + +=head2 Dereferencing IO thingies as typeglobs + +The C<*{...}> operator, when passed a reference to an IO thingy (as in +C<*{*STDIN{IO}}>), creates a new typeglob containing just that IO object. +Previously, it would stringify as an empty string, but some operators would +treat it as undefined, producing an "uninitialized" warning. +Now it stringifies as __ANONIO__ [perl #96326]. + +=head2 User-defined case-changing operations + +This feature was deprecated in Perl 5.14, and has now been removed. +The CPAN module L<Unicode::Casing> provides better functionality without +the drawbacks that this feature had, as are detailed in the 5.14 +documentation: +L<http://perldoc.perl.org/5.14.0/perlunicode.html#User-Defined-Case-Mappings-%28for-serious-hackers-only%29> + +=head2 XSUBs are now 'static' + +XSUB C functions are now 'static', that is, they are not visible from +outside the compilation unit. Users can use the new C<XS_EXTERNAL(name)> +and C<XS_INTERNAL(name)> macros to pick the desired linking behavior. +The ordinary C<XS(name)> declaration for XSUBs will continue to declare +non-'static' XSUBs for compatibility, but the XS compiler, +L<ExtUtils::ParseXS> (C<xsubpp>) will emit 'static' XSUBs by default. +L<ExtUtils::ParseXS>'s behavior can be reconfigured from XS using the +C<EXPORT_XSUB_SYMBOLS> keyword. 
See L<perlxs> for details. + +=head2 Weakening read-only references + +Weakening read-only references is no longer permitted. It should never +have worked anyway, and could sometimes result in crashes. + +=head2 Tying scalars that hold typeglobs + +Attempting to tie a scalar after a typeglob was assigned to it would +instead tie the handle in the typeglob's IO slot. This meant that it was +impossible to tie the scalar itself. Similar problems affected C<tied> and +C<untie>: C<tied $scalar> would return false on a tied scalar if the last +thing returned was a typeglob, and C<untie $scalar> on such a tied scalar +would do nothing. + +We fixed this problem before Perl 5.14.0, but it caused problems with some +CPAN modules, so we put in a deprecation cycle instead. + +Now the deprecation has been removed and this bug has been fixed. So +C<tie $scalar> will always tie the scalar, not the handle it holds. To tie +the handle, use C<tie *$scalar> (with an explicit asterisk). The same +applies to C<tied *$scalar> and C<untie *$scalar>. + +=head2 IPC::Open3 no longer provides C<xfork()>, C<xclose_on_exec()> +and C<xpipe_anon()> + +All three functions were private, undocumented, and unexported. They do +not appear to be used by any code on CPAN. Two have been inlined and one +deleted entirely. + +=head2 C<$$> no longer caches PID + +Previously, if one called fork(3) from C, Perl's +notion of C<$$> could go out of sync with what getpid() returns. By always +fetching the value of C<$$> via getpid(), this potential bug is eliminated. +Code that depends on the caching behavior will break. As described in +L<Core Enhancements|/C<$$> can be assigned to>, +C<$$> is now writable, but it will be reset during a +fork. + +=head2 C<$$> and C<getppid()> no longer emulate POSIX semantics under LinuxThreads + +The POSIX emulation of C<$$> and C<getppid()> under the obsolete +LinuxThreads implementation has been removed. 
+This only impacts users of Linux 2.4 and +users of Debian GNU/kFreeBSD up to and including 6.0, not the vast +majority of Linux installations that use NPTL threads. + +This means that C<getppid()>, like C<$$>, is now always guaranteed to +return the OS's idea of the current state of the process, not perl's +cached version of it. + +See the documentation for L<$$|perlvar/$$> for details. + +=head2 C<< $< >>, C<< $> >>, C<$(> and C<$)> are no longer cached + +Similarly to the changes to C<$$> and C<getppid()>, the internal +caching of C<< $< >>, C<< $> >>, C<$(> and C<$)> has been removed. + +When we cached these values our idea of what they were would drift out +of sync with reality if someone (e.g., someone embedding perl) called +C<sete?[ug]id()> without updating C<PL_e?[ug]id>. Having to deal with +this complexity wasn't worth it given how cheap the C<gete?[ug]id()> +system call is. + +This change will break a handful of CPAN modules that use the XS-level +C<PL_uid>, C<PL_gid>, C<PL_euid> or C<PL_egid> variables. + +The fix for those breakages is to use C<PerlProc_gete?[ug]id()> to +retrieve them (e.g., C<PerlProc_getuid()>), and not to assign to +C<PL_e?[ug]id> if you change the UID/GID/EUID/EGID. There is no longer +any need to do so since perl will always retrieve the up-to-date +version of those values from the OS. + +=head2 Which Non-ASCII characters get quoted by C<quotemeta> and C<\Q> has changed + +This is unlikely to result in a real problem, as Perl does not attach +special meaning to any non-ASCII character, so it is currently +irrelevant which are quoted or not. This change fixes bug [perl #77654] and +brings Perl's behavior more into line with Unicode's recommendations. +See L<perlfunc/quotemeta>. + +=head1 Performance Enhancements + +=over + +=item * + +Improved performance for Unicode properties in regular expressions + +=for comment Can this be compacted some? 
-- rjbs, 2012-02-20 + +Matching a code point against a Unicode property is now done via a +binary search instead of linear. This means for example that the worst +case for a 1000 item property is 10 probes instead of 1000. This +inefficiency has been compensated for in the past by permanently storing +in a hash the results of a given probe plus the results for the adjacent +64 code points, under the theory that near-by code points are likely to +be searched for. A separate hash was used for each mention of a Unicode +property in each regular expression. Thus, C<qr/\p{foo}abc\p{foo}/> +would generate two hashes. Any probes in one instance would be unknown +to the other, and the hashes could expand separately to be quite large +if the regular expression were used on many different widely-separated +code points. +Now, however, there is just one hash shared by all instances of a given +property. This means that if C<\p{foo}> is matched against "A" in one +regular expression in a thread, the result will be known immediately to +all regular expressions, and the relentless march of using up memory is +slowed considerably. + +=item * + +Version declarations with the C<use> keyword (e.g., C<use 5.012>) are now +faster, as they enable features without loading F<feature.pm>. + +=item * + +C<local $_> is faster now, as it no longer iterates through magic that it +is not going to copy anyway. + +=item * + +Perl 5.12.0 sped up the destruction of objects whose classes define +empty C<DESTROY> methods (to prevent autoloading), by simply not +calling such empty methods. This release takes this optimization a +step further, by not calling any C<DESTROY> method that begins with a +C<return> statement. This can be useful for destructors that are only +used for debugging: + + use constant DEBUG => 1; + sub DESTROY { return unless DEBUG; ... } + +Constant-folding will reduce the first statement to C<return;> if DEBUG +is set to 0, triggering this optimization. 
+ +=item * + +Assigning to a variable that holds a typeglob or copy-on-write scalar +is now much faster. Previously the typeglob would be stringified or +the copy-on-write scalar would be copied before being clobbered. + +=item * + +Assignment to C<substr> in void context is now more than twice its +previous speed. Instead of creating and returning a special lvalue +scalar that is then assigned to, C<substr> modifies the original string +itself. + +=item * + +C<substr> no longer calculates a value to return when called in void +context. + +=item * + +Due to changes in L<File::Glob>, Perl's C<glob> function and its C<< +<...> >> equivalent are now much faster. The splitting of the pattern +into words has been rewritten in C, resulting in speed-ups of 20% for +some cases. + +This does not affect C<glob> on VMS, as it does not use File::Glob. + +=item * + +The short-circuiting operators C<&&>, C<||>, and C<//>, when chained +(such as C<$a || $b || $c>), are now considerably faster to short-circuit, +due to reduced optree traversal. + +=item * + +The implementation of C<s///r> makes one fewer copy of the scalar's value. + +=item * + +Recursive calls to lvalue subroutines in lvalue scalar context use less +memory. + +=back + +=head1 Modules and Pragmata + +=head2 Deprecated Modules + +=over + +=item L<Version::Requirements> + +Version::Requirements is now DEPRECATED, use L<CPAN::Meta::Requirements>, +which is a drop-in replacement. It will be deleted from perl.git blead +in v5.17.0. + +=back + +=head2 New Modules and Pragmata + +=over 4 + +=item * + +L<arybase> -- this new module implements the C<$[> variable. + +=item * + +L<PerlIO::mmap> 0.010 has been added to the Perl core. + +The C<mmap> PerlIO layer is no longer implemented by perl itself, but has +been moved out into the new L<PerlIO::mmap> module. + +=back + +=head2 Updated Modules and Pragmata + +This is only an overview of selected module updates. 
For a complete list of +updates, run: + + $ corelist --diff 5.14.0 5.16.0 + +You can substitute your favorite version in place of 5.14.0, too. + +=over 4 + +=item * + +L<Archive::Extract> has been upgraded from version 0.48 to 0.58. + +Includes a fix for FreeBSD to only use C<unzip> if it is located in +C</usr/local/bin>, as FreeBSD 9.0 will ship with a limited C<unzip> in +C</usr/bin>. + +=item * + +L<Archive::Tar> has been upgraded from version 1.76 to 1.82. + +Adjustments to handle files >8gb (>0777777777777 octal) and a feature +to return the MD5SUM of files in the archive. + +=item * + +L<base> has been upgraded from version 2.16 to 2.18. + +C<base> no longer sets a module's C<$VERSION> to "-1" when a module it +loads does not define a C<$VERSION>. This change has been made because +"-1" is not a valid version number under the new "lax" criteria used +internally by C<UNIVERSAL::VERSION>. (See L<version> for more on "lax" +version criteria.) + +C<base> no longer internally skips loading modules it has already loaded +and instead relies on C<require> to inspect C<%INC>. This fixes a bug +when C<base> is used with code that clears C<%INC> to force a module to +be reloaded. + +=item * + +L<Carp> has been upgraded from version 1.20 to 1.26. + +It now includes last read filehandle info and puts a dot after the file +and line number, just like errors from C<die> [perl #106538]. + +=item * + +L<charnames> has been updated from version 1.18 to 1.30. + +C<charnames> can now be invoked with a new option, C<:loose>, +which is like the existing C<:full> option, but enables Unicode loose +name matching. Details are in L<charnames/LOOSE MATCHES>. + +=item * + +L<B::Deparse> has been upgraded from version 1.03 to 1.14. This fixes +numerous deparsing bugs. + +=item * + +L<CGI> has been upgraded from version 3.52 to 3.59. + +It uses the public and documented FCGI.pm API in CGI::Fast. 
CGI::Fast was +using an FCGI API that was deprecated and removed from documentation +more than ten years ago. Usage of this deprecated API with FCGI E<gt>= +0.70 and FCGI E<lt>= 0.73 introduces a security issue. +L<https://rt.cpan.org/Public/Bug/Display.html?id=68380> +L<http://web.nvd.nist.gov/view/vuln/detail?vulnId=CVE-2011-2766> + +Things that may break your code: + +C<url()> was fixed to return C<PATH_INFO> when it is explicitly requested +with either the C<path=E<gt>1> or C<path_info=E<gt>1> flag. + +If your code is running under mod_rewrite (or compatible) and you are +calling C<self_url()> or you are calling C<url()> and passing +C<path_info=E<gt>1>, these methods will actually be returning +C<PATH_INFO> now, as you have explicitly requested or C<self_url()> +has requested on your behalf. + +The C<PATH_INFO> has been omitted in such URLs since the issue was +introduced in the 3.12 release in December, 2005. + +This bug is so old that your application may have come to depend on it or +work around it. Check your application before upgrading to this release. + +Examples of affected method calls: + + $q->url(-absolute => 1, -query => 1, -path_info => 1); + $q->url(-path=>1); + $q->url(-full=>1,-path=>1); + $q->url(-rewrite=>1,-path=>1); + $q->self_url(); + +We no longer read from STDIN when the Content-Length is not set, +preventing requests with no Content-Length from sometimes freezing. +This is consistent with the CGI RFC 3875, and is also consistent with +CGI::Simple. However, the old behavior may have been expected by some +command-line uses of CGI.pm. + +In addition, the DELETE HTTP verb is now supported. + +=item * + +L<Compress::Zlib> has been upgraded from version 2.035 to 2.048. + +IO::Compress::Zip and IO::Uncompress::Unzip now have support for LZMA +(method 14). There is a fix for a CRC issue in IO::Uncompress::Unzip and +it supports Streamed Stored context now. It also fixes a Zip64 issue in +IO::Compress::Zip when the content size was exactly 0xFFFFFFFF. 
+ +=item * + +L<Digest::SHA> has been upgraded from version 5.61 to 5.71. + +Added BITS mode to the addfile method and shasum. This makes +partial-byte inputs possible via files/STDIN and lets shasum check +all 8074 NIST Msg vectors, where previously special programming was +required to do this. + +=item * + +L<Encode> has been upgraded from version 2.42 to 2.44. + +Missing aliases added, a deep recursion error fixed and various +documentation updates. + +Addressed 'decode_xs n-byte heap-overflow' security bug in Unicode.xs +(CVE-2011-2939). (5.14.2) + +=item * + +L<ExtUtils::CBuilder> updated from version 0.280203 to 0.280206. + +The new version appends CFLAGS and LDFLAGS to their Config.pm +counterparts. + +=item * + +L<ExtUtils::ParseXS> has been upgraded from version 2.2210 to 3.16. + +Much of L<ExtUtils::ParseXS>, the module behind the XS compiler C<xsubpp>, +was rewritten and cleaned up. It has been made somewhat more extensible +and now finally uses strictures. + +The typemap logic has been moved into a separate module, +L<ExtUtils::Typemaps>. See L</New Modules and Pragmata>, above. + +For a complete set of changes, please see the ExtUtils::ParseXS +changelog, available on the CPAN. + +=item * + +L<File::Glob> has been upgraded from version 1.12 to 1.17. + +On Windows, tilde (~) expansion now checks the C<USERPROFILE> environment +variable, after checking C<HOME>. + +It has a new C<:bsd_glob> export tag, intended to replace C<:glob>. Like +C<:glob> it overrides C<glob> with a function that does not split the glob +pattern into words, but, unlike C<:glob>, it iterates properly in scalar +context, instead of returning the last file. + +There are other changes affecting Perl's own C<glob> operator (which uses +File::Glob internally, except on VMS). See L</Performance Enhancements> +and L</Selected Bug Fixes>. + +=item * + +L<FindBin> updated from version 1.50 to 1.51. 
+ +It no longer returns a wrong result if a script of the same name as the +current one exists in the path and is executable. + +=item * + +L<HTTP::Tiny> has been upgraded from version 0.012 to 0.017. + +Added support for using C<$ENV{http_proxy}> to set the default proxy host. + +Adds additional shorthand methods for all common HTTP verbs, +a C<post_form()> method for POST-ing x-www-form-urlencoded data and +a C<www_form_urlencode()> utility method. + +=item * + +L<IO> has been upgraded from version 1.25_04 to 1.25_06, and L<IO::Handle> +from version 1.31 to 1.33. + +Together, these upgrades fix a problem with IO::Handle's C<getline> and +C<getlines> methods. When these methods are called on the special ARGV +handle, the next file is automatically opened, as happens with the built-in +C<E<lt>E<gt>> and C<readline> functions. But, unlike the built-ins, these +methods were not respecting the caller's use of the L<open> pragma and +applying the appropriate I/O layers to the newly-opened file +[rt.cpan.org #66474]. + +=item * + +L<IPC::Cmd> has been upgraded from version 0.70 to 0.76. + +Capturing of command output (both C<STDOUT> and C<STDERR>) is now supported +using L<IPC::Open3> on MSWin32 without requiring L<IPC::Run>. + +=item * + +L<IPC::Open3> has been upgraded from version 1.09 to 1.12. + +Fixes a bug which prevented use of C<open3> on Windows when C<*STDIN>, +C<*STDOUT> or C<*STDERR> had been localized. + +Fixes a bug which prevented duplicating numeric file descriptors on Windows. + +C<open3> with "-" for the program name works once more. This was broken in +version 1.06 (and hence in Perl 5.14.0) [perl #95748]. + +=item * + +L<Locale::Codes> has been upgraded from version 3.16 to 3.21. + +Added Language Extension codes (langext) and Language Variation codes (langvar) +as defined in the IANA language registry. 
+ +Added language codes from ISO 639-5 + +Added language/script codes from the IANA language subtag registry + +Fixed an uninitialized value warning [rt.cpan.org #67438]. + +Fixed the return value for the all_XXX_codes and all_XXX_names functions +[rt.cpan.org #69100]. + +Reorganized modules to move Locale::MODULE to Locale::Codes::MODULE to allow +for cleaner future additions. The original four modules (Locale::Language, +Locale::Currency, Locale::Country, Locale::Script) will continue to work, but +all new sets of codes will be added in the Locale::Codes namespace. + +The code2XXX, XXX2code, all_XXX_codes, and all_XXX_names functions now +support retired codes. All codesets may be specified by a constant or +by their name now. Previously, they were specified only by a constant. + +The alias_code function exists for backward compatibility. It has been +replaced by rename_country_code. The alias_code function will be +removed some time after September, 2013. + +All work is now done in the central module (Locale::Codes). Previously, +some was still done in the wrapper modules (Locale::Codes::*). Added +Language Family codes (langfam) as defined in ISO 639-5. + +=item * + +L<Math::BigFloat> has been upgraded from version 1.993 to 1.997. + +The C<numify> method has been corrected to return a normalized Perl number +(the result of C<0 + $thing>), instead of a string [rt.cpan.org #66732]. + +=item * + +L<Math::BigInt> has been upgraded from version 1.994 to 1.998. + +It provides a new C<bsgn> method that complements the C<babs> method. + +It fixes the internal C<objectify> function's handling of "foreign objects" +so they are converted to the appropriate class (Math::BigInt or +Math::BigFloat). + +=item * + +L<Math::BigRat> has been upgraded from version 0.2602 to 0.2603. + +C<int()> on a Math::BigRat object containing -1/2 now creates a +Math::BigInt containing 0, rather than -0. 
L<Math::BigInt> does not even +support negative zero, so the resulting object was actually malformed +[perl #95530]. + +=item * + +L<Math::Complex> has been upgraded from version 1.56 to 1.59 +and L<Math::Trig> from version 1.2 to 1.22. + +Fixes include: correct copy constructor usage; fix polarwise formatting with +numeric format specifier; and more stable C<great_circle_direction> algorithm. + +=item * + +L<Module::CoreList> has been upgraded from version 2.51 to 2.66. + +The C<corelist> utility now understands the C<-r> option for displaying +Perl release dates and the C<--diff> option to print the set of modlib +changes between two perl distributions. + +=item * + +L<Module::Metadata> has been upgraded from version 1.000004 to 1.000009. + +Adds C<provides> method to generate a CPAN META provides data structure +correctly; use of C<package_versions_from_directory> is discouraged. + +=item * + +L<ODBM_File> has been upgraded from version 1.10 to 1.12. + +The XS code is now compiled with C<PERL_NO_GET_CONTEXT>, which will aid +performance under ithreads. + +=item * + +L<open> has been upgraded from version 1.08 to 1.10. + +It no longer turns off layers on standard handles when invoked without the +":std" directive. Similarly, when invoked I<with> the ":std" directive, it +now clears layers on STDERR before applying the new ones, and not just on +STDIN and STDOUT [perl #92728]. + +=item * + +L<overload> has been upgraded from version 1.13 to 1.18. + +C<overload::Overloaded> no longer calls C<can> on the class, but uses +another means to determine whether the object has overloading. It was +never correct for it to call C<can>, as overloading does not respect +AUTOLOAD. So classes that autoload methods and implement C<can> no longer +have to account for overloading [perl #40333]. + +A warning is now produced for invalid arguments. See L</New Diagnostics>. + +=item * + +L<PerlIO::scalar> has been upgraded from version 0.11 to 0.14. 
+ +(This is the module that implements C<< open $fh, '>', \$scalar >>.) + +It fixes a problem with C<< open my $fh, ">", \$scalar >> not working if +C<$scalar> is a copy-on-write scalar. (5.14.2) + +It also fixes a hang that occurs with C<readline> or C<< <$fh> >> if a +typeglob has been assigned to $scalar [perl #92258]. + +It no longer assumes during C<seek> that $scalar is a string internally. +If it didn't crash, it was close to doing so [perl #92706]. Also, the +internal print routine no longer assumes that the position set by C<seek> +is valid, but extends the string to that position, filling the intervening +bytes (between the old length and the seek position) with nulls +[perl #78980]. + +Printing to an in-memory handle now works if the $scalar holds a reference, +stringifying the reference before modifying it. References used to be +treated as empty strings. + +Printing to an in-memory handle no longer crashes if the $scalar happens to +hold a number internally, but no string buffer. + +Printing to an in-memory handle no longer creates scalars that confuse +the regular expression engine [perl #108398]. + +=item * + +L<Pod::Functions> has been upgraded from version 1.04 to 1.05. + +F<Functions.pm> is now generated at perl build time from annotations in +F<perlfunc.pod>. This will ensure that L<Pod::Functions> and L<perlfunc> +remain in synchronisation. + +=item * + +L<Pod::Html> has been upgraded from version 1.11 to 1.1502. + +This is an extensive rewrite of Pod::Html to use L<Pod::Simple> under +the hood. The output has changed significantly. + +=item * + +L<Pod::Perldoc> has been upgraded from version 3.15_03 to 3.17. + +It corrects the search paths on VMS [perl #90640]. (5.14.1) + +The B<-v> option now fetches the right section for C<$0>. + +This upgrade has numerous significant fixes. Consult its changelog on +the CPAN for more information. + +=item * + +L<POSIX> has been upgraded from version 1.24 to 1.30. + +L<POSIX> no longer uses L<AutoLoader>. 
Any code which was relying on this +implementation detail was buggy, and may fail because of this change. +The module's Perl code has been considerably simplified, roughly halving +the number of lines, with no change in functionality. The XS code has +been refactored to reduce the size of the shared object by about 12%, +with no change in functionality. More POSIX functions now have tests. + +C<sigsuspend> and C<pause> now run signal handlers before returning, as the +whole point of these two functions is to wait until a signal has +arrived, and then return I<after> it has been triggered. Delayed, or +"safe", signals were preventing that from happening, possibly resulting in +race conditions [perl #107216]. + +C<POSIX::sleep> is now a direct call into the underlying OS C<sleep> +function, instead of being a Perl wrapper on C<CORE::sleep>. +C<POSIX::dup2> now returns the correct value on Win32 (I<i.e.>, the file +descriptor). C<POSIX::SigSet> C<sigsuspend> and C<sigpending> and +C<POSIX::pause> now dispatch safe signals immediately before returning to +their caller. + +C<POSIX::Termios::setattr> now defaults the third argument to C<TCSANOW>, +instead of 0. On most platforms C<TCSANOW> is defined to be 0, but on some +0 is not a valid parameter, which caused a call with defaults to fail. + +=item * + +L<Socket> has been upgraded from version 1.94 to 2.001. + +It has new functions and constants for handling IPv6 sockets: + + pack_ipv6_mreq + unpack_ipv6_mreq + IPV6_ADD_MEMBERSHIP + IPV6_DROP_MEMBERSHIP + IPV6_MTU + IPV6_MTU_DISCOVER + IPV6_MULTICAST_HOPS + IPV6_MULTICAST_IF + IPV6_MULTICAST_LOOP + IPV6_UNICAST_HOPS + IPV6_V6ONLY + +=item * + +L<Storable> has been upgraded from version 2.27 to 2.34. + +It no longer turns copy-on-write scalars into read-only scalars when +freezing and thawing. + +=item * + +L<Sys::Syslog> has been upgraded from version 0.27 to 0.29. + +This upgrade closes many outstanding bugs. 
+ +=item * + +L<Term::ANSIColor> has been upgraded from version 3.00 to 3.01. + +Only interpret an initial array reference as a list of colors, not any initial +reference, allowing the colored function to work properly on objects with +stringification defined. + +=item * + +L<Term::ReadLine> has been upgraded from version 1.07 to 1.09. + +Term::ReadLine now supports any event loop, including unpublished ones and +simple L<IO::Select> loops, without the need to rewrite existing code for +any particular framework [perl #108470]. + +=item * + +L<threads::shared> has been upgraded from version 1.37 to 1.40. + +Destructors on shared objects used to be ignored sometimes if the objects +were referenced only by shared data structures. This has been mostly +fixed, but destructors may still be ignored if the objects still exist at +global destruction time [perl #98204]. + +=item * + +L<Unicode::Collate> has been upgraded from version 0.73 to 0.89. + +Updated to CLDR 1.9.1 + +Locales updated to CLDR 2.0: mk, mt, nb, nn, ro, ru, sk, sr, sv, uk, +zh__pinyin, zh__stroke + +Newly supported locales: bn, fa, ml, mr, or, pa, sa, si, si__dictionary, +sr_Latn, sv__reformed, ta, te, th, ur, wae. + +Tailored compatibility ideographs as well as unified ideographs for the +locales: ja, ko, zh__big5han, zh__gb2312han, zh__pinyin, zh__stroke. + +Locale/*.pl files are now searched for in @INC. + +=item * + +L<Unicode::Normalize> has been upgraded from version 1.10 to 1.14. + +Fixes for the removal of F<unicore/CompositionExclusions.txt> from core. + +=item * + +L<Unicode::UCD> has been upgraded from version 0.32 to 0.43. + +This adds four new functions: C<prop_aliases()> and +C<prop_value_aliases()>, which are used to find all Unicode-approved +synonyms for property names, or to convert from one name to another; +C<prop_invlist> which returns all code points matching a given +Unicode binary property; and C<prop_invmap> which returns the complete +specification of a given Unicode property. 
+ +=item * + +L<Win32API::File> has been upgraded from version 0.1101 to 0.1200. + +Added SetStdHandle and GetStdHandle functions. + +=back + +=head2 Removed Modules and Pragmata + +As promised in Perl 5.14.0's release notes, the following modules have +been removed from the core distribution, and if needed should be installed +from CPAN instead. + +=over + +=item * + +L<Devel::DProf> has been removed from the Perl core. Prior version was +20110228.00. + +=item * + +L<Shell> has been removed from the Perl core. Prior version was 0.72_01. + +=item * + +Several old perl4-style libraries which have been deprecated with 5.14 +are now removed: + + abbrev.pl assert.pl bigfloat.pl bigint.pl bigrat.pl cacheout.pl + complete.pl ctime.pl dotsh.pl exceptions.pl fastcwd.pl flush.pl + getcwd.pl getopt.pl getopts.pl hostname.pl importenv.pl + lib/find{,depth}.pl look.pl newgetopt.pl open2.pl open3.pl + pwd.pl shellwords.pl stat.pl tainted.pl termcap.pl timelocal.pl + +They can be found on CPAN as L<Perl4::CoreLibs>. + +=back + +=head1 Documentation + +=head2 New Documentation + +=head3 L<perldtrace> + +L<perldtrace> describes Perl's DTrace support, listing the provided probes +and giving examples of their use. + +=head3 L<perlexperiment> + +This document is intended to provide a list of experimental features in +Perl. It is still a work in progress. + +=head3 L<perlootut> + +This is a new OO tutorial. It focuses on basic OO concepts, and then recommends +that readers choose an OO framework from CPAN. + +=head3 L<perlxstypemap> + +The new manual describes the XS typemapping mechanism in unprecedented +detail and combines new documentation with information extracted from +L<perlxs> and the previously unofficial list of all core typemaps. + +=head2 Changes to Existing Documentation + +=head3 L<perlapi> + +=over 4 + +=item * + +The HV API has long accepted negative lengths to show that the key is +in UTF8. This is now documented. + +=item * + +The C<boolSV()> macro is now documented. 
+ +=back + +=head3 L<perlfunc> + +=over 4 + +=item * + +C<dbmopen> treats a 0 mode as a special case, that prevents a nonexistent +file from being created. This has been the case since Perl 5.000, but was +never documented anywhere. Now the perlfunc entry mentions it +[perl #90064]. + +=item * + +As an accident of history, C<open $fh, '<:', ...> applies the default +layers for the platform (C<:raw> on Unix, C<:crlf> on Windows), ignoring +whatever is declared by L<open.pm|open>. This seems such a useful feature +it has been documented in L<perlfunc|perlfunc/open> and L<open>. + +=item * + +The entry for C<split> has been rewritten. It is now far clearer than +before. + +=back + +=head3 L<perlguts> + +=over 4 + +=item * + +A new section, L<Autoloading with XSUBs|perlguts/Autoloading with XSUBs>, +has been added, which explains the two APIs for accessing the name of the +autoloaded sub. + +=item * + +Some function descriptions in L<perlguts> were confusing, as it was +not clear whether they referred to the function above or below the +description. This has been clarified [perl #91790]. + +=back + +=head3 L<perlobj> + +=over 4 + +=item * + +This document has been rewritten from scratch, and its coverage of various OO +concepts has been expanded. + +=back + +=head3 L<perlop> + +=over 4 + +=item * + +Documentation of the smartmatch operator has been reworked and moved from +perlsyn to perlop where it belongs. + +It has also been corrected for the case of C<undef> on the left-hand +side. The list of different smart match behaviors had an item in the +wrong place. + +=item * + +Documentation of the ellipsis statement (C<...>) has been reworked and +moved from perlop to perlsyn. + +=item * + +The explanation of bitwise operators has been expanded to explain how they +work on Unicode strings (5.14.1). + +=item * + +More examples for C<m//g> have been added (5.14.1). + +=item * + +The C<<< <<\FOO >>> here-doc syntax has been documented (5.14.1). 
+ +=back + +=head3 L<perlpragma> + +=over 4 + +=item * + +There is now a standard convention for naming keys in the C<%^H>, +documented under L<Key naming|perlpragma/Key naming>. + +=back + +=head3 L<perlsec/Laundering and Detecting Tainted Data> + +=over 4 + +=item * + +The example function for checking for taintedness contained a subtle +error. C<$@> needs to be localized to prevent its changing this +global's value outside the function. The preferred method to check for +this remains L<Scalar::Util/tainted>. + +=back + +=head3 L<perllol> + +=over + +=item * + +L<perllol> has been expanded with examples using the new C<push $scalar> +syntax introduced in Perl 5.14.0 (5.14.1). + +=back + +=head3 L<perlmod> + +=over + +=item * + +L<perlmod> now states explicitly that some types of explicit symbol table +manipulation are not supported. This codifies what was effectively already +the case [perl #78074]. + +=back + +=head3 L<perlpodstyle> + +=over 4 + +=item * + +The tips on which formatting codes to use have been corrected and greatly +expanded. + +=item * + +There are now a couple of example one-liners for previewing POD files after +they have been edited. + +=back + +=head3 L<perlre> + +=over + +=item * + +The C<(*COMMIT)> directive is now listed in the right section +(L<Verbs without an argument|perlre/Verbs without an argument>). + +=back + +=head3 L<perlrun> + +=over + +=item * + +L<perlrun> has undergone a significant clean-up. Most notably, the +B<-0x...> form of the B<-0> flag has been clarified, and the final section +on environment variables has been corrected and expanded (5.14.1). + +=back + +=head3 L<perlsub> + +=over + +=item * + +The ($;) prototype syntax, which has existed for rather a long time, is now +documented in L<perlsub>. It lets a unary function have the same +precedence as a list operator. + +=back + +=head3 L<perltie> + +=over + +=item * + +The required syntax for tying handles has been documented. 
+ +=back + +=head3 L<perlvar> + +=over + +=item * + +The documentation for L<$!|perlvar/$!> has been corrected and clarified. +It used to state that $! could be C<undef>, which is not the case. It was +also unclear whether system calls set C's C<errno> or Perl's C<$!> +[perl #91614]. + +=item * + +Documentation for L<$$|perlvar/$$> has been amended with additional +cautions regarding changing the process ID. + +=back + +=head3 Other Changes + +=over 4 + +=item * + +L<perlxs> was extended with documentation on inline typemaps. + +=item * + +L<perlref> has a new L<Circular References|perlref/Circular References> +section explaining how circularities may not be freed and how to solve that +with weak references. + +=item * + +Parts of L<perlapi> were clarified, and Perl equivalents of some C +functions have been added as an additional mode of exposition. + +=item * + +A few parts of L<perlre> and L<perlrecharclass> were clarified. + +=back + +=head2 Removed Documentation + +=head3 Old OO Documentation + +The old OO tutorials, perltoot, perltooc, and perlboot, have been +removed. The perlbot (bag of object tricks) document has been removed +as well. + +=head3 Development Deltas + +The perldelta files for development releases are no longer packaged with +perl. These can still be found in the perl source code repository. + +=head1 Diagnostics + +The following additions or changes have been made to diagnostic output, +including warnings and fatal error messages. For the complete list of +diagnostic messages, see L<perldiag>. + +=head2 New Diagnostics + +=head3 New Errors + +=over 4 + +=item * + +L<Cannot set tied @DB::args|perldiag/"Cannot set tied @DB::args"> + +This error occurs when C<caller> tries to set C<@DB::args> but finds it +tied. Before this error was added, it used to crash instead. 
+ +=item * + +L<Cannot tie unreifiable array|perldiag/"Cannot tie unreifiable array"> + +This error is part of a safety check that the C<tie> operator does before +tying a special array like C<@_>. You should never see this message. + +=item * + +L<&CORE::%s cannot be called directly|perldiag/"&CORE::%s cannot be called directly"> + +This occurs when a subroutine in the C<CORE::> namespace is called +with C<&foo> syntax or through a reference. Some subroutines +in this package cannot yet be called that way, but must be +called as barewords. See L</Subroutines in the C<CORE> namespace>, above. + +=item * + +L<Source filters apply only to byte streams|perldiag/"Source filters apply only to byte streams"> + +This new error occurs when you try to activate a source filter (usually by +loading a source filter module) within a string passed to C<eval> under the +C<unicode_eval> feature. + +=back + +=head3 New Warnings + +=over 4 + +=item * + +L<defined(@array) is deprecated|perldiag/"defined(@array) is deprecated"> + +The long-deprecated C<defined(@array)> now also warns for package variables. +Previously it issued a warning for lexical variables only. + +=item * + +L<length() used on %s|perldiag/length() used on %s> + +This new warning occurs when C<length> is used on an array or hash, instead +of C<scalar(@array)> or C<scalar(keys %hash)>. + +=item * + +L<lvalue attribute %s already-defined subroutine|perldiag/"lvalue attribute %s already-defined subroutine"> + +L<attributes.pm|attributes> now emits this warning when the :lvalue +attribute is applied to a Perl subroutine that has already been defined, as +doing so can have unexpected side-effects. + +=item * + +L<overload arg '%s' is invalid|perldiag/"overload arg '%s' is invalid"> + +This warning, in the "overload" category, is produced when the overload +pragma is given an argument it doesn't recognize, presumably a mistyped +operator. 
+ +=item * + +L<$[ used in %s (did you mean $] ?)|perldiag/"$[ used in %s (did you mean $] ?)"> + +This new warning exists to catch the mistaken use of C<$[> in version +checks. C<$]>, not C<$[>, contains the version number. + +=item * + +L<Useless assignment to a temporary|perldiag/"Useless assignment to a temporary"> + +Assigning to a temporary scalar returned +from an lvalue subroutine now produces this +warning [perl #31946]. + +=item * + +L<Useless use of \E|perldiag/"Useless use of \E"> + +C<\E> does nothing unless preceded by C<\Q>, C<\L> or C<\U>. + +=back + +=head2 Removed Errors + +=over + +=item * + +"sort is now a reserved word" + +This error used to occur when C<sort> was called without arguments, +followed by C<;> or C<)>. (E.g., C<sort;> would die, but C<{sort}> was +OK.) This error message was added in Perl 3 to catch code like +C<close(sort)> which would no longer work. More than two decades later, +this message is no longer appropriate. Now C<sort> without arguments is +always allowed, and returns an empty list, as it did in those cases +where it was already allowed [perl #90030]. + +=back + +=head2 Changes to Existing Diagnostics + +=over 4 + +=item * + +The "Applying pattern match..." or similar warning produced when an +array or hash is on the left-hand side of the C<=~> operator now +mentions the name of the variable. + +=item * + +The "Attempt to free non-existent shared string" has had the spelling +of "non-existent" corrected to "nonexistent". It was already listed +with the correct spelling in L<perldiag>. + +=item * + +The error messages for using C<default> and C<when> outside a +topicalizer have been standardized to match the messages for C<continue> +and loop controls. They now read 'Can't "default" outside a +topicalizer' and 'Can't "when" outside a topicalizer'. They both used +to be 'Can't use when() outside a topicalizer' [perl #91514]. 
+ +=item * + +The message, "Code point 0x%X is not Unicode, no properties match it; +all inverse properties do" has been changed to "Code point 0x%X is not +Unicode, all \p{} matches fail; all \P{} matches succeed". + +=item * + +Redefinition warnings for constant subroutines used to be mandatory, +even occurring under C<no warnings>. Now they respect the L<warnings> +pragma. + +=item * + +The "glob failed" warning message is now suppressible via C<no warnings> +[perl #111656]. + +=item * + +The L<Invalid version format|perldiag/"Invalid version format (%s)"> +error message now says "negative version number" within the parentheses, +rather than "non-numeric data", for negative numbers. + +=item * + +The two warnings +L<Possible attempt to put comments in qw() list|perldiag/"Possible attempt to put comments in qw() list"> +and +L<Possible attempt to separate words with commas|perldiag/"Possible attempt to separate words with commas"> +are no longer mutually exclusive: the same C<qw> construct may produce +both. + +=item * + +The uninitialized warning for C<y///r> when C<$_> is implicit and +undefined now mentions the variable name, just like the non-/r variation +of the operator. + +=item * + +The 'Use of "foo" without parentheses is ambiguous' warning has been +extended to apply also to user-defined subroutines with a (;$) +prototype, and not just to built-in functions. + +=item * + +Warnings that mention the names of lexical (C<my>) variables with +Unicode characters in them now respect the presence or absence of the +C<:utf8> layer on the output handle, instead of outputting UTF8 +regardless. Also, the correct names are included in the strings passed +to C<$SIG{__WARN__}> handlers, rather than the raw UTF8 bytes. 
+ +=back + +=head1 Utility Changes + +=head3 L<h2ph> + +=over 4 + +=item * + +L<h2ph> used to generate code of the form + + unless(defined(&FOO)) { + sub FOO () {42;} + } + +But the subroutine is a compile-time declaration, and is hence unaffected +by the condition. It has now been corrected to emit a string C<eval> +around the subroutine [perl #99368]. + +=back + +=head3 L<splain> + +=over 4 + +=item * + +F<splain> no longer emits backtraces with the first line number repeated. + +This: + + Uncaught exception from user code: + Cannot fwiddle the fwuddle at -e line 1. + at -e line 1 + main::baz() called at -e line 1 + main::bar() called at -e line 1 + main::foo() called at -e line 1 + +has become this: + + Uncaught exception from user code: + Cannot fwiddle the fwuddle at -e line 1. + main::baz() called at -e line 1 + main::bar() called at -e line 1 + main::foo() called at -e line 1 + +=item * + +Some error messages consist of multiple lines that are listed as separate +entries in L<perldiag>. splain has been taught to find the separate +entries in these cases, instead of simply failing to find the message. + +=back + +=head3 L<zipdetails> + +=over 4 + +=item * + +This is a new utility, included as part of an +L<IO::Compress::Base> upgrade. + +L<zipdetails> displays information about the internal record structure +of the zip file. It is not concerned with displaying any details of +the compressed data stored in the zip file. + +=back + +=head1 Configuration and Compilation + +=over 4 + +=item * + +F<regexp.h> has been modified for compatibility with GCC's B<-Werror> +option, as used by some projects that include perl's header files (5.14.1). + +=item * + +C<USE_LOCALE{,_COLLATE,_CTYPE,_NUMERIC}> have been added to the output of perl -V +as they affect the behavior of the interpreter binary (albeit +in only a small area). 
+ +=item * + +The code and tests for L<IPC::Open2> have been moved from F<ext/IPC-Open2> +into F<ext/IPC-Open3>, as C<IPC::Open2::open2()> is implemented as a thin +wrapper around C<IPC::Open3::_open3()>, and hence is very tightly coupled to +it. + +=item * + +The magic types and magic vtables are now generated from data in a new script +F<regen/mg_vtable.pl>, instead of being maintained by hand. As different +EBCDIC variants can't agree on the code point for '~', the character to code +point conversion is done at build time by F<generate_uudmap> to a new generated +header F<mg_data.h>. C<PL_vtbl_bm> and C<PL_vtbl_fm> are now defined by the +pre-processor as C<PL_vtbl_regexp>, instead of being distinct C variables. +C<PL_vtbl_sig> has been removed. + +=item * + +Building with C<-DPERL_GLOBAL_STRUCT> works again. This configuration is not +generally used. + +=item * + +Perl configured with I<MAD> now correctly frees C<MADPROP> structures when +OPs are freed. C<MADPROP>s are now allocated with C<PerlMemShared_malloc()>. + +=item * + +F<makedef.pl> has been refactored. This should have no noticeable effect on +any of the platforms that use it as part of their build (AIX, VMS, Win32). + +=item * + +C<useperlio> can no longer be disabled. + +=item * + +The file F<global.sym> is no longer needed, and has been removed. It +contained a list of all exported functions, one of the files generated by +F<regen/embed.pl> from data in F<embed.fnc> and F<regen/opcodes>. The code +has been refactored so that the only user of F<global.sym>, F<makedef.pl>, +now reads F<embed.fnc> and F<regen/opcodes> directly, removing the need to +store the list of exported functions in an intermediate file. + +As F<global.sym> was never installed, this change should not be visible +outside the build process. + +=item * + +F<pod/buildtoc>, used by the build process to build L<perltoc>, has been +refactored and simplified. 
It now contains only code to build L<perltoc>; +the code to regenerate Makefiles has been moved to F<Porting/pod_rules.pl>. +It's a bug if this change has any material effect on the build process. + +=item * + +F<pod/roffitall> is now built by F<pod/buildtoc>, instead of being +shipped with the distribution. Its list of manpages is now generated +(and therefore current). See also RT #103202 for an unresolved related +issue. + +=item * + +The man page for C<XS::Typemap> is no longer installed. C<XS::Typemap> +is a test module which is not installed, hence installing its +documentation makes no sense. + +=item * + +The -Dusesitecustomize and -Duserelocatableinc options now work +together properly. + +=back + +=head1 Platform Support + +=head2 Platform-Specific Notes + +=head3 Cygwin + +=over 4 + +=item * + +Since version 1.7, Cygwin supports native UTF-8 paths. If Perl is built +under that environment, directory and filenames will be UTF-8 encoded. + +=item * + +Cygwin does not initialize all original Win32 environment variables. See +F<README.cygwin> for a discussion of the newly-added +C<Cygwin::sync_winenv()> function [perl #110190] and for +further links. + +=back + +=head3 HP-UX + +=over 4 + +=item * + +HP-UX PA-RISC/64 now supports gcc-4.x + +A fix to correct the socketsize now makes the test suite pass on HP-UX +PA-RISC for 64bitall builds. (5.14.2) + +=back + +=head3 VMS + +=over 4 + +=item * + +Remove unnecessary includes, fix miscellaneous compiler warnings and +close some unclosed comments on F<vms/vms.c>. + +=item * + +Remove sockadapt layer from the VMS build. + +=item * + +Explicit support for VMS versions before v7.0 and DEC C versions +before v6.0 has been removed. + +=item * + +Since Perl 5.10.1, the home-grown C<stat> wrapper has been unable to +distinguish between a directory name containing an underscore and an +otherwise-identical filename containing a dot in the same position +(e.g., t/test_pl as a directory and t/test.pl as a file). 
This problem +has been corrected. + +=item * + +The build on VMS now permits names of the resulting symbols in C code for +Perl longer than 31 characters. Symbols like +C<Perl__it_was_the_best_of_times_it_was_the_worst_of_times> can now be +created freely without causing the VMS linker to seize up. + +=back + +=head3 GNU/Hurd + +=over 4 + +=item * + +Numerous build and test failures on GNU/Hurd have been resolved with hints +for building DBM modules, detection of the library search path, and enabling +of large file support. + +=back + +=head3 OpenVOS + +=over 4 + +=item * + +Perl is now built with dynamic linking on OpenVOS, the minimum supported +version of which is now Release 17.1.0. + +=back + +=head3 SunOS + +The CC workshop C++ compiler is now detected and used on systems that ship +without cc. + +=head1 Internal Changes + +=over 4 + +=item * + +The compiled representation of formats is now stored via the C<mg_ptr> of +their C<PERL_MAGIC_fm>. Previously it was stored in the string buffer, +beyond C<SvLEN()>, the regular end of the string. C<SvCOMPILED()> and +C<SvCOMPILED_{on,off}()> now exist solely for compatibility for XS code. +The first is always 0, the other two now no-ops. (5.14.1) + +=item * + +Some global variables have been marked C<const>, members in the interpreter +structure have been re-ordered, and the opcodes have been re-ordered. The +op C<OP_AELEMFAST> has been split into C<OP_AELEMFAST> and C<OP_AELEMFAST_LEX>. + +=item * + +When emptying a hash of its elements (e.g., via undef(%h), or %h=()), HvARRAY +field is no longer temporarily zeroed. Any destructors called on the freed +elements see the remaining elements. Thus, %h=() becomes more like +C<delete $h{$_} for keys %h>. + +=item * + +Boyer-Moore compiled scalars are now PVMGs, and the Boyer-Moore tables are now +stored via the mg_ptr of their C<PERL_MAGIC_bm>. +Previously they were PVGVs, with the tables stored in +the string buffer, beyond C<SvLEN()>. 
This eliminates +the last place where the core stores data beyond C<SvLEN()>. + +=item * + +Simplified logic in C<Perl_sv_magic()> introduces a small change of +behavior for error cases involving unknown magic types. Previously, if +C<Perl_sv_magic()> was passed a magic type unknown to it, it would + +=over + +=item 1. + +Croak "Modification of a read-only value attempted" if read only + +=item 2. + +Return without error if the SV happened to already have this magic + +=item 3. + +otherwise croak "Don't know how to handle magic of type \\%o" + +=back + +Now it will always croak "Don't know how to handle magic of type \\%o", even +on read-only values, or SVs which already have the unknown magic type. + +=item * + +The experimental C<fetch_cop_label> function has been renamed to +C<cop_fetch_label>. + +=item * + +The C<cop_store_label> function has been added to the API, but is +experimental. + +=item * + +F<embedvar.h> has been simplified, and one level of macro indirection for +PL_* variables has been removed for the default (non-multiplicity) +configuration. PERLVAR*() macros now directly expand their arguments to +tokens such as C<PL_defgv>, instead of expanding to C<PL_Idefgv>, with +F<embedvar.h> defining a macro to map C<PL_Idefgv> to C<PL_defgv>. XS code +which has unwarranted chumminess with the implementation may need updating. + +=item * + +An API has been added to explicitly choose whether to export XSUB +symbols. More detail can be found in the comments for commit e64345f8. + +=item * + +The C<is_gv_magical_sv> function has been eliminated and merged with +C<gv_fetchpvn_flags>. It used to be called to determine whether a GV +should be autovivified in rvalue context. Now it has been replaced with a +new C<GV_ADDMG> flag (not part of the API). 
+ +=item * + +The returned code point from the function C<utf8n_to_uvuni()> +when the input is malformed UTF-8, malformations are allowed, and +C<utf8> warnings are off is now the Unicode REPLACEMENT CHARACTER +whenever the malformation is such that no well-defined code point can be +computed. Previously the returned value was essentially garbage. The +only malformations that have well-defined values are a zero-length +string (0 is the return), and overlong UTF-8 sequences. + +=item * + +Padlists are now marked C<AvREAL>; i.e., reference-counted. They have +always been reference-counted, but were not marked real, because F<pad.c> +did its own clean-up, instead of using the usual clean-up code in F<sv.c>. +That caused problems in thread cloning, so now the C<AvREAL> flag is on, +but is turned off in F<pad.c> right before the padlist is freed (after +F<pad.c> has done its custom freeing of the pads). + +=item * + +All C files that make up the Perl core have been converted to UTF-8. + +=item * + +These new functions have been added as part of the work on Unicode symbols: + + HvNAMELEN + HvNAMEUTF8 + HvENAMELEN + HvENAMEUTF8 + gv_init_pv + gv_init_pvn + gv_init_pvsv + gv_fetchmeth_pv + gv_fetchmeth_pvn + gv_fetchmeth_sv + gv_fetchmeth_pv_autoload + gv_fetchmeth_pvn_autoload + gv_fetchmeth_sv_autoload + gv_fetchmethod_pv_flags + gv_fetchmethod_pvn_flags + gv_fetchmethod_sv_flags + gv_autoload_pv + gv_autoload_pvn + gv_autoload_sv + newGVgen_flags + sv_derived_from_pv + sv_derived_from_pvn + sv_derived_from_sv + sv_does_pv + sv_does_pvn + sv_does_sv + whichsig_pv + whichsig_pvn + whichsig_sv + newCONSTSUB_flags + +The gv_fetchmethod_*_flags functions, like gv_fetchmethod_flags, are +experimental and may change in a future release. + +=item * + +The following functions were added. 
These are I<not> part of the API: + + GvNAMEUTF8 + GvENAMELEN + GvENAME_HEK + CopSTASH_flags + CopSTASH_flags_set + PmopSTASH_flags + PmopSTASH_flags_set + sv_sethek + HEKfARG + +There is also a C<HEKf> macro corresponding to C<SVf>, for +interpolating HEKs in formatted strings. + +=item * + +C<sv_catpvn_flags> takes a couple of new internal-only flags, +C<SV_CATBYTES> and C<SV_CATUTF8>, which tell it whether the char array to +be concatenated is UTF8. This allows for more efficient concatenation than +creating temporary SVs to pass to C<sv_catsv>. + +=item * + +For XS AUTOLOAD subs, $AUTOLOAD is set once more, as it was in 5.6.0. This +is in addition to setting C<SvPVX(cv)>, for compatibility with 5.8 to 5.14. +See L<perlguts/Autoloading with XSUBs>. + +=item * + +Perl now checks whether the array (the linearized isa) returned by a MRO +plugin begins with the name of the class itself, for which the array was +created, instead of assuming that it does. This prevents the first element +from being skipped during method lookup. It also means that +C<mro::get_linear_isa> may return an array with one more element than the +MRO plugin provided [perl #94306]. + +=item * + +C<PL_curstash> is now reference-counted. + +=item * + +There are now feature bundle hints in C<PL_hints> (C<$^H>) that version +declarations use, to avoid having to load F<feature.pm>. One setting of +the hint bits indicates a "custom" feature bundle, which means that the +entries in C<%^H> still apply. F<feature.pm> uses that. + +The C<HINT_FEATURE_MASK> macro is defined in F<perl.h> along with other +hints. Other macros for setting and testing features and bundles are in +the new F<feature.h>. C<FEATURE_IS_ENABLED> (which has moved to +F<feature.h>) is no longer used throughout the codebase, but more specific +macros, e.g., C<FEATURE_SAY_IS_ENABLED>, that are defined in F<feature.h>. 
+ +=item * + +F<lib/feature.pm> is now a generated file, created by the new +F<regen/feature.pl> script, which also generates F<feature.h>. + +=item * + +Tied arrays are now always C<AvREAL>. If C<@_> or C<DB::args> is tied, it +is reified first, to make sure this is always the case. + +=item * + +Two new functions C<utf8_to_uvchr_buf()> and C<utf8_to_uvuni_buf()> have +been added. These are the same as C<utf8_to_uvchr> and +C<utf8_to_uvuni> (which are now deprecated), but take an extra parameter +that is used to guard against reading beyond the end of the input +string. +See L<perlapi/utf8_to_uvchr_buf> and L<perlapi/utf8_to_uvuni_buf>. + +=item * + +The regular expression engine now does TRIE case insensitive matches +under Unicode. This may change the output of C<< use re 'debug'; >>, +and will speed up various things. + +=item * + +There is a new C<wrap_op_checker()> function, which provides a thread-safe +alternative to writing to C<PL_check> directly. + +=back + +=head1 Selected Bug Fixes + +=head2 Array and hash + +=over + +=item * + +A bug has been fixed that would cause a "Use of freed value in iteration" +error if the next two hash elements that would be iterated over are +deleted [perl #85026]. (5.14.1) + +=item * + +Deleting the current hash iterator (the hash element that would be returned +by the next call to C<each>) in void context used not to free it +[perl #85026]. + +=item * + +Deletion of methods via C<delete $Class::{method}> syntax used to update +method caches if called in void context, but not scalar or list context. + +=item * + +When hash elements are deleted in void context, the internal hash entry is +now freed before the value is freed, to prevent destructors called by that +latter freeing from seeing the hash in an inconsistent state. It was +possible to cause double-frees if the destructor freed the hash itself +[perl #100340]. 
+ +=item * + +A C<keys> optimization in Perl 5.12.0 to make it faster on empty hashes +caused C<each> not to reset the iterator if called after the last element +was deleted. + +=item * + +Freeing deeply nested hashes no longer crashes [perl #44225]. + +=item * + +It is possible from XS code to create hashes with elements that have no +values. The hash element and slice operators used to crash +when handling these in lvalue context. They now +produce a "Modification of non-creatable hash value attempted" error +message. + +=item * + +If list assignment to a hash or array triggered destructors that freed the +hash or array itself, a crash would ensue. This is no longer the case +[perl #107440]. + +=item * + +It used to be possible to free the typeglob of a localized array or hash +(e.g., C<local @{"x"}; delete $::{x}>), resulting in a crash on scope exit. + +=item * + +Some core bugs affecting L<Hash::Util> have been fixed: locking a hash +element that is a glob copy no longer causes the next assignment to it to +corrupt the glob (5.14.2), and unlocking a hash element that holds a +copy-on-write scalar no longer causes modifications to that scalar to +modify other scalars that were sharing the same string buffer. + +=back + +=head2 C API fixes + +=over + +=item * + +The C<newHVhv> XS function now works on tied hashes, instead of crashing or +returning an empty hash. + +=item * + +The C<SvIsCOW> C macro now returns false for read-only copies of typeglobs, +such as those created by: + + $hash{elem} = *foo; + Hash::Util::lock_value %hash, 'elem'; + +It used to return true. + +=item * + +The C<SvPVutf8> C function no longer tries to modify its argument, +resulting in errors [perl #108994]. + +=item * + +C<SvPVutf8> now works properly with magical variables. + +=item * + +C<SvPVbyte> now works properly on non-PVs. 
+ +=item * + +When presented with malformed UTF-8 input, the XS-callable functions +C<is_utf8_string()>, C<is_utf8_string_loc()>, and +C<is_utf8_string_loclen()> could read beyond the end of the input +string by up to 12 bytes. This no longer happens. [perl #32080]. +However, currently, C<is_utf8_char()> still has this defect, see +L</is_utf8_char()> above. + +=item * + +The C-level C<pregcomp> function could become confused about whether the +pattern was in UTF8 if the pattern was an overloaded, tied, or otherwise +magical scalar [perl #101940]. + +=back + +=head2 Compile-time hints + +=over + +=item * + +Tying C<%^H> no longer causes perl to crash or ignore the contents of +C<%^H> when entering a compilation scope [perl #106282]. + +=item * + +C<eval $string> and C<require> used not to +localize C<%^H> during compilation if it +was empty at the time the C<eval> call itself was compiled. This could +lead to scary side effects, like C<use re "/m"> enabling other flags that +the surrounding code was trying to enable for its caller [perl #68750]. + +=item * + +C<eval $string> and C<require> no longer localize hints (C<$^H> and C<%^H>) +at run time, but only during compilation of the $string or required file. +This makes C<BEGIN { $^H{foo}=7 }> equivalent to +C<BEGIN { eval '$^H{foo}=7' }> [perl #70151]. + +=item * + +Creating a BEGIN block from XS code (via C<newXS> or C<newATTRSUB>) would, +on completion, make the hints of the current compiling code the current +hints. This could cause warnings to occur in a non-warning scope. + +=back + +=head2 Copy-on-write scalars + +Copy-on-write or shared hash key scalars +were introduced in 5.8.0, but most Perl code +did not encounter them (they were used mostly internally). Perl +5.10.0 extended them, such that assigning C<__PACKAGE__> or a +hash key to a scalar would make it copy-on-write. Several parts +of Perl were not updated to account for them, but have now been fixed. 
+ +=over + +=item * + +C<utf8::decode> had a nasty bug that would modify copy-on-write scalars' +string buffers in place (i.e., skipping the copy). This could result in +hashes having two elements with the same key [perl #91834]. (5.14.2) + +=item * + +Lvalue subroutines were not allowing COW scalars to be returned. This was +fixed for lvalue scalar context in Perl 5.12.3 and 5.14.0, but list context +was not fixed until this release. + +=item * + +Elements of restricted hashes (see the L<fields> pragma) containing +copy-on-write values couldn't be deleted, nor could such hashes be cleared +(C<%hash = ()>). (5.14.2) + +=item * + +Localizing a tied variable used to make it read-only if it contained a +copy-on-write string. (5.14.2) + +=item * + +Assigning a copy-on-write string to a stash +element no longer causes a double free. Regardless of this change, the +results of such assignments are still undefined. + +=item * + +Assigning a copy-on-write string to a tied variable no longer stops that +variable from being tied if it happens to be a PVMG or PVLV internally. + +=item * + +Doing a substitution on a tied variable returning a copy-on-write +scalar used to cause an assertion failure or an "Attempt to free +nonexistent shared string" warning. + +=item * + +This one is a regression from 5.12: In 5.14.0, the bitwise assignment +operators C<|=>, C<^=> and C<&=> started leaving the left-hand side +undefined if it happened to be a copy-on-write string [perl #108480]. + +=item * + +L<Storable>, L<Devel::Peek> and L<PerlIO::scalar> had similar problems. +See L</Updated Modules and Pragmata>, above. + +=back + +=head2 The debugger + +=over + +=item * + +F<dumpvar.pl>, and therefore the C<x> command in the debugger, have been +fixed to handle objects blessed into classes whose names contain "=". The +contents of such objects used not to be dumped [perl #101814]. 
+ +=item * + +The "R" command for restarting a debugger session has been fixed to work on +Windows, or any other system lacking a C<POSIX::_SC_OPEN_MAX> constant +[perl #87740]. + +=item * + +The C<#line 42 foo> directive used not to update the arrays of lines used +by the debugger if it occurred in a string eval. This was partially fixed +in 5.14, but it worked only for a single C<#line 42 foo> in each eval. Now +it works for multiple. + +=item * + +When subroutine calls are intercepted by the debugger, the name of the +subroutine or a reference to it is stored in C<$DB::sub>, for the debugger +to access. Sometimes (such as C<$foo = *bar; undef *bar; &$foo>) +C<$DB::sub> would be set to a name that could not be used to find the +subroutine, and so the debugger's attempt to call it would fail. Now the +check to see whether a reference is needed is more robust, so those +problems should not happen anymore [rt.cpan.org #69862]. + +=item * + +Every subroutine has a filename associated with it that the debugger uses. +The one associated with constant subroutines used to be misallocated when +cloned under threads. Consequently, debugging threaded applications could +result in memory corruption [perl #96126]. + +=back + +=head2 Dereferencing operators + +=over + +=item * + +C<defined(${"..."})>, C<defined(*{"..."})>, etc., used to +return true for most, but not all built-in variables, if +they had not been used yet. This bug affected C<${^GLOBAL_PHASE}> and +C<${^UTF8CACHE}>, among others. It also used to return false if the +package name was given as well (C<${"::!"}>) [perl #97978, #97492]. + +=item * + +Perl 5.10.0 introduced a similar bug: C<defined(*{"foo"})> where "foo" +represents the name of a built-in global variable used to return false if +the variable had never been used before, but only on the I<first> call. +This, too, has been fixed. + +=item * + +Since 5.6.0, C<*{ ... }> has been inconsistent in how it treats undefined +values. 
It would die in strict mode or lvalue context for most undefined +values, but would be treated as the empty string (with a warning) for the +specific scalar returned by C<undef()> (C<&PL_sv_undef> internally). This +has been corrected. C<undef()> is now treated like other undefined +scalars, as in Perl 5.005. + +=back + +=head2 Filehandle, last-accessed + +Perl has an internal variable that stores the last filehandle to be +accessed. It is used by C<$.> and by C<tell> and C<eof> without +arguments. + +=over + +=item * + +It used to be possible to set this internal variable to a glob copy and +then modify that glob copy to be something other than a glob, and still +have the last-accessed filehandle associated with the variable after +assigning a glob to it again: + + my $foo = *STDOUT; # $foo is a glob copy + <$foo>; # $foo is now the last-accessed handle + $foo = 3; # no longer a glob + $foo = *STDERR; # still the last-accessed handle + +Now the C<$foo = 3> assignment unsets that internal variable, so there +is no last-accessed filehandle, just as if C<< <$foo> >> had never +happened. + +This also prevents some unrelated handle from becoming the last-accessed +handle if $foo falls out of scope and the same internal SV gets used for +another handle [perl #97988]. + +=item * + +A regression in 5.14 caused these statements not to set that internal +variable: + + my $fh = *STDOUT; + tell $fh; + eof $fh; + seek $fh, 0,0; + tell *$fh; + eof *$fh; + seek *$fh, 0,0; + readline *$fh; + +This is now fixed, but C<tell *{ *$fh }> still has the problem, and it +is not clear how to fix it [perl #106536]. + +=back + +=head2 Filetests and C<stat> + +The term "filetests" refers to the operators that consist of a hyphen +followed by a single letter: C<-r>, C<-x>, C<-M>, etc. The term "stacked" +when applied to filetests means followed by another filetest operator +sharing the same operand, as in C<-r -x -w $foo>. + +=over + +=item * + +C<stat> produces more consistent warnings. 
It no longer warns for "_" +[perl #71002] and no longer skips the warning at times for other unopened +handles. It no longer warns about an unopened handle when the operating +system's C<fstat> function fails. + +=item * + +C<stat> would sometimes return negative numbers for large inode numbers, +because it was using the wrong internal C type. [perl #84590] + +=item * + +C<lstat> is documented to fall back to C<stat> (with a warning) when given +a filehandle. When passed an IO reference, it was actually doing the +equivalent of S<C<stat _>> and ignoring the handle. + +=item * + +C<-T _> with no preceding C<stat> used to produce a +confusing "uninitialized" warning, even though there +is no visible uninitialized value to speak of. + +=item * + +C<-T>, C<-B>, C<-l> and C<-t> now work +when stacked with other filetest operators +[perl #77388]. + +=item * + +In 5.14.0, filetest ops (C<-r>, C<-x>, etc.) started calling FETCH on a +tied argument belonging to the previous argument to a list operator, if +called with a bareword argument or no argument at all. This has been +fixed, so C<push @foo, $tied, -r> no longer calls FETCH on C<$tied>. + +=item * + +In Perl 5.6, C<-l> followed by anything other than a bareword would treat +its argument as a file name. That was changed in 5.8 for glob references +(C<\*foo>), but not for globs themselves (C<*foo>). C<-l> started +returning C<undef> for glob references without setting the last +stat buffer that the "_" handle uses, but only if warnings +were turned on. With warnings off, it was the same as 5.6. +In other words, it was simply buggy and inconsistent. Now the 5.6 +behavior has been restored. + +=item * + +C<-l> followed by a bareword no longer "eats" the previous argument to +the list operator in whose argument list it resides. Hence, +C<print "bar", -l foo> now actually prints "bar", because C<-l> +no longer eats it. 
+ +=item * + +Perl keeps several internal variables to keep track of the last stat +buffer, from which file(handle) it originated, what type it was, and +whether the last stat succeeded. + +There were various cases where these could get out of synch, resulting in +inconsistent or erratic behavior in edge cases (every mention of C<-T> +applies to C<-B> as well): + +=over + +=item * + +C<-T I<HANDLE>>, even though it does a C<stat>, was not resetting the last +stat type, so an C<lstat _> following it would merrily return the wrong +results. Also, it was not setting the success status. + +=item * + +Freeing the handle last used by C<stat> or a filetest could result in +S<C<-T _>> using an unrelated handle. + +=item * + +C<stat> with an IO reference would not reset the stat type or record the +filehandle for S<C<-T _>> to use. + +=item * + +Fatal warnings could cause the stat buffer not to be reset +for a filetest operator on an unopened filehandle or C<-l> on any handle. +Fatal warnings also stopped C<-T> from setting C<$!>. + +=item * + +When the last stat was on an unreadable file, C<-T _> is supposed to +return C<undef>, leaving the last stat buffer unchanged. But it was +setting the stat type, causing C<lstat _> to stop working. + +=item * + +C<-T I<FILENAME>> was not resetting the internal stat buffers for +unreadable files. + +=back + +These have all been fixed. + +=back + +=head2 Formats + +=over + +=item * + +Several edge cases have been fixed with formats and C<formline>; +in particular, where the format itself is potentially variable (such as +with ties and overloading), and where the format and data differ in their +encoding. In both these cases, it used to be possible for the output to be +corrupted [perl #91032]. + +=item * + +C<formline> no longer converts its argument into a string in-place. So +passing a reference to C<formline> no longer destroys the reference +[perl #79532]. 
+ +=item * + +Assignment to C<$^A> (the format output accumulator) now recalculates +the number of lines output. + +=back + +=head2 C<given> and C<when> + +=over + +=item * + +C<given> was not scoping its implicit $_ properly, resulting in memory +leaks or "Variable is not available" warnings [perl #94682]. + +=item * + +C<given> was not calling set-magic on the implicit lexical C<$_> that it +uses. This meant, for example, that C<pos> would be remembered from one +execution of the same C<given> block to the next, even if the input were a +different variable [perl #84526]. + +=item * + +C<when> blocks are now capable of returning variables declared inside the +enclosing C<given> block [perl #93548]. + +=back + +=head2 The C<glob> operator + +=over + +=item * + +On OSes other than VMS, Perl's C<glob> operator (and the C<< <...> >> form) +use L<File::Glob> underneath. L<File::Glob> splits the pattern into words, +before feeding each word to its C<bsd_glob> function. + +There were several inconsistencies in the way the split was done. Now +quotation marks (' and ") are always treated as shell-style word delimiters +(that allow whitespace as part of a word) and backslashes are always +preserved, unless they exist to escape quotation marks. Before, those +would only sometimes be the case, depending on whether the pattern +contained whitespace. Also, escaped whitespace at the end of the pattern +is no longer stripped [perl #40470]. + +=item * + +C<CORE::glob> now works as a way to call the default globbing function. It +used to respect overrides, despite the C<CORE::> prefix. + +=item * + +Under miniperl (used to configure modules when perl itself is built), +C<glob> now clears %ENV before calling csh, since the latter croaks on some +systems if it does not like the contents of the LS_COLORS environment +variable [perl #98662]. 
+ +=back + +=head2 Lvalue subroutines + +=over + +=item * + +Explicit return now returns the actual argument passed to return, instead +of copying it [perl #72724, #72706]. + +=item * + +Lvalue subroutines used to enforce lvalue syntax (i.e., whatever can go on +the left-hand side of C<=>) for the last statement and the arguments to +return. Since lvalue subroutines are not always called in lvalue context, +this restriction has been lifted. + +=item * + +Lvalue subroutines are less restrictive about what values can be returned. +It used to croak on values returned by C<shift> and C<delete> and from +other subroutines, but no longer does so [perl #71172]. + +=item * + +Empty lvalue subroutines (C<sub :lvalue {}>) used to return C<@_> in list +context. All subroutines used to do this, but regular subs were fixed in +Perl 5.8.2. Now lvalue subroutines have been likewise fixed. + +=item * + +Autovivification now works on values returned from lvalue subroutines +[perl #7946], as does returning C<keys> in lvalue context. + +=item * + +Lvalue subroutines used to copy their return values in rvalue context. Not +only was this a waste of CPU cycles, but it also caused bugs. A C<($)> +prototype would cause an lvalue sub to copy its return value [perl #51408], +and C<while(lvalue_sub() =~ m/.../g) { ... }> would loop endlessly +[perl #78680]. + +=item * + +When called in potential lvalue context +(e.g., subroutine arguments or a list +passed to C<for>), lvalue subroutines used to copy +any read-only value that was returned. E.g., C< sub :lvalue { $] } > +would not return C<$]>, but a copy of it. + +=item * + +When called in potential lvalue context, an lvalue subroutine returning +arrays or hashes used to bind the arrays or hashes to scalar variables, +resulting in bugs. This was fixed in 5.14.0 if an array were the first +thing returned from the subroutine (but not for C<$scalar, @array> or +hashes being returned). Now a more general fix has been applied +[perl #23790]. 
+ +=item * + +Method calls whose arguments were all surrounded with C<my()> or C<our()> +(as in C<< $object->method(my($a,$b)) >>) used to force lvalue context on +the subroutine. This would prevent lvalue methods from returning certain +values. + +=item * + +Lvalue sub calls that are not determined to be such at compile time +(C<&$name> or &{"name"}) are no longer exempt from strict refs if they +occur in the last statement of an lvalue subroutine [perl #102486]. + +=item * + +Sub calls whose subs are not visible at compile time, if +they occurred in the last statement of an lvalue subroutine, +would reject non-lvalue subroutines and die with "Can't modify non-lvalue +subroutine call" [perl #102486]. + +Non-lvalue sub calls whose subs I<are> visible at compile time exhibited +the opposite bug. If the call occurred in the last statement of an lvalue +subroutine, there would be no error when the lvalue sub was called in +lvalue context. Perl would blindly assign to the temporary value returned +by the non-lvalue subroutine. + +=item * + +C<AUTOLOAD> routines used to take precedence over the actual sub being +called (i.e., when autoloading wasn't needed), for sub calls in lvalue or +potential lvalue context, if the subroutine was not visible at compile +time. + +=item * + +Applying the C<:lvalue> attribute to an XSUB or to an aliased subroutine +stub with C<< sub foo :lvalue; >> syntax stopped working in Perl 5.12. +This has been fixed. + +=item * + +Applying the :lvalue attribute to a subroutine that is already defined does +not work properly, as the attribute changes the way the sub is compiled. +Hence, Perl 5.12 began warning when an attempt is made to apply the +attribute to an already defined sub. In such cases, the attribute is +discarded. + +But the change in 5.12 missed the case where custom attributes are also +present: that case still silently and ineffectively applied the attribute. +That omission has now been corrected. 
C<sub foo :lvalue :Whatever> (when +C<foo> is already defined) now warns about the :lvalue attribute, and does +not apply it. + +=item * + +A bug affecting lvalue context propagation through nested lvalue subroutine +calls has been fixed. Previously, returning a value in nested rvalue +context would be treated as lvalue context by the inner subroutine call, +resulting in some values (such as read-only values) being rejected. + +=back + +=head2 Overloading + +=over + +=item * + +Arithmetic assignment (C<$left += $right>) involving overloaded objects +that rely on the 'nomethod' override no longer segfault when the left +operand is not overloaded. + +=item * + +Errors that occur when methods cannot be found during overloading now +mention the correct package name, as they did in 5.8.x, instead of +erroneously mentioning the "overload" package, as they have since 5.10.0. + +=item * + +Undefining C<%overload::> no longer causes a crash. + +=back + +=head2 Prototypes of built-in keywords + +=over + +=item * + +The C<prototype> function no longer dies for the C<__FILE__>, C<__LINE__> +and C<__PACKAGE__> directives. It now returns an empty-string prototype +for them, because they are syntactically indistinguishable from nullary +functions like C<time>. + +=item * + +C<prototype> now returns C<undef> for all overridable infix operators, +such as C<eq>, which are not callable in any way resembling functions. +It used to return incorrect prototypes for some and die for others +[perl #94984]. + +=item * + +The prototypes of several built-in functions--C<getprotobynumber>, C<lock>, +C<not> and C<select>--have been corrected, or at least are now closer to +reality than before. + +=back + +=head2 Regular expressions + +=for comment Is it possible to merge some of these items? + +=over 4 + +=item * + +C</[[:ascii:]]/> and C</[[:blank:]]/> now use locale rules under +C<use locale> when the platform supports that. Previously, they used +the platform's native character set. 
+ +=item * + +C<m/[[:ascii:]]/i> and C</\p{ASCII}/i> now match identically (when not +under a differing locale). This fixes a regression introduced in 5.14 +in which the first expression could match characters outside of ASCII, +such as the KELVIN SIGN. + +=item * + +C</.*/g> would sometimes refuse to match at the end of a string that ends +with "\n". This has been fixed [perl #109206]. + +=item * + +Starting with 5.12.0, Perl used to get its internal bookkeeping muddled up +after assigning C<${ qr// }> to a hash element and locking it with +L<Hash::Util>. This could result in double frees, crashes, or erratic +behavior. + +=item * + +The new (in 5.14.0) regular expression modifier C</a> when repeated like +C</aa> forbids the characters outside the ASCII range that match +characters inside that range from matching under C</i>. This did not +work under some circumstances, all involving alternation, such as: + + "\N{KELVIN SIGN}" =~ /k|foo/iaa; + +succeeded inappropriately. This is now fixed. + +=item * + +5.14.0 introduced some memory leaks in regular expression character +classes such as C<[\w\s]>, which have now been fixed. (5.14.1) + +=item * + +An edge case in regular expression matching could potentially loop. +This happened only under C</i> in bracketed character classes that have +characters with multi-character folds, and the target string to match +against includes the first portion of the fold, followed by another +character that has a multi-character fold that begins with the remaining +portion of the fold, plus some more. + + "s\N{U+DF}" =~ /[\x{DF}foo]/i + +is one such case. C<\xDF> folds to C<"ss">. (5.14.1) + +=item * + +A few characters in regular expression pattern matches did not +match correctly in some circumstances, all involving C</i>. 
The +affected characters are: +COMBINING GREEK YPOGEGRAMMENI, +GREEK CAPITAL LETTER IOTA, +GREEK CAPITAL LETTER UPSILON, +GREEK PROSGEGRAMMENI, +GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA, +GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, +GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA, +GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, +LATIN SMALL LETTER LONG S, +LATIN SMALL LIGATURE LONG S T, +and +LATIN SMALL LIGATURE ST. + +=item * + +A memory leak regression in regular expression compilation +under threading has been fixed. + +=item * + +A regression introduced in 5.14.0 has +been fixed. This involved an inverted +bracketed character class in a regular expression that consisted solely +of a Unicode property. That property wasn't getting inverted outside the +Latin1 range. + +=item * + +Three problematic Unicode characters now work better in regex pattern matching under C</i>. + +In the past, three Unicode characters: +LATIN SMALL LETTER SHARP S, +GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, +and +GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, +along with the sequences that they fold to +(including "ss" for LATIN SMALL LETTER SHARP S), +did not properly match under C</i>. 5.14.0 fixed some of these cases, +but introduced others, including a panic when one of the characters or +sequences was used in the C<(?(DEFINE)> regular expression predicate. +The known bugs that were introduced in 5.14 have now been fixed; as well +as some other edge cases that have never worked until now. These all +involve using the characters and sequences outside bracketed character +classes under C</i>. This closes [perl #98546]. + +There remain known problems when using certain characters with +multi-character folds inside bracketed character classes, including such +constructs as C<qr/[\N{LATIN SMALL LETTER SHARP S}a-z]/i>. These +remaining bugs are addressed in [perl #89774]. 
+ +=item * + +RT #78266: The regex engine has been leaking memory when accessing +named captures that weren't matched as part of a regex ever since 5.10 +when they were introduced; e.g., this would consume over a hundred MB of +memory: + + for (1..10_000_000) { + if ("foo" =~ /(foo|(?<capture>bar))?/) { + my $capture = $+{capture} + } + } + system "ps -o rss $$" + +=item * + +In 5.14, C</[[:lower:]]/i> and C</[[:upper:]]/i> no longer matched the +opposite case. This has been fixed [perl #101970]. + +=item * + +A regular expression match with an overloaded object on the right-hand side +would sometimes stringify the object too many times. + +=item * + +A regression has been fixed that was introduced in 5.14, in C</i> +regular expression matching, in which a match improperly fails if the +pattern is in UTF-8, the target string is not, and a Latin-1 character +precedes a character in the string that should match the pattern. +[perl #101710] + +=item * + +In case-insensitive regular expression pattern matching, no longer on +UTF-8 encoded strings does the scan for the start of match look only at +the first possible position. This caused matches such as +C<"f\x{FB00}" =~ /ff/i> to fail. + +=item * + +The regexp optimizer no longer crashes on debugging builds when merging +fixed-string nodes with inconvenient contents. + +=item * + +A panic involving the combination of the regular expression modifiers +C</aa> and the C<\b> escape sequence introduced in 5.14.0 has been +fixed [perl #95964]. (5.14.2) + +=item * + +The combination of the regular expression modifiers C</aa> and the C<\b> +and C<\B> escape sequences did not work properly on UTF-8 encoded +strings. All non-ASCII characters under C</aa> should be treated as +non-word characters, but what was happening was that Unicode rules were +used to determine wordness/non-wordness for non-ASCII characters. This +is now fixed [perl #95968]. + +=item * + +C<< (?foo: ...) >> no longer loses passed in character set. 
+ +=item * + +The trie optimization used to have problems with alternations containing +an empty C<(?:)>, causing C<< "x" =~ /\A(?>(?:(?:)A|B|C?x))\z/ >> not to +match, whereas it should [perl #111842]. + +=item * + +Use of lexical (C<my>) variables in code blocks embedded in regular +expressions will no longer result in memory corruption or crashes. + +Nevertheless, these code blocks are still experimental, as there are still +problems with the wrong variables being closed over (in loops for instance) +and with abnormal exiting (e.g., C<die>) causing memory corruption. + +=item * + +The C<\h>, C<\H>, C<\v> and C<\V> regular expression metacharacters used to +cause a panic error message when trying to match at the end of the +string [perl #96354]. + +=item * + +The abbreviations for four C1 control characters C<MW>, C<PM>, C<RI>, and +C<ST> were previously unrecognized by C<\N{}>, vianame(), and +string_vianame(). + +=item * + +Mentioning a variable named "&" other than C<$&> (i.e., C<@&> or C<%&>) no +longer stops C<$&> from working. The same applies to variables named "'" +and "`" [perl #24237]. + +=item * + +Creating a C<UNIVERSAL::AUTOLOAD> sub no longer stops C<%+>, C<%-> and +C<%!> from working some of the time [perl #105024]. + +=back + +=head2 Smartmatching + +=over + +=item * + +C<~~> now correctly handles the precedence of Any~~Object, and is not tricked +by an overloaded object on the left-hand side. + +=item * + +In Perl 5.14.0, C<$tainted ~~ @array> stopped working properly. Sometimes +it would erroneously fail (when C<$tainted> contained a string that occurs +in the array I<after> the first element) or erroneously succeed (when +C<undef> occurred after the first element) [perl #93590]. + +=back + +=head2 The C<sort> operator + +=over + +=item * + +C<sort> was not treating C<sub {}> and C<sub {()}> as equivalent when +such a sub was provided as the comparison routine. It used to croak on +C<sub {()}>. 
+ +=item * + +C<sort> now works once more with custom sort routines that are XSUBs. It +stopped working in 5.10.0. + +=item * + +C<sort> with a constant for a custom sort routine, although it produces +unsorted results, no longer crashes. It started crashing in 5.10.0. + +=item * + +Warnings emitted by C<sort> when a custom comparison routine returns a +non-numeric value now contain "in sort" and show the line number of the +C<sort> operator, rather than the last line of the comparison routine. The +warnings also now occur only if warnings are enabled in the scope where +C<sort> occurs. Previously the warnings would occur if enabled in the +comparison routine's scope. + +=item * + +C<< sort { $a <=> $b } >>, which is optimized internally, now produces +"uninitialized" warnings for NaNs (not-a-number values), since C<< <=> >> +returns C<undef> for those. This brings it in line with +S<C<< sort { 1; $a <=> $b } >>> and other more complex cases, which are not +optimized [perl #94390]. + +=back + +=head2 The C<substr> operator + +=over + +=item * + +Tied (and otherwise magical) variables are no longer exempt from the +"Attempt to use reference as lvalue in substr" warning. + +=item * + +That warning now occurs when the returned lvalue is assigned to, not +when C<substr> itself is called. This makes a difference only if the +return value of C<substr> is referenced and later assigned to. + +=item * + +Passing a substring of a read-only value or a typeglob to a function +(potential lvalue context) no longer causes an immediate "Can't coerce" +or "Modification of a read-only value" error. That error occurs only +if the passed value is assigned to. + +The same thing happens with the "substr outside of string" error. If +the lvalue is only read from, not written to, it is now just a warning, as +with rvalue C<substr>. + +=item * + +C<substr> assignments no longer call FETCH twice if the first argument +is a tied variable, just once. 
+ +=back + +=head2 Support for embedded nulls + +Some parts of Perl did not work correctly with nulls (C<chr 0>) embedded in +strings. That meant that, for instance, C<< $m = "a\0b"; foo->$m >> would +call the "a" method, instead of the actual method name contained in $m. +These parts of perl have been fixed to support nulls: + +=over + +=item * + +Method names + +=item * + +Typeglob names (including filehandle and subroutine names) + +=item * + +Package names, including the return value of C<ref()> + +=item * + +Typeglob elements (C<*foo{"THING\0stuff"}>) + +=item * + +Signal names + +=item * + +Various warnings and error messages that mention variable names or values, +methods, etc. + +=back + +One side effect of these changes is that blessing into "\0" no longer +causes C<ref()> to return false. + +=head2 Threading bugs + +=over + +=item * + +Typeglobs returned from threads are no longer cloned if the parent thread +already has a glob with the same name. This means that returned +subroutines will now assign to the right package variables [perl #107366]. + +=item * + +Some cases of threads crashing due to memory allocation during cloning have +been fixed [perl #90006]. + +=item * + +Thread joining would sometimes emit "Attempt to free unreferenced scalar" +warnings if C<caller> had been used from the C<DB> package before thread +creation [perl #98092]. + +=item * + +Locking a subroutine (via C<lock &sub>) is no longer a compile-time error +for regular subs. For lvalue subroutines, it no longer tries to return the +sub as a scalar, resulting in strange side effects like C<ref \$_> +returning "CODE" in some instances. + +C<lock &sub> is now a run-time error if L<threads::shared> is loaded (a +no-op otherwise), but that may be rectified in a future version. 
+ +=back + +=head2 Tied variables + +=over + +=item * + +Various cases in which FETCH was being ignored or called too many times +have been fixed: + +=over + +=item * + +C<PerlIO::get_layers> [perl #97956] + +=item * + +C<$tied =~ y/a/b/>, C<chop $tied> and C<chomp $tied> when $tied holds a +reference. + +=item * + +When calling C<local $_> [perl #105912] + +=item * + +Four-argument C<select> + +=item * + +A tied buffer passed to C<sysread> + +=item * + +C<< $tied .= <> >> + +=item * + +Three-argument C<open>, the third being a tied file handle +(as in C<< open $fh, ">&", $tied >>) + +=item * + +C<sort> with a reference to a tied glob for the comparison routine. + +=item * + +C<..> and C<...> in list context [perl #53554]. + +=item * + +C<${$tied}>, C<@{$tied}>, C<%{$tied}> and C<*{$tied}> where the tied +variable returns a string (C<&{}> was unaffected) + +=item * + +C<defined ${ $tied_variable }> + +=item * + +Various functions that take a filehandle argument in rvalue context +(C<close>, C<readline>, etc.) [perl #97482] + +=item * + +Some cases of dereferencing a complex expression, such as +C<${ (), $tied } = 1>, used to call C<FETCH> multiple times, but now call +it once. + +=item * + +C<$tied-E<gt>method> where $tied returns a package name--even resulting in +a failure to call the method, due to memory corruption + +=item * + +Assignments like C<*$tied = \&{"..."}> and C<*glob = $tied> + +=item * + +C<chdir>, C<chmod>, C<chown>, C<utime>, C<truncate>, C<stat>, C<lstat> and +the filetest ops (C<-r>, C<-x>, etc.) + +=back + +=item * + +C<caller> sets C<@DB::args> to the subroutine arguments when called from +the DB package. It used to crash when doing so if C<@DB::args> happened to +be tied. Now it croaks instead. 
+ +=item * + +Tying an element of %ENV or C<%^H> and then deleting that element would +result in a call to the tie object's DELETE method, even though tying the +element itself is supposed to be equivalent to tying a scalar (the element +is, of course, a scalar) [perl #67490]. + +=item * + +When Perl autovivifies an element of a tied array or hash (which entails +calling STORE with a new reference), it now calls FETCH immediately after +the STORE, instead of assuming that FETCH would have returned the same +reference. This can make it easier to implement tied objects [perl #35865, #43011]. + +=item * + +Four-argument C<select> no longer produces its "Non-string passed as +bitmask" warning on tied or tainted variables that are strings. + +=item * + +Localizing a tied scalar that returns a typeglob no longer stops it from +being tied till the end of the scope. + +=item * + +Attempting to C<goto> out of a tied handle method used to cause memory +corruption or crashes. Now it produces an error message instead +[perl #8611]. + +=item * + +A bug has been fixed that occurs when a tied variable is used as a +subroutine reference: if the last thing assigned to or returned from the +variable was a reference or typeglob, the C<\&$tied> could either crash or +return the wrong subroutine. The reference case is a regression introduced +in Perl 5.10.0. For typeglobs, it has probably never worked till now. + +=back + +=head2 Version objects and vstrings + +=over + +=item * + +The bitwise complement operator (and possibly other operators, too) when +passed a vstring would leave vstring magic attached to the return value, +even though the string had changed. This meant that +C<< version->new(~v1.2.3) >> would create a version looking like "v1.2.3" +even though the string passed to C<< version->new >> was actually +"\376\375\374". This also caused L<B::Deparse> to deparse C<~v1.2.3> +incorrectly, without the C<~> [perl #29070]. 
+ +=item * + +Assigning a vstring to a magic (e.g., tied, C<$!>) variable and then +assigning something else used to blow away all magic. This meant that +tied variables would come undone, C<$!> would stop getting updated on +failed system calls, C<$|> would stop setting autoflush, and other +mischief would take place. This has been fixed. + +=item * + +C<< version->new("version") >> and C<printf "%vd", "version"> no longer +crash [perl #102586]. + +=item * + +Version comparisons, such as those that happen implicitly with C<use +v5.43>, no longer cause locale settings to change [perl #105784]. + +=item * + +Version objects no longer cause memory leaks in boolean context +[perl #109762]. + +=back + +=head2 Warnings, redefinition + +=over + +=item * + +Subroutines from the C<autouse> namespace are once more exempt from +redefinition warnings. This used to work in 5.005, but was broken in +5.6 for most subroutines. For subs created via XS that redefine +subroutines from the C<autouse> package, this stopped working in 5.10. + +=item * + +New XSUBs now produce redefinition warnings if they overwrite existing +subs, as they did in 5.8.x. (The C<autouse> logic was reversed in +5.10-14. Only subroutines from the C<autouse> namespace would warn +when clobbered.) + +=item * + +C<newCONSTSUB> used to use compile-time warning hints, instead of +run-time hints. The following code should never produce a redefinition +warning, but it used to, if C<newCONSTSUB> redefined an existing +subroutine: + + use warnings; + BEGIN { + no warnings; + some_XS_function_that_calls_new_CONSTSUB(); + } + +=item * + +Redefinition warnings for constant subroutines are on by default (what +are known as severe warnings in L<perldiag>). This occurred only +when it was a glob assignment or declaration of a Perl subroutine that +caused the warning. If the creation of XSUBs triggered the warning, it +was not a default warning. This has been corrected. 
+ +=item * + +The internal check to see whether a redefinition warning should occur +used to emit "uninitialized" warnings in cases like this: + + use warnings "uninitialized"; + use constant {u => undef, v => undef}; + sub foo(){u} + sub foo(){v} + +=back + +=head2 Warnings, "Uninitialized" + +=over + +=item * + +Various functions that take a filehandle argument in rvalue context +(C<close>, C<readline>, etc.) used to warn twice for an undefined handle +[perl #97482]. + +=item * + +C<dbmopen> now only warns once, rather than three times, if the mode +argument is C<undef> [perl #90064]. + +=item * + +The C<+=> operator does not usually warn when the left-hand side is +C<undef>, but it was doing so for tied variables. This has been fixed +[perl #44895]. + +=item * + +A bug fix in Perl 5.14 introduced a new bug, causing "uninitialized" +warnings to report the wrong variable if the operator in question had +two operands and one was C<%{...}> or C<@{...}>. This has been fixed +[perl #103766]. + +=item * + +C<..> and C<...> in list context now mention the name of the variable in +"uninitialized" warnings for string (as opposed to numeric) ranges. + +=back + +=head2 Weak references + +=over + +=item * + +Weakening the first argument to an automatically-invoked C<DESTROY> method +could result in erroneous "DESTROY created new reference" errors or +crashes. Now it is an error to weaken a read-only reference. + +=item * + +Weak references to lexical hashes going out of scope were not going stale +(becoming undefined), but continued to point to the hash. + +=item * + +Weak references to lexical variables going out of scope are now broken +before any magical methods (e.g., DESTROY on a tie object) are called. +This prevents such methods from modifying the variable that will be seen +the next time the scope is entered. 
+ +=item * + +Creating a weak reference to an @ISA array or accessing the array index +(C<$#ISA>) could result in confused internal bookkeeping for elements +later added to the @ISA array. For instance, creating a weak +reference to the element itself could push that weak reference on to @ISA; +and elements added after use of C<$#ISA> would be ignored by method lookup +[perl #85670]. + +=back + +=head2 Other notable fixes + +=over + +=item * + +C<quotemeta> now quotes consistently the same non-ASCII characters under +C<use feature 'unicode_strings'>, regardless of whether the string is +encoded in UTF-8 or not, hence fixing the last vestiges (we hope) of the +notorious L<perlunicode/The "Unicode Bug">. [perl #77654]. + +Which of these code points is quoted has changed, based on Unicode's +recommendations. See L<perlfunc/quotemeta> for details. + +=item * + +C<study> is now a no-op, presumably fixing all outstanding bugs related to +study causing regex matches to behave incorrectly! + +=item * + +When one writes C<open foo || die>, which used to work in Perl 4, a +"Precedence problem" warning is produced. This warning used erroneously to +apply to fully-qualified bareword handle names not followed by C<||>. This +has been corrected. + +=item * + +After package aliasing (C<*foo:: = *bar::>), C<select> with 0 or 1 argument +would sometimes return a name that could not be used to refer to the +filehandle, or sometimes it would return C<undef> even when a filehandle +was selected. Now it returns a typeglob reference in such cases. + +=item * + +C<PerlIO::get_layers> no longer ignores some arguments that it thinks are +numeric, while treating others as filehandle names. It is now consistent +for flat scalars (i.e., not references). + +=item * + +Unrecognized switches on C<#!> line + +If a switch, such as B<-x>, that cannot occur on the C<#!> line is used +there, perl dies with "Can't emulate...". 
+ +It used to produce the same message for switches that perl did not +recognize at all, whether on the command line or the C<#!> line. + +Now it produces the "Unrecognized switch" error message [perl #104288]. + +=item * + +C<system> now temporarily blocks the SIGCHLD signal handler, to prevent the +signal handler from stealing the exit status [perl #105700]. + +=item * + +The %n formatting code for C<printf> and C<sprintf>, which causes the number +of characters to be assigned to the next argument, now actually +assigns the number of characters, instead of the number of bytes. + +It also works now with special lvalue functions like C<substr> and with +nonexistent hash and array elements [perl #3471, #103492]. + +=item * + +Perl skips copying values returned from a subroutine, for the sake of +speed, if doing so would make no observable difference. Because of faulty +logic, this would happen with the +result of C<delete>, C<shift> or C<splice>, even if the result was +referenced elsewhere. It also did so with tied variables about to be freed +[perl #91844, #95548]. + +=item * + +C<utf8::decode> now refuses to modify read-only scalars [perl #91850]. + +=item * + +Freeing $_ inside a C<grep> or C<map> block, a code block embedded in a +regular expression, or an @INC filter (a subroutine returned by a +subroutine in @INC) used to result in double frees or crashes +[perl #91880, #92254, #92256]. + +=item * + +C<eval> returns C<undef> in scalar context or an empty list in list +context when there is a run-time error. When C<eval> was passed a +string in list context and a syntax error occurred, it used to return a +list containing a single undefined element. Now it returns an empty +list in list context for all errors [perl #80630]. + +=item * + +C<goto &func> no longer crashes, but produces an error message, when +the unwinding of the current subroutine's scope fires a destructor that +undefines the subroutine being "goneto" [perl #99850]. 
+ +=item * + +Perl now holds an extra reference count on the package that code is +currently compiling in. This means that the following code no longer +crashes [perl #101486]: + + package Foo; + BEGIN {*Foo:: = *Bar::} + sub foo; + +=item * + +The C<x> repetition operator no longer crashes on 64-bit builds with large +repeat counts [perl #94560]. + +=item * + +Calling C<require> on an implicit C<$_> when C<*CORE::GLOBAL::require> has +been overridden does not segfault anymore, and C<$_> is now passed to the +overriding subroutine [perl #78260]. + +=item * + +C<use> and C<require> are no longer affected by the I/O layers active in +the caller's scope (enabled by L<open.pm|open>) [perl #96008]. + +=item * + +C<our $::é; $é> (which is invalid) no longer produces the "Compilation +error at lib/utf8_heavy.pl..." error message, which it started emitting in +5.10.0 [perl #99984]. + +=item * + +On 64-bit systems, C<read()> now understands large string offsets beyond +the 32-bit range. + +=item * + +Errors that occur when processing subroutine attributes no longer cause the +subroutine's op tree to leak. + +=item * + +Passing the same constant subroutine to both C<index> and C<formline> no +longer causes one or the other to fail [perl #89218]. (5.14.1) + +=item * + +List assignment to lexical variables declared with attributes in the same +statement (C<my ($x,@y) : blimp = (72,94)>) stopped working in Perl 5.8.0. +It has now been fixed. + +=item * + +Perl 5.10.0 introduced some faulty logic that made "U*" in the middle of +a pack template equivalent to "U0" if the input string was empty. This has +been fixed [perl #90160]. (5.14.2) + +=item * + +Destructors on objects were not called during global destruction on objects +that were not referenced by any scalars. This could happen if an array +element were blessed (e.g., C<bless \$a[0]>) or if a closure referenced a +blessed variable (C<bless \my @a; sub foo { @a }>). 
+ +Now there is an extra pass during global destruction to fire destructors on +any objects that might be left after the usual passes that check for +objects referenced by scalars [perl #36347]. + +=item * + +Fixed a case where it was possible that a freed buffer may have been read +from when parsing a here document [perl #90128]. (5.14.1) + +=item * + +C<each(I<ARRAY>)> is now wrapped in C<defined(...)>, like C<each(I<HASH>)>, +inside a C<while> condition [perl #90888]. + +=item * + +A problem with context propagation when a C<do> block is an argument to +C<return> has been fixed. It used to cause C<undef> to be returned in +certain cases of a C<return> inside an C<if> block which itself is followed by +another C<return>. + +=item * + +Calling C<index> with a tainted constant no longer causes constants in +subsequently compiled code to become tainted [perl #64804]. + +=item * + +Infinite loops like C<1 while 1> used to stop C<strict 'subs'> mode from +working for the rest of the block. + +=item * + +For list assignments like C<($a,$b) = ($b,$a)>, Perl has to make a copy of +the items on the right-hand side before assigning them to the left. For +efficiency's sake, it assigns the values on the right straight to the items +on the left if no one variable is mentioned on both sides, as in C<($a,$b) = +($c,$d)>. The logic for determining when it can cheat was faulty, in that +C<&&> and C<||> on the right-hand side could fool it. So C<($a,$b) = +$some_true_value && ($b,$a)> would end up assigning the value of C<$b> to +both scalars. + +=item * + +Perl no longer tries to apply lvalue context to the string in +C<("string", $variable) ||= 1> (which used to be an error). Since the +left-hand side of C<||=> is evaluated in scalar context, that's a scalar +comma operator, which gives all but the last item void context. There is +no such thing as void lvalue context, so it was a mistake for Perl to try +to force it [perl #96942]. 
+ +=item * + +C<caller> no longer leaks memory when called from the DB package if +C<@DB::args> was assigned to after the first call to C<caller>. L<Carp> +was triggering this bug [perl #97010]. (5.14.2) + +=item * + +C<close> and similar filehandle functions, when called on built-in global +variables (like C<$+>), used to die if the variable happened to hold the +undefined value, instead of producing the usual "Use of uninitialized +value" warning. + +=item * + +When autovivified file handles were introduced in Perl 5.6.0, C<readline> +was inadvertently made to autovivify when called as C<readline($foo)> (but +not as C<E<lt>$fooE<gt>>). It has now been fixed never to autovivify. + +=item * + +Calling an undefined anonymous subroutine (e.g., what $x holds after +C<undef &{$x = sub{}}>) used to cause a "Not a CODE reference" error, which +has been corrected to "Undefined subroutine called" [perl #71154]. + +=item * + +Causing C<@DB::args> to be freed between uses of C<caller> no longer +results in a crash [perl #93320]. + +=item * + +C<setpgrp($foo)> used to be equivalent to C<($foo, setpgrp)>, because +C<setpgrp> was ignoring its argument if there was just one. Now it is +equivalent to C<setpgrp($foo,0)>. + +=item * + +C<shmread> was not setting the scalar flags correctly when reading from +shared memory, causing the existing cached numeric representation in the +scalar to persist [perl #98480]. + +=item * + +C<++> and C<--> now work on copies of globs, instead of dying. + +=item * + +C<splice()> doesn't warn when truncating + +You can now limit the size of an array using C<splice(@a,MAX_LEN)> without +worrying about warnings. + +=item * + +C<< $$ >> is no longer tainted. Since this value comes directly from +C<< getpid() >>, it is always safe. + +=item * + +The parser no longer leaks a filehandle if STDIN was closed before parsing +started [perl #37033]. 
+ +=item * + +C<< die; >> with a non-reference, non-string, or magical (e.g., tainted) +value in $@ now properly propagates that value [perl #111654]. + +=back + +=head1 Known Problems + +=over 4 + +=item * + +On Solaris, we have two kinds of failure. + +If F<make> is Sun's F<make>, we get an error about a badly formed macro +assignment in the F<Makefile>. That happens when F<./Configure> tries to +make depends. F<Configure> then exits 0, but further F<make>-ing fails. + +If F<make> is F<gmake>, F<Configure> completes, then we get errors related +to F</usr/include/stdbool.h> + +=item * + +On Win32, a number of tests hang unless STDERR is redirected. The cause of +this is still under investigation. + +=item * + +When building as root with a umask that prevents files from being +other-readable, F<t/op/filetest.t> will fail. This is a test bug, not a +bug in perl's behavior. + +=item * + +Configuring with a recent gcc and link-time-optimization, such as +C<Configure -Doptimize='-O2 -flto'> fails +because the optimizer optimizes away some of Configure's tests. A +workaround is to omit the C<-flto> flag when running Configure, but add +it back in while actually building, something like + + sh Configure -Doptimize=-O2 + make OPTIMIZE='-O2 -flto' + +=item * + +The following CPAN modules have test failures with perl 5.16. Patches have +been submitted for all of these, so hopefully there will be new releases +soon: + +=over + +=item * + +L<Date::Pcalc> version 6.1 + +=item * + +L<Module::CPANTS::Analyse> version 0.85 + +This fails due to problems in L<Module::Find> 0.10 and L<File::MMagic> +1.27. + +=item * + +L<PerlIO::Util> version 0.72 + +=back + +=back + +=head1 Acknowledgements + +Perl 5.16.0 represents approximately 12 months of development since Perl +5.14.0 and contains approximately 590,000 lines of changes across 2,500 +files from 139 authors. + +Perl continues to flourish into its third decade thanks to a vibrant +community of users and developers. 
The following people are known to +have contributed the improvements that became Perl 5.16.0: + +Aaron Crane, Abhijit Menon-Sen, Abigail, Alan Haggai Alavi, Alberto +Simões, Alexandr Ciornii, Andreas König, Andy Dougherty, Aristotle +Pagaltzis, Bo Johansson, Bo Lindbergh, Breno G. de Oliveira, brian d +foy, Brian Fraser, Brian Greenfield, Carl Hayter, Chas. Owens, +Chia-liang Kao, Chip Salzenberg, Chris 'BinGOs' Williams, Christian +Hansen, Christopher J. Madsen, chromatic, Claes Jacobsson, Claudio +Ramirez, Craig A. Berry, Damian Conway, Daniel Kahn Gillmor, Darin +McBride, Dave Rolsky, David Cantrell, David Golden, David Leadbeater, +David Mitchell, Dee Newcum, Dennis Kaarsemaker, Dominic Hargreaves, +Douglas Christopher Wilson, Eric Brine, Father Chrysostomos, Florian +Ragwitz, Frederic Briere, George Greer, Gerard Goossen, Gisle Aas, +H.Merijn Brand, Hojung Youn, Ian Goodacre, James E Keenan, Jan Dubois, +Jerry D. Hedden, Jesse Luehrs, Jesse Vincent, Jilles Tjoelker, Jim +Cromie, Jim Meyering, Joel Berger, Johan Vromans, Johannes Plunien, John +Hawkinson, John P. Linderman, John Peacock, Joshua ben Jore, Juerd +Waalboer, Karl Williamson, Karthik Rajagopalan, Keith Thompson, Kevin J. +Woolley, Kevin Ryde, Laurent Dami, Leo Lapworth, Leon Brocard, Leon +Timmermans, Louis Strous, Lukas Mai, Marc Green, Marcel Grünauer, Mark +A. 
Stratman, Mark Dootson, Mark Jason Dominus, Martin Hasch, Matthew +Horsfall, Max Maischein, Michael G Schwern, Michael Witten, Mike +Sheldrake, Moritz Lenz, Nicholas Clark, Niko Tyni, Nuno Carvalho, Pau +Amma, Paul Evans, Paul Green, Paul Johnson, Perlover, Peter John Acklam, +Peter Martini, Peter Scott, Phil Monsen, Pino Toscano, Rafael +Garcia-Suarez, Rainer Tammer, Reini Urban, Ricardo Signes, Robin Barker, +Rodolfo Carvalho, Salvador Fandiño, Sam Kimbrel, Samuel Thibault, Shawn +M Moore, Shigeya Suzuki, Shirakata Kentaro, Shlomi Fish, Sisyphus, +Slaven Rezic, Spiros Denaxas, Steffen Müller, Steffen Schwigon, Stephen +Bennett, Stephen Oberholtzer, Stevan Little, Steve Hay, Steve Peters, +Thomas Sibley, Thorsten Glaser, Timothe Litt, Todd Rinaldo, Tom +Christiansen, Tom Hukins, Tony Cook, Vadim Konovalov, Vincent Pit, +Vladimir Timofeev, Walt Mankowski, Yves Orton, Zefram, Zsbán Ambrus, +Ævar Arnfjörð Bjarmason. + +The list above is almost certainly incomplete as it is automatically +generated from version control history. In particular, it does not +include the names of the (very much appreciated) contributors who +reported issues to the Perl bug tracker. + +Many of the changes included in this version originated in the CPAN +modules included in Perl's core. We're grateful to the entire CPAN +community for helping Perl to flourish. + +For a more complete list of all of Perl's historical contributors, +please see the F<AUTHORS> file in the Perl source distribution. + +=head1 Reporting Bugs + +If you find what you think is a bug, you might check the articles +recently posted to the comp.lang.perl.misc newsgroup and the perl +bug database at L<http://rt.perl.org/perlbug/>. There may also be +information at L<http://www.perl.org/>, the Perl Home Page. + +If you believe you have an unreported bug, please run the L<perlbug> +program included with your release. Be sure to trim your bug down +to a tiny but sufficient test case. 
Your bug report, along with the +output of C<perl -V>, will be sent off to perlbug@perl.org to be +analysed by the Perl porting team. + +If the bug you are reporting has security implications, which make it +inappropriate to send to a publicly archived mailing list, then please +send it to perl5-security-report@perl.org. This points to a closed +subscription unarchived mailing list, which includes all core +committers, who will be able to help assess the impact of issues, figure +out a resolution, and help co-ordinate the release of patches to +mitigate or fix the problem across all platforms on which Perl is +supported. Please use this address only for security issues in the Perl +core, not for modules independently distributed on CPAN. + +=head1 SEE ALSO + +The F<Changes> file for an explanation of how to view exhaustive details +on what changed. + +The F<INSTALL> file for how to build Perl. + +The F<README> file for general stuff. + +The F<Artistic> and F<Copying> files for copyright information. + +=cut diff --git a/gnu/usr.bin/perl/pod/perl5161delta.pod b/gnu/usr.bin/perl/pod/perl5161delta.pod new file mode 100644 index 00000000000..406e1cc3e2d --- /dev/null +++ b/gnu/usr.bin/perl/pod/perl5161delta.pod @@ -0,0 +1,198 @@ +=encoding utf8 + +=head1 NAME + +perl5161delta - what is new for perl v5.16.1 + +=head1 DESCRIPTION + +This document describes differences between the 5.16.0 release and +the 5.16.1 release. + +If you are upgrading from an earlier release such as 5.14.0, first read +L<perl5160delta>, which describes differences between 5.14.0 and +5.16.0. + +=head1 Security + +=head2 an off-by-two error in Scalar-List-Util has been fixed + +The bugfix was in Scalar-List-Util 1.23_04, and perl 5.16.1 includes +Scalar-List-Util 1.25. + +=head1 Incompatible Changes + +There are no changes intentionally incompatible with 5.16.0. If any +exist, they are bugs, and we request that you submit a report. See +L</Reporting Bugs> below. 
+ +=head1 Modules and Pragmata + +=head2 Updated Modules and Pragmata + +=over 4 + +=item * + +L<Scalar::Util> and L<List::Util> have been upgraded from version 1.23 to +version 1.25. + +=item * + +L<B::Deparse> has been updated from version 1.14 to 1.14_01. An +"uninitialized" warning emitted by B::Deparse has been squashed +[perl #113464]. + +=back + +=head1 Configuration and Compilation + +=over + +=item * + +Building perl with some Windows compilers used to fail due to a problem +with miniperl's C<glob> operator (which uses the C<perlglob> program) +deleting the PATH environment variable [perl #113798]. + +=back + +=head1 Platform Support + +=head2 Platform-Specific Notes + +=over 4 + +=item VMS + +All C header files from the top-level directory of the distribution are now +installed on VMS, providing consistency with a long-standing practice on other +platforms. Previously only a subset were installed, which broke non-core extension +builds for extensions that depended on the missing include files. + +=back + +=head1 Selected Bug Fixes + +=over 4 + +=item * + +A regression introduced in Perl v5.16.0 involving +C<tr/I<SEARCHLIST>/I<REPLACEMENTLIST>/> has been fixed. Only the first +instance is supposed to be meaningful if a character appears more than +once in C<I<SEARCHLIST>>. Under some circumstances, the final instance +was overriding all earlier ones. [perl #113584] + +=item * + +C<B::COP::stashlen> has been added. This provides access to an internal +field added in perl 5.16 under threaded builds. It was broken at the last +minute before 5.16 was released [perl #113034]. + +=item * + +The L<re> pragma will no longer clobber C<$_>. [perl #113750] + +=item * + +Unicode 6.1 published an incorrect alias for one of the +Canonical_Combining_Class property's values (which range between 0 and +254). The alias C<CCC133> should have been C<CCC132>. Perl now +overrides the data file furnished by Unicode to give the correct value. 
+ +=item * + +Duplicating scalar filehandles works again. [perl #113764] + +=item * + +Under threaded perls, a runtime code block in a regular expression could +corrupt the package name stored in the op tree, resulting in bad reads +in C<caller>, and possibly crashes [perl #113060]. + +=item * + +For efficiency's sake, many operators and built-in functions return the +same scalar each time. Lvalue subroutines and subroutines in the CORE:: +namespace were allowing this implementation detail to leak through. +C<print &CORE::uc("a"), &CORE::uc("b")> used to print "BB". The same thing +would happen with an lvalue subroutine returning the return value of C<uc>. +Now the value is copied in such cases [perl #113044]. + +=item * + +C<__SUB__> now works in special blocks (C<BEGIN>, C<END>, etc.). + +=item * + +Formats that reference lexical variables from outside no longer result +in crashes. + +=back + +=head1 Known Problems + +There are no new known problems, but consult L<perl5160delta/Known +Problems> to see those identified in the 5.16.0 release. + +=head1 Acknowledgements + +Perl 5.16.1 represents approximately 2 months of development since Perl +5.16.0 and contains approximately 14,000 lines of changes across 96 +files from 8 authors. + +Perl continues to flourish into its third decade thanks to a vibrant +community of users and developers. The following people are known to +have contributed the improvements that became Perl 5.16.1: + +Chris 'BinGOs' Williams, Craig A. Berry, Father Chrysostomos, Karl +Williamson, Paul Johnson, Reini Urban, Ricardo Signes, Tony Cook. + +The list above is almost certainly incomplete as it is automatically +generated from version control history. In particular, it does not +include the names of the (very much appreciated) contributors who +reported issues to the Perl bug tracker. + +Many of the changes included in this version originated in the CPAN +modules included in Perl's core. 
We're grateful to the entire CPAN +community for helping Perl to flourish. + +For a more complete list of all of Perl's historical contributors, +please see the F<AUTHORS> file in the Perl source distribution. + +=head1 Reporting Bugs + +If you find what you think is a bug, you might check the articles +recently posted to the comp.lang.perl.misc newsgroup and the perl +bug database at http://rt.perl.org/perlbug/ . There may also be +information at http://www.perl.org/ , the Perl Home Page. + +If you believe you have an unreported bug, please run the L<perlbug> +program included with your release. Be sure to trim your bug down +to a tiny but sufficient test case. Your bug report, along with the +output of C<perl -V>, will be sent off to perlbug@perl.org to be +analysed by the Perl porting team. + +If the bug you are reporting has security implications, which make it +inappropriate to send to a publicly archived mailing list, then please +send it to perl5-security-report@perl.org. This points to a closed +subscription unarchived mailing list, which includes all the core +committers, who will be able to help assess the impact of issues, figure +out a resolution, and help co-ordinate the release of patches to +mitigate or fix the problem across all platforms on which Perl is +supported. Please only use this address for security issues in the Perl +core, not for modules independently distributed on CPAN. + +=head1 SEE ALSO + +The F<Changes> file for an explanation of how to view exhaustive details +on what changed. + +The F<INSTALL> file for how to build Perl. + +The F<README> file for general stuff. + +The F<Artistic> and F<Copying> files for copyright information. 
+ +=cut diff --git a/gnu/usr.bin/perl/pod/perl5162delta.pod b/gnu/usr.bin/perl/pod/perl5162delta.pod new file mode 100644 index 00000000000..f87826f357a --- /dev/null +++ b/gnu/usr.bin/perl/pod/perl5162delta.pod @@ -0,0 +1,125 @@ +=encoding utf8 + +=head1 NAME + +perl5162delta - what is new for perl v5.16.2 + +=head1 DESCRIPTION + +This document describes differences between the 5.16.1 release and +the 5.16.2 release. + +If you are upgrading from an earlier release such as 5.16.0, first read +L<perl5161delta>, which describes differences between 5.16.0 and +5.16.1. + +=head1 Incompatible Changes + +There are no changes intentionally incompatible with 5.16.0. +If any exist, they are bugs, and we request that you submit a +report. See L</Reporting Bugs> below. + +=head1 Modules and Pragmata + +=head2 Updated Modules and Pragmata + +=over 4 + +=item * + +L<Module::CoreList> has been upgraded from version 2.70 to version 2.76. + +=back + +=head1 Configuration and Compilation + +=over 4 + +=item * configuration should no longer be confused by ls colorization + +=back + +=head1 Platform Support + +=head2 Platform-Specific Notes + +=over 4 + +=item AIX + +Configure now always adds -qlanglvl=extc99 to the CC flags on AIX when +using xlC. This will make it easier to compile a number of XS-based modules +that assume C99 [perl #113778]. + +=back + +=head1 Selected Bug Fixes + +=over 4 + +=item * fix /\h/ equivalence with /[\h]/ + +see [perl #114220] + +=back + +=head1 Known Problems + +There are no new known problems. + +=head1 Acknowledgements + +Perl 5.16.2 represents approximately 2 months of development since Perl +5.16.1 and contains approximately 740 lines of changes across 20 files +from 9 authors. + +Perl continues to flourish into its third decade thanks to a vibrant +community of users and developers. The following people are known to +have contributed the improvements that became Perl 5.16.2: + +Andy Dougherty, Craig A. 
Berry, Darin McBride, Dominic Hargreaves, Karen +Etheridge, Karl Williamson, Peter Martini, Ricardo Signes, Tony Cook. + +The list above is almost certainly incomplete as it is automatically +generated from version control history. In particular, it does not +include the names of the (very much appreciated) contributors who +reported issues to the Perl bug tracker. + +For a more complete list of all of Perl's historical contributors, +please see the F<AUTHORS> file in the Perl source distribution. + +=head1 Reporting Bugs + +If you find what you think is a bug, you might check the articles +recently posted to the comp.lang.perl.misc newsgroup and the perl +bug database at http://rt.perl.org/perlbug/ . There may also be +information at http://www.perl.org/ , the Perl Home Page. + +If you believe you have an unreported bug, please run the L<perlbug> +program included with your release. Be sure to trim your bug down +to a tiny but sufficient test case. Your bug report, along with the +output of C<perl -V>, will be sent off to perlbug@perl.org to be +analysed by the Perl porting team. + +If the bug you are reporting has security implications, which make it +inappropriate to send to a publicly archived mailing list, then please +send it to perl5-security-report@perl.org. This points to a closed +subscription unarchived mailing list, which includes all the core +committers, who will be able to help assess the impact of issues, figure +out a resolution, and help co-ordinate the release of patches to +mitigate or fix the problem across all platforms on which Perl is +supported. Please only use this address for security issues in the Perl +core, not for modules independently distributed on CPAN. + +=head1 SEE ALSO + +The F<Changes> file for an explanation of how to view exhaustive details +on what changed. + +The F<INSTALL> file for how to build Perl. + +The F<README> file for general stuff. + +The F<Artistic> and F<Copying> files for copyright information. 
+ +=cut diff --git a/gnu/usr.bin/perl/pod/perl561delta.pod b/gnu/usr.bin/perl/pod/perl561delta.pod index 72c38f15a9b..49ff54f8983 100644 --- a/gnu/usr.bin/perl/pod/perl561delta.pod +++ b/gnu/usr.bin/perl/pod/perl561delta.pod @@ -1,6 +1,6 @@ =head1 NAME -perl561delta - what's new for perl v5.6.x +perl561delta - what's new for perl v5.6.1 =head1 DESCRIPTION @@ -2977,7 +2977,6 @@ You should use the new declaration syntax instead. The C<use attrs> pragma is now obsolete, and is only provided for backward-compatibility. See L<perlsub/"Subroutine Attributes">. - =item Premature end of script headers See Server error. @@ -3364,8 +3363,8 @@ Note that the above issue is not relevant to the default build of Perl, whose interfaces continue to match those of prior versions (but subject to the other options described here). -See L<perlguts/"The Perl API"> for detailed information on the -ramifications of building Perl with this option. +See L<perlguts/Background and PERL_IMPLICIT_CONTEXT> for detailed information +on the ramifications of building Perl with this option. NOTE: PERL_IMPLICIT_CONTEXT is automatically enabled whenever Perl is built with one of -Dusethreads, -Dusemultiplicity, or both. It is not diff --git a/gnu/usr.bin/perl/pod/perl56delta.pod b/gnu/usr.bin/perl/pod/perl56delta.pod index 91b4aede499..24c2072c253 100644 --- a/gnu/usr.bin/perl/pod/perl56delta.pod +++ b/gnu/usr.bin/perl/pod/perl56delta.pod @@ -2372,7 +2372,6 @@ You should use the new declaration syntax instead. The C<use attrs> pragma is now obsolete, and is only provided for backward-compatibility. See L<perlsub/"Subroutine Attributes">. - =item Premature end of script headers See Server error. @@ -2759,7 +2758,8 @@ Note that the above issue is not relevant to the default build of Perl, whose interfaces continue to match those of prior versions (but subject to the other options described here). 
-See L<perlguts/"The Perl API"> for detailed information on the + +See L<perlguts/Background and PERL_IMPLICIT_CONTEXT> for detailed information on the ramifications of building Perl with this option. NOTE: PERL_IMPLICIT_CONTEXT is automatically enabled whenever Perl is built diff --git a/gnu/usr.bin/perl/pod/perl588delta.pod b/gnu/usr.bin/perl/pod/perl588delta.pod index a3d1df35b38..b2203bcf71d 100644 --- a/gnu/usr.bin/perl/pod/perl588delta.pod +++ b/gnu/usr.bin/perl/pod/perl588delta.pod @@ -1,3 +1,5 @@ +=encoding utf8 + =head1 NAME perl588delta - what is new for perl v5.8.8 diff --git a/gnu/usr.bin/perl/pod/perl589delta.pod b/gnu/usr.bin/perl/pod/perl589delta.pod index 2070cc3aa44..f10099ccfa4 100644 --- a/gnu/usr.bin/perl/pod/perl589delta.pod +++ b/gnu/usr.bin/perl/pod/perl589delta.pod @@ -92,7 +92,7 @@ C<system> operator. See L<perlvar> for details. (Contributed by Gisle Aas.) This variable controls the state of the internal UTF-8 offset caching code. 1 for on (the default), 0 for off, -1 to debug the caching code by checking -all its results against linear scans, and panicing on any discrepancy. +all its results against linear scans, and panicking on any discrepancy. =back @@ -1228,7 +1228,7 @@ Hanssen). =item * -L<reverse> function documentation received scalar context examples. +C<reverse> function documentation received scalar context examples. =back @@ -1259,7 +1259,7 @@ L<perlsub> example is updated to use a lexical variable for C<opendir> syntax. L<perlvar> fixes confusion about real GID C<$(> and effective GID C<$)>. Perl thread tutorial example is fixed in section -L<perlthrtut/Queues: Passing Data Around> and L<perlothrtut>. +L<perlthrtut/Queues: Passing Data Around> and L<perlthrtut>. L<perlhack> documentation extensively improved by Jarkko Hietaniemi and others. @@ -1655,7 +1655,7 @@ a C<system> call. 
=item * -Fixed bug RT #37886, symbolic deferencing was allowed in the argument of +Fixed bug RT #37886, symbolic dereferencing was allowed in the argument of C<defined> even under the influence of C<use strict 'refs'>. =item * @@ -1811,7 +1811,7 @@ The process id is no longer truncated to 16 bits on some Windows platforms =item * -Fixed bug RT #54828 in F<perlio.c> where calling C<binmode> on Win32 and Cgywin +Fixed bug RT #54828 in F<perlio.c> where calling C<binmode> on Win32 and Cygwin may cause a segmentation fault. =back @@ -2317,7 +2317,8 @@ the Perl 5 bugs at http://bugs.perl.org/ If the bug you are reporting has security implications, which make it inappropriate to send to a publicly archived mailing list, then please send it to perl5-security-report@perl.org. This points to a closed subscription -unarchived mailing list, which includes all the core committers, who be able +unarchived mailing list, which includes +all the core committers, who will be able to help assess the impact of issues, figure out a resolution, and help co-ordinate the release of patches to mitigate or fix the problem across all platforms on which Perl is supported. Please only use this address for security diff --git a/gnu/usr.bin/perl/pod/perlboot.pod b/gnu/usr.bin/perl/pod/perlboot.pod index a6b256a0fcc..15b7117823a 100644 --- a/gnu/usr.bin/perl/pod/perlboot.pod +++ b/gnu/usr.bin/perl/pod/perlboot.pod @@ -1,884 +1,12 @@ +=encoding utf8 + =head1 NAME -perlboot - Beginner's Object-Oriented Tutorial +perlboot - This document has been deleted =head1 DESCRIPTION -If you're not familiar with objects from other languages, some of the -other Perl object documentation may be a little daunting, such as -L<perlobj>, a basic reference in using objects, and L<perltoot>, which -introduces readers to the peculiarities of Perl's object system in a -tutorial way. - -So, let's take a different approach, presuming no prior object -experience. 
It helps if you know about subroutines (L<perlsub>), -references (L<perlref> et. seq.), and packages (L<perlmod>), so become -familiar with those first if you haven't already. - -=head2 If we could talk to the animals... - -Let's let the animals talk for a moment: - - sub Cow::speak { - print "a Cow goes moooo!\n"; - } - sub Horse::speak { - print "a Horse goes neigh!\n"; - } - sub Sheep::speak { - print "a Sheep goes baaaah!\n"; - } - - Cow::speak; - Horse::speak; - Sheep::speak; - -This results in: - - a Cow goes moooo! - a Horse goes neigh! - a Sheep goes baaaah! - -Nothing spectacular here. Simple subroutines, albeit from separate -packages, and called using the full package name. So let's create -an entire pasture: - - # Cow::speak, Horse::speak, Sheep::speak as before - @pasture = qw(Cow Cow Horse Sheep Sheep); - foreach $animal (@pasture) { - &{$animal."::speak"}; - } - -This results in: - - a Cow goes moooo! - a Cow goes moooo! - a Horse goes neigh! - a Sheep goes baaaah! - a Sheep goes baaaah! - -Wow. That symbolic coderef de-referencing there is pretty nasty. -We're counting on C<no strict refs> mode, certainly not recommended -for larger programs. And why was that necessary? Because the name of -the package seems to be inseparable from the name of the subroutine we -want to invoke within that package. - -Or is it? - -=head2 Introducing the method invocation arrow - -For now, let's say that C<< Class->method >> invokes subroutine -C<method> in package C<Class>. (Here, "Class" is used in its -"category" meaning, not its "scholastic" meaning.) That's not -completely accurate, but we'll do this one step at a time. Now let's -use it like so: - - # Cow::speak, Horse::speak, Sheep::speak as before - Cow->speak; - Horse->speak; - Sheep->speak; - -And once again, this results in: - - a Cow goes moooo! - a Horse goes neigh! - a Sheep goes baaaah! - -That's not fun yet. Same number of characters, all constant, no -variables. But yet, the parts are separable now. 
Watch: - - $a = "Cow"; - $a->speak; # invokes Cow->speak - -Ahh! Now that the package name has been parted from the subroutine -name, we can use a variable package name. And this time, we've got -something that works even when C<use strict refs> is enabled. - -=head2 Invoking a barnyard - -Let's take that new arrow invocation and put it back in the barnyard -example: - - sub Cow::speak { - print "a Cow goes moooo!\n"; - } - sub Horse::speak { - print "a Horse goes neigh!\n"; - } - sub Sheep::speak { - print "a Sheep goes baaaah!\n"; - } - - @pasture = qw(Cow Cow Horse Sheep Sheep); - foreach $animal (@pasture) { - $animal->speak; - } - -There! Now we have the animals all talking, and safely at that, -without the use of symbolic coderefs. - -But look at all that common code. Each of the C<speak> routines has a -similar structure: a C<print> operator and a string that contains -common text, except for two of the words. It'd be nice if we could -factor out the commonality, in case we decide later to change it all -to C<says> instead of C<goes>. - -And we actually have a way of doing that without much fuss, but we -have to hear a bit more about what the method invocation arrow is -actually doing for us. - -=head2 The extra parameter of method invocation - -The invocation of: - - Class->method(@args) - -attempts to invoke subroutine C<Class::method> as: - - Class::method("Class", @args); - -(If the subroutine can't be found, "inheritance" kicks in, but we'll -get to that later.) This means that we get the class name as the -first parameter (the only parameter, if no arguments are given). 
So -we can rewrite the C<Sheep> speaking subroutine as: - - sub Sheep::speak { - my $class = shift; - print "a $class goes baaaah!\n"; - } - -And the other two animals come out similarly: - - sub Cow::speak { - my $class = shift; - print "a $class goes moooo!\n"; - } - sub Horse::speak { - my $class = shift; - print "a $class goes neigh!\n"; - } - -In each case, C<$class> will get the value appropriate for that -subroutine. But once again, we have a lot of similar structure. Can -we factor that out even further? Yes, by calling another method in -the same class. - -=head2 Calling a second method to simplify things - -Let's call out from C<speak> to a helper method called C<sound>. -This method provides the constant text for the sound itself. - - { package Cow; - sub sound { "moooo" } - sub speak { - my $class = shift; - print "a $class goes ", $class->sound, "!\n"; - } - } - -Now, when we call C<< Cow->speak >>, we get a C<$class> of C<Cow> in -C<speak>. This in turn selects the C<< Cow->sound >> method, which -returns C<moooo>. But how different would this be for the C<Horse>? - - { package Horse; - sub sound { "neigh" } - sub speak { - my $class = shift; - print "a $class goes ", $class->sound, "!\n"; - } - } - -Only the name of the package and the specific sound change. So can we -somehow share the definition for C<speak> between the Cow and the -Horse? Yes, with inheritance! - -=head2 Inheriting the windpipes - -We'll define a common subroutine package called C<Animal>, with the -definition for C<speak>: - - { package Animal; - sub speak { - my $class = shift; - print "a $class goes ", $class->sound, "!\n"; - } - } - -Then, for each animal, we say it "inherits" from C<Animal>, along -with the animal-specific sound: - - { package Cow; - @ISA = qw(Animal); - sub sound { "moooo" } - } - -Note the added C<@ISA> array (pronounced "is a"). We'll get to that in a minute. - -But what happens when we invoke C<< Cow->speak >> now? 
- -First, Perl constructs the argument list. In this case, it's just -C<Cow>. Then Perl looks for C<Cow::speak>. But that's not there, so -Perl checks for the inheritance array C<@Cow::ISA>. It's there, -and contains the single name C<Animal>. - -Perl next checks for C<speak> inside C<Animal> instead, as in -C<Animal::speak>. And that's found, so Perl invokes that subroutine -with the already frozen argument list. - -Inside the C<Animal::speak> subroutine, C<$class> becomes C<Cow> (the -first argument). So when we get to the step of invoking -C<< $class->sound >>, it'll be looking for C<< Cow->sound >>, which -gets it on the first try without looking at C<@ISA>. Success! - -=head2 A few notes about @ISA - -This magical C<@ISA> variable has declared that C<Cow> "is a" C<Animal>. -Note that it's an array, not a simple single value, because on rare -occasions, it makes sense to have more than one parent class searched -for the missing methods. - -If C<Animal> also had an C<@ISA>, then we'd check there too. The -search is recursive, depth-first, left-to-right in each C<@ISA> by -default (see L<mro> for alternatives). Typically, each C<@ISA> has -only one element (multiple elements means multiple inheritance and -multiple headaches), so we get a nice tree of inheritance. - -When we turn on C<use strict>, we'll get complaints on C<@ISA>, since -it's not a variable containing an explicit package name, nor is it a -lexical ("my") variable. We can't make it a lexical variable though -(it has to belong to the package to be found by the inheritance mechanism), -so there's a couple of straightforward ways to handle that. 
- -The easiest is to just spell the package name out: - - @Cow::ISA = qw(Animal); - -Or declare it as package global variable: - - package Cow; - our @ISA = qw(Animal); - -Or allow it as an implicitly named package variable: - - package Cow; - use vars qw(@ISA); - @ISA = qw(Animal); - -If the C<Animal> class comes from another (object-oriented) module, then -just employ C<use base> to specify that C<Animal> should serve as the basis -for the C<Cow> class: - - package Cow; - use base qw(Animal); - -Now that's pretty darn simple! - -=head2 Overriding the methods - -Let's add a mouse, which can barely be heard: - - # Animal package from before - { package Mouse; - @ISA = qw(Animal); - sub sound { "squeak" } - sub speak { - my $class = shift; - print "a $class goes ", $class->sound, "!\n"; - print "[but you can barely hear it!]\n"; - } - } - - Mouse->speak; - -which results in: - - a Mouse goes squeak! - [but you can barely hear it!] - -Here, C<Mouse> has its own speaking routine, so C<< Mouse->speak >> -doesn't immediately invoke C<< Animal->speak >>. This is known as -"overriding". In fact, we don't even need to say that a C<Mouse> is -an C<Animal> at all, because all of the methods needed for C<speak> are -completely defined for C<Mouse>; this is known as "duck typing": -"If it walks like a duck and quacks like a duck, I would call it a duck" -(James Whitcomb). However, it would probably be beneficial to allow a -closer examination to conclude that a C<Mouse> is indeed an C<Animal>, -so it is actually better to define C<Mouse> with C<Animal> as its base -(that is, it is better to "derive C<Mouse> from C<Animal>"). - -Moreover, this duplication of code could become a maintenance headache -(though code-reuse is not actually a good reason for inheritance; good -design practices dictate that a derived class should be usable wherever -its base class is usable, which might not be the outcome if code-reuse -is the sole criterion for inheritance. 
Just remember that a C<Mouse> -should always act like an C<Animal>). - -So, let's make C<Mouse> an C<Animal>! - -The obvious solution is to invoke C<Animal::speak> directly: - - # Animal package from before - { package Mouse; - @ISA = qw(Animal); - sub sound { "squeak" } - sub speak { - my $class = shift; - Animal::speak($class); - print "[but you can barely hear it!]\n"; - } - } - -Note that we're using C<Animal::speak>. If we were to invoke -C<< Animal->speak >> instead, the first parameter to C<Animal::speak> -would automatically be C<"Animal"> rather than C<"Mouse">, so that -the call to C<< $class->sound >> in C<Animal::speak> would become -C<< Animal->sound >> rather than C<< Mouse->sound >>. - -Also, without the method arrow C<< -> >>, it becomes necessary to specify -the first parameter to C<Animal::speak> ourselves, which is why C<$class> -is explicitly passed: C<Animal::speak($class)>. - -However, invoking C<Animal::speak> directly is a mess: Firstly, it assumes -that the C<speak> method is a member of the C<Animal> class; what if C<Animal> -actually inherits C<speak> from its own base? Because we are no longer using -C<< -> >> to access C<speak>, the special method look up mechanism wouldn't be -used, so C<speak> wouldn't even be found! - -The second problem is more subtle: C<Animal> is now hardwired into the subroutine -selection. Let's assume that C<Animal::speak> does exist. What happens when, -at a later time, someone expands the class hierarchy by having C<Mouse> -inherit from C<Mus> instead of C<Animal>. Unless the invocation of C<Animal::speak> -is also changed to an invocation of C<Mus::speak>, centuries worth of taxonomical -classification could be obliterated! - -What we have here is a fragile or leaky abstraction; it is the beginning of a -maintenance nightmare. What we need is the ability to search for the right -method wih as few assumptions as possible. 
- -=head2 Starting the search from a different place - -A I<better> solution is to tell Perl where in the inheritance chain to begin searching -for C<speak>. This can be achieved with a modified version of the method arrow C<< -> >>: - - ClassName->FirstPlaceToLook::method - -So, the improved C<Mouse> class is: - - # same Animal as before - { package Mouse; - # same @ISA, &sound as before - sub speak { - my $class = shift; - $class->Animal::speak; - print "[but you can barely hear it!]\n"; - } - } - -Using this syntax, we start with C<Animal> to find C<speak>, and then -use all of C<Animal>'s inheritance chain if it is not found immediately. -As usual, the first parameter to C<speak> would be C<$class>, so we no -longer need to pass C<$class> explicitly to C<speak>. - -But what about the second problem? We're still hardwiring C<Animal> into -the method lookup. - -=head2 The SUPER way of doing things - -If C<Animal> is replaced with the special placeholder C<SUPER> in that -invocation, then the contents of C<Mouse>'s C<@ISA> are used for the -search, beginning with C<$ISA[0]>. So, all of the problems can be fixed -as follows: - - # same Animal as before - { package Mouse; - # same @ISA, &sound as before - sub speak { - my $class = shift; - $class->SUPER::speak; - print "[but you can barely hear it!]\n"; - } - } - -In general, C<SUPER::speak> means look in the current package's C<@ISA> -for a class that implements C<speak>, and invoke the first one found. -The placeholder is called C<SUPER>, because many other languages refer -to base classes as "I<super>classes", and Perl likes to be eclectic. - -Note that a call such as - - $class->SUPER::method; - -does I<not> look in the C<@ISA> of C<$class> unless C<$class> happens to -be the current package. - -=head2 Let's review... 
- -So far, we've seen the method arrow syntax: - - Class->method(@args); - -or the equivalent: - - $a = "Class"; - $a->method(@args); - -which constructs an argument list of: - - ("Class", @args) - -and attempts to invoke: - - Class::method("Class", @args); - -However, if C<Class::method> is not found, then C<@Class::ISA> is examined -(recursively) to locate a class (a package) that does indeed contain C<method>, -and that subroutine is invoked instead. - -Using this simple syntax, we have class methods, (multiple) inheritance, -overriding, and extending. Using just what we've seen so far, we've -been able to factor out common code (though that's never a good reason -for inheritance!), and provide a nice way to reuse implementations with -variations. - -Now, what about data? - -=head2 A horse is a horse, of course of course, or is it? - -Let's start with the code for the C<Animal> class -and the C<Horse> class: - - { package Animal; - sub speak { - my $class = shift; - print "a $class goes ", $class->sound, "!\n"; - } - } - { package Horse; - @ISA = qw(Animal); - sub sound { "neigh" } - } - -This lets us invoke C<< Horse->speak >> to ripple upward to -C<Animal::speak>, calling back to C<Horse::sound> to get the specific -sound, and the output of: - - a Horse goes neigh! - -But all of our Horse objects would have to be absolutely identical. -If we add a subroutine, all horses automatically share it. That's -great for making horses the same, but how do we capture the -distinctions of an individual horse? For example, suppose we want -to give our first horse a name. There's got to be a way to keep its -name separate from the other horses. - -That is to say, we want particular instances of C<Horse> to have -different names. - -In Perl, any reference can be an "instance", so let's start with the -simplest reference that can hold a horse's name: a scalar reference. - - my $name = "Mr. 
Ed"; - my $horse = \$name; - -So, now C<$horse> is a reference to what will be the instance-specific -data (the name). The final step is to turn this reference into a real -instance of a C<Horse> by using the special operator C<bless>: - - bless $horse, Horse; - -This operator stores information about the package named C<Horse> into -the thing pointed at by the reference. At this point, we say -C<$horse> is an instance of C<Horse>. That is, it's a specific -horse. The reference is otherwise unchanged, and can still be used -with traditional dereferencing operators. - -=head2 Invoking an instance method - -The method arrow can be used on instances, as well as classes (the names -of packages). So, let's get the sound that C<$horse> makes: - - my $noise = $horse->sound("some", "unnecessary", "args"); - -To invoke C<sound>, Perl first notes that C<$horse> is a blessed -reference (and thus an instance). It then constructs an argument -list, as per usual. - -Now for the fun part: Perl takes the class in which the instance was -blessed, in this case C<Horse>, and uses that class to locate the -subroutine. In this case, C<Horse::sound> is found directly (without -using inheritance). In the end, it is as though our initial line were -written as follows: - - my $noise = Horse::sound($horse, "some", "unnecessary", "args"); - -Note that the first parameter here is still the instance, not the name -of the class as before. We'll get C<neigh> as the return value, and -that'll end up as the C<$noise> variable above. - -If Horse::sound had not been found, we'd be wandering up the C<@Horse::ISA> -array, trying to find the method in one of the superclasses. The only -difference between a class method and an instance method is whether the -first parameter is an instance (a blessed reference) or a class name (a -string). - -=head2 Accessing the instance data - -Because we get the instance as the first parameter, we can now access -the instance-specific data. 
In this case, let's add a way to get at -the name: - - { package Horse; - @ISA = qw(Animal); - sub sound { "neigh" } - sub name { - my $self = shift; - $$self; - } - } - -Inside C<Horse::name>, the C<@_> array contains: - - ($horse, "some", "unnecessary", "args") - -so the C<shift> stores C<$horse> into C<$self>. Then, C<$self> gets -de-referenced with C<$$self> as normal, yielding C<"Mr. Ed">. - -It's traditional to C<shift> the first parameter into a variable named -C<$self> for instance methods and into a variable named C<$class> for -class methods. - -Then, the following line: - - print $horse->name, " says ", $horse->sound, "\n"; - -outputs: - - Mr. Ed says neigh. - -=head2 How to build a horse - -Of course, if we constructed all of our horses by hand, we'd most -likely make mistakes from time to time. We're also violating one of -the properties of object-oriented programming, in that the "inside -guts" of a Horse are visible. That's good if you're a veterinarian, -but not if you just like to own horses. So, let's have the Horse -class handle the details inside a class method: - - { package Horse; - @ISA = qw(Animal); - sub sound { "neigh" } - sub name { - my $self = shift; # instance method, so use $self - $$self; - } - sub named { - my $class = shift; # class method, so use $class - my $name = shift; - bless \$name, $class; - } - } - -Now with the new C<named> method, we can build a horse as follows: - - my $horse = Horse->named("Mr. Ed"); - -Notice we're back to a class method, so the two arguments to -C<Horse::named> are C<Horse> and C<Mr. Ed>. The C<bless> operator -not only blesses C<\$name>, it also returns that reference. - -This C<Horse::named> method is called a "constructor". - -We've called the constructor C<named> here, so that it quickly denotes -the constructor's argument as the name for this particular C<Horse>. 
-You can use different constructors with different names for different -ways of "giving birth" to the object (like maybe recording its -pedigree or date of birth). However, you'll find that most people -coming to Perl from more limited languages use a single constructor -named C<new>, with various ways of interpreting the arguments to -C<new>. Either style is fine, as long as you document your particular -way of giving birth to an object. (And you I<were> going to do that, -right?) - -=head2 Inheriting the constructor - -But was there anything specific to C<Horse> in that method? No. Therefore, -it's also the same recipe for building anything else that inherited from -C<Animal>, so let's put C<name> and C<named> there: - - { package Animal; - sub speak { - my $class = shift; - print "a $class goes ", $class->sound, "!\n"; - } - sub name { - my $self = shift; - $$self; - } - sub named { - my $class = shift; - my $name = shift; - bless \$name, $class; - } - } - { package Horse; - @ISA = qw(Animal); - sub sound { "neigh" } - } - -Ahh, but what happens if we invoke C<speak> on an instance? - - my $horse = Horse->named("Mr. Ed"); - $horse->speak; - -We get a debugging value: - - a Horse=SCALAR(0xaca42ac) goes neigh! - -Why? Because the C<Animal::speak> routine is expecting a classname as -its first parameter, not an instance. When the instance is passed in, -we'll end up using a blessed scalar reference as a string, and that -shows up as we saw it just now. - -=head2 Making a method work with either classes or instances - -All we need is for a method to detect if it is being called on a class -or called on an instance. The most straightforward way is with the -C<ref> operator. This returns a string (the classname) when used on a -blessed reference, and an empty string when used on a string (like a -classname). Let's modify the C<name> method first to notice the change: - - sub name { - my $either = shift; - ref $either ? 
$$either : "Any $either"; - } - -Here, the C<?:> operator comes in handy to select either the -dereference or a derived string. Now we can use this with either an -instance or a class. Note that I've changed the first parameter -holder to C<$either> to show that this is intended: - - my $horse = Horse->named("Mr. Ed"); - print Horse->name, "\n"; # prints "Any Horse\n" - print $horse->name, "\n"; # prints "Mr Ed.\n" - -and now we'll fix C<speak> to use this: - - sub speak { - my $either = shift; - print $either->name, " goes ", $either->sound, "\n"; - } - -And since C<sound> already worked with either a class or an instance, -we're done! - -=head2 Adding parameters to a method - -Let's train our animals to eat: - - { package Animal; - sub named { - my $class = shift; - my $name = shift; - bless \$name, $class; - } - sub name { - my $either = shift; - ref $either ? $$either : "Any $either"; - } - sub speak { - my $either = shift; - print $either->name, " goes ", $either->sound, "\n"; - } - sub eat { - my $either = shift; - my $food = shift; - print $either->name, " eats $food.\n"; - } - } - { package Horse; - @ISA = qw(Animal); - sub sound { "neigh" } - } - { package Sheep; - @ISA = qw(Animal); - sub sound { "baaaah" } - } - -And now try it out: - - my $horse = Horse->named("Mr. Ed"); - $horse->eat("hay"); - Sheep->eat("grass"); - -which prints: - - Mr. Ed eats hay. - Any Sheep eats grass. - -An instance method with parameters gets invoked with the instance, -and then the list of parameters. So that first invocation is like: - - Animal::eat($horse, "hay"); - -=head2 More interesting instances - -What if an instance needs more data? Most interesting instances are -made of many items, each of which can in turn be a reference or even -another object. The easiest way to store these is often in a hash. 
-The keys of the hash serve as the names of parts of the object (often -called "instance variables" or "member variables"), and the -corresponding values are, well, the values. - -But how do we turn the horse into a hash? Recall that an object was -any blessed reference. We can just as easily make it a blessed hash -reference as a blessed scalar reference, as long as everything that -looks at the reference is changed accordingly. - -Let's make a sheep that has a name and a color: - - my $bad = bless { Name => "Evil", Color => "black" }, Sheep; - -so C<< $bad->{Name} >> has C<Evil>, and C<< $bad->{Color} >> has -C<black>. But we want to make C<< $bad->name >> access the name, and -that's now messed up because it's expecting a scalar reference. Not -to worry, because that's pretty easy to fix up. - -One solution is to override C<Animal::name> and C<Animal::named> by -defining them anew in C<Sheep>, but then any methods added later to -C<Animal> might still mess up, and we'd have to override all of those -too. Therefore, it's never a good idea to define the data layout in a -way that's different from the data layout of the base classes. In fact, -it's a good idea to use blessed hash references in all cases. Also, this -is why it's important to have constructors do the low-level work. So, -let's redefine C<Animal>: - - ## in Animal - sub name { - my $either = shift; - ref $either ? $either->{Name} : "Any $either"; - } - sub named { - my $class = shift; - my $name = shift; - my $self = { Name => $name }; - bless $self, $class; - } - -Of course, we still need to override C<named> in order to handle -constructing a C<Sheep> with a certain color: - - ## in Sheep - sub named { - my ($class, $name) = @_; - my $self = $class->SUPER::named(@_); - $$self{Color} = $class->default_color; - $self - } - -(Note that C<@_> contains the parameters to C<named>.) - -What's this C<default_color>? 
Well, if C<named> has only the name, -we still need to set a color, so we'll have a class-specific default color. -For a sheep, we might define it as white: - - ## in Sheep - sub default_color { "white" } - -Now: - - my $sheep = Sheep->named("Bad"); - print $sheep->{Color}, "\n"; - -outputs: - - white - -Now, there's nothing particularly specific to C<Sheep> when it comes -to color, so let's remove C<Sheep::named> and implement C<Animal::named> -to handle color instead: - - ## in Animal - sub named { - my ($class, $name) = @_; - my $self = { Name => $name, Color => $class->default_color }; - bless $self, $class; - } - -And then to keep from having to define C<default_color> for each additional -class, we'll define a method that serves as the "default default" directly -in C<Animal>: - - ## in Animal - sub default_color { "brown" } - -Of course, because C<name> and C<named> were the only methods that -referenced the "structure" of the object, the rest of the methods can -remain the same, so C<speak> still works as before. - -=head2 A horse of a different color - -But having all our horses be brown would be boring. So let's add a -method or two to get and set the color. - - ## in Animal - sub color { - $_[0]->{Color} - } - sub set_color { - $_[0]->{Color} = $_[1]; - } - -Note the alternate way of accessing the arguments: C<$_[0]> is used -in-place, rather than with a C<shift>. (This saves us a bit of time -for something that may be invoked frequently.) And now we can fix -that color for Mr. Ed: - - my $horse = Horse->named("Mr. Ed"); - $horse->set_color("black-and-white"); - print $horse->name, " is colored ", $horse->color, "\n"; - -which results in: - - Mr. Ed is colored black-and-white - -=head2 Summary - -So, now we have class methods, constructors, instance methods, instance -data, and even accessors. But that's still just the beginning of what -Perl has to offer. 
We haven't even begun to talk about accessors that -double as getters and setters, destructors, indirect object notation, -overloading, "isa" and "can" tests, the C<UNIVERSAL> class, and so on. -That's for the rest of the Perl documentation to cover. Hopefully, this -gets you started, though. - -=head1 SEE ALSO - -For more information, see L<perlobj> (for all the gritty details about -Perl objects, now that you've seen the basics), L<perltoot> (the -tutorial for those who already know objects), L<perltooc> (dealing -with class data), L<perlbot> (for some more tricks), and books such as -Damian Conway's excellent I<Object Oriented Perl>. - -Some modules which might prove interesting are Class::Accessor, -Class::Class, Class::Contract, Class::Data::Inheritable, -Class::MethodMaker and Tie::SecureHash - -=head1 COPYRIGHT - -Copyright (c) 1999, 2000 by Randal L. Schwartz and Stonehenge -Consulting Services, Inc. - -Copyright (c) 2009 by Michael F. Witten. - -Permission is hereby granted to distribute this document intact with -the Perl distribution, and in accordance with the licenses of the Perl -distribution; derived documents must include this copyright notice -intact. - -Portions of this text have been derived from Perl Training materials -originally appearing in the I<Packages, References, Objects, and -Modules> course taught by instructors for Stonehenge Consulting -Services, Inc. and used with permission. +For information on OO programming with Perl, please see L<perlootut> +and L<perlobj>. -Portions of this text have been derived from materials originally -appearing in I<Linux Magazine> and used with permission. +=cut diff --git a/gnu/usr.bin/perl/pod/perlcheat.pod b/gnu/usr.bin/perl/pod/perlcheat.pod index 7f2c83066f6..deee2fecdfb 100644 --- a/gnu/usr.bin/perl/pod/perlcheat.pod +++ b/gnu/usr.bin/perl/pod/perlcheat.pod @@ -10,68 +10,70 @@ already be overwhelming. 
=head2 The sheet - CONTEXTS SIGILS ARRAYS HASHES - void $scalar whole: @array %hash - scalar @array slice: @array[0, 2] @hash{'a', 'b'} - list %hash element: $array[0] $hash{'a'} - &sub - *glob SCALAR VALUES - number, string, reference, glob, undef + CONTEXTS SIGILS ref ARRAYS HASHES + void $scalar SCALAR @array %hash + scalar @array ARRAY @array[0, 2] @hash{'a', 'b'} + list %hash HASH $array[0] $hash{'a'} + &sub CODE + *glob GLOB SCALAR VALUES + FORMAT number, string, ref, glob, undef REFERENCES - \ references $$foo[1] aka $foo->[1] - $@%&* dereference $$foo{bar} aka $foo->{bar} - [] anon. arrayref ${$$foo[1]}[2] aka $foo->[1]->[2] - {} anon. hashref ${$$foo[1]}[2] aka $foo->[1][2] - \() list of refs - NUMBERS vs STRINGS LINKS - OPERATOR PRECEDENCE = = perl.plover.com - -> + . search.cpan.org - ++ -- == != eq ne cpan.org - ** < > <= >= lt gt le ge pm.org - ! ~ \ u+ u- <=> cmp tpj.com - =~ !~ perldoc.com - * / % x SYNTAX - + - . for (LIST) { }, for (a;b;c) { } - << >> while ( ) { }, until ( ) { } - named uops if ( ) { } elsif ( ) { } else { } - < > <= >= lt gt le ge unless ( ) { } elsif ( ) { } else { } - == != <=> eq ne cmp ~~ for equals foreach (ALWAYS) + \ reference $$foo[1] aka $foo->[1] + $@%&* dereference $$foo{bar} aka $foo->{bar} + [] anon. arrayref ${$$foo[1]}[2] aka $foo->[1]->[2] + {} anon. hashref ${$$foo[1]}[2] aka $foo->[1][2] + \() list of refs + NUMBERS vs STRINGS LINKS + OPERATOR PRECEDENCE = = perldoc.perl.org + -> + . search.cpan.org + ++ -- == != eq ne cpan.org + ** < > <= >= lt gt le ge pm.org + ! ~ \ u+ u- <=> cmp p3rl.org + =~ !~ perlmonks.org + * / % x SYNTAX + + - . foreach (LIST) { } for (a;b;c) { } + << >> while (e) { } until (e) { } + named uops if (e) { } elsif (e) { } else { } + < > <= >= lt gt le ge unless (e) { } elsif (e) { } else { } + == != <=> eq ne cmp ~~ given (e) { when (e) {} default {} } & - | ^ REGEX METACHARS REGEX MODIFIERS - && ^ string begin /i case insens. - || // $ str. end (before \n) /m line based ^$ - .. ... 
+ one or more /s . includes \n - ?: * zero or more /x ign. wh.space - = += -= *= etc. ? zero or one /g global - , => {3,7} repeat in range /o cmpl pat. once - list ops () capture - not (?:) no capture REGEX CHARCLASSES - and [] character class . == [^\n] - or xor | alternation \s == whitespace - \b word boundary \w == word characters - \z string end \d == digits - DO \S, \W and \D negate - use strict; DON'T - use warnings; "$foo" LINKS - my $var; $$variable_name perl.com - open() or die $!; `$userinput` use.perl.org - use Modules; /$userinput/ perl.apache.org - + | ^ REGEX METACHARS REGEX MODIFIERS + && ^ string begin /i case insensitive + || // $ str end (bfr \n) /m line based ^$ + .. ... + one or more /s . includes \n + ?: * zero or more /x ignore wh.space + = += -= *= etc ? zero or one /p preserve + , => {3,7} repeat in range /a ASCII /aa safe + list ops | alternation /l locale /d dual + not [] character class /u Unicode + and \b word boundary /e evaluate /ee rpts + or xor \z string end /g global + () capture /o compile pat once + DEBUG (?:p) no capture + -MO=Deparse (?#t) comment REGEX CHARCLASSES + -MO=Terse (?=p) ZW pos ahead . [^\n] + -D## (?!p) ZW neg ahead \s whitespace + -d:Trace (?<=p) ZW pos behind \K \w word chars + (?<!p) ZW neg behind \d digits + CONFIGURATION (?>p) no backtrack \pP named property + perl -V:ivsize (?|p|p)branch reset \h horiz.wh.space + (?&NM) cap to name \R linebreak + \S \W \D \H negate FUNCTION RETURN LISTS stat localtime caller SPECIAL VARIABLES - 0 dev 0 second 0 package $_ default variable - 1 ino 1 minute 1 filename $0 program name - 2 mode 2 hour 2 line $/ input separator - 3 nlink 3 day 3 subroutine $\ output separator - 4 uid 4 month-1 4 hasargs $| autoflush - 5 gid 5 year-1900 5 wantarray $! sys/libcall error - 6 rdev 6 weekday 6 evaltext $@ eval error - 7 size 7 yearday 7 is_require $$ process ID - 8 atime 8 is_dst 8 hints $. 
line number - 9 mtime 9 bitmask @ARGV command line args - 10 ctime just use @INC include paths - 11 blksz POSIX:: 3..9 only @_ subroutine args - 12 blcks strftime! with EXPR %ENV environment + 0 dev 0 second 0 package $_ default variable + 1 ino 1 minute 1 filename $0 program name + 2 mode 2 hour 2 line $/ input separator + 3 nlink 3 day 3 subroutine $\ output separator + 4 uid 4 month-1 4 hasargs $| autoflush + 5 gid 5 year-1900 5 wantarray $! sys/libcall error + 6 rdev 6 weekday 6 evaltext $@ eval error + 7 size 7 yearday 7 is_require $$ process ID + 8 atime 8 is_dst 8 hints $. line number + 9 mtime 9 bitmask @ARGV command line args + 10 ctime 10 hinthash @INC include paths + 11 blksz 3..10 only @_ subroutine args + 12 blcks with EXPR %ENV environment =head1 ACKNOWLEDGEMENTS @@ -88,6 +90,18 @@ Juerd Waalboer <#####@juerd.nl>, with the help of many Perl Monks. =head1 SEE ALSO - http://perlmonks.org/?node_id=216602 the original PM post - http://perlmonks.org/?node_id=238031 Damian Conway's Perl 6 version - http://juerd.nl/site.plp/perlcheat home of the Perl Cheat Sheet +=over 4 + +=item * + +L<http://perlmonks.org/?node_id=216602> - the original PM post + +=item * + +L<http://perlmonks.org/?node_id=238031> - Damian Conway's Perl 6 version + +=item * + +L<http://juerd.nl/site.plp/perlcheat> - home of the Perl Cheat Sheet + +=back diff --git a/gnu/usr.bin/perl/pod/perlclib.pod b/gnu/usr.bin/perl/pod/perlclib.pod index 1fe4699be1b..0785577dace 100644 --- a/gnu/usr.bin/perl/pod/perlclib.pod +++ b/gnu/usr.bin/perl/pod/perlclib.pod @@ -138,7 +138,7 @@ pattern into it that should be illegal as pointers (and floating point numbers), and also hopefully surprising enough as integers, so that any code attempting to use the data without forethought will break sooner rather than later. 
Poisoning can be done using the Poison() -macros, which have similar arguments as Zero(): +macros, which have similar arguments to Zero(): PoisonWith(dst, n, t, b) scribble memory with byte b PoisonNew(dst, n, t) equal to PoisonWith(dst, n, t, 0xAB) diff --git a/gnu/usr.bin/perl/pod/perlcommunity.pod b/gnu/usr.bin/perl/pod/perlcommunity.pod index 8430c3fcc10..96c7b85486e 100644 --- a/gnu/usr.bin/perl/pod/perlcommunity.pod +++ b/gnu/usr.bin/perl/pod/perlcommunity.pod @@ -14,15 +14,15 @@ evidence that the Perl users apply TMTOWTDI to all endeavors, not just programming. From websites, to IRC, to mailing lists, there is more than one way to get involved in the community. -=head2 Where to find the community +=head2 Where to Find the Community There is a central directory for the Perl community: L<http://perl.org> maintained by the Perl Foundation (L<http://www.perlfoundation.org/>), which tracks and provides services for a variety of other community sites. -=head2 Mailing lists and Newsgroups +=head2 Mailing Lists and Newsgroups -Perl runs on e-mail, there is no doubt about it. The Camel book was originally +Perl runs on e-mail; there is no doubt about it. The Camel book was originally written mostly over e-mail and today Perl's development is co-ordinated through mailing lists. The largest repository of Perl mailing lists is located at L<http://lists.perl.org>. @@ -41,7 +41,7 @@ The Perl community has a rather large IRC presence. For starters, it has its own IRC network, L<irc://irc.perl.org>. General (not help-oriented) chat can be found at L<irc://irc.perl.org/#perl>. Many other more specific chats are also hosted on the network. Information about irc.perl.org is located on the -network's website: L<http://www.irc.perl.org>. For a more help oriented #perl, +network's website: L<http://www.irc.perl.org>. For a more help-oriented #perl, check out L<irc://irc.freenode.net/#perl>. Perl 6 development also has a presence in L<irc://irc.freenode.net/#perl6>. 
Most Perl-related channels will be kind enough to point you in the right direction if you ask nicely. @@ -52,7 +52,7 @@ with varying activity levels. =head2 Websites Perl websites come in a variety of forms, but they fit into two large -categories: forums and news websites. There are many Perl related +categories: forums and news websites. There are many Perl-related websites, so only a few of the community's largest are mentioned here. =head3 News sites @@ -61,7 +61,7 @@ websites, so only a few of the community's largest are mentioned here. =item L<http://perl.com/> -Run by O'Reilly Media (The publisher of L<the Camel Book|perlbook> among other +Run by O'Reilly Media (the publisher of L<the Camel Book|perlbook>, among other Perl-related literature), perl.com provides current Perl news, articles, and resources for Perl developers as well as a directory of other useful websites. @@ -119,7 +119,7 @@ L<irc://irc.perl.org/#perl>. If you have never been to a hackathon, here are a few basic things you need to know before attending: have a working laptop and know how to use it; check out -the involved projects before hand; have the necessary version control client; +the involved projects beforehand; have the necessary version control client; and bring backup equipment (an extra LAN cable, additional power strips, etc.) because someone will forget. diff --git a/gnu/usr.bin/perl/pod/perldbmfilter.pod b/gnu/usr.bin/perl/pod/perldbmfilter.pod index e58ce2013be..0413bf95f8c 100644 --- a/gnu/usr.bin/perl/pod/perldbmfilter.pod +++ b/gnu/usr.bin/perl/pod/perldbmfilter.pod @@ -17,7 +17,7 @@ The four C<filter_*> methods shown above are available in all the DBM modules that ship with Perl, namely DB_File, GDBM_File, NDBM_File, ODBM_File and SDBM_File. -Each of the methods work identically, and are used to install (or +Each of the methods works identically, and is used to install (or uninstall) a single DBM Filter. 
The only difference between them is the place that the filter is installed. @@ -35,7 +35,6 @@ every time you write a key to a DBM database. If a filter has been installed with this method, it will be invoked every time you write a value to a DBM database. - =item B<filter_fetch_key> If a filter has been installed with this method, it will be invoked @@ -51,7 +50,7 @@ every time you read a value from a DBM database. You can use any combination of the methods from none to all four. All filter methods return the existing filter, if present, or C<undef> -in not. +if not. To delete a filter pass C<undef> to it. diff --git a/gnu/usr.bin/perl/pod/perldebguts.pod b/gnu/usr.bin/perl/pod/perldebguts.pod index 7319e748112..8ae6e7baa96 100644 --- a/gnu/usr.bin/perl/pod/perldebguts.pod +++ b/gnu/usr.bin/perl/pod/perldebguts.pod @@ -53,7 +53,7 @@ C<"$break_condition\0$action">. The same holds for evaluated strings that contain subroutines, or which are currently being executed. The $filename for C<eval>ed strings -looks like C<(eval 34)> or C<(re_eval 19)>. +looks like C<(eval 34)> or C<(re_eval 19)>. =item * @@ -151,9 +151,9 @@ after the debugger completes its own initialization.) After the rc file is read, the debugger reads the PERLDB_OPTS environment variable and uses it to set debugger options. The contents of this variable are treated as if they were the argument -of an C<o ...> debugger command (q.v. in L<perldebug/Options>). +of an C<o ...> debugger command (q.v. in L<perldebug/"Configurable Options">). -=head3 Debugger internal variables +=head3 Debugger Internal Variables In addition to the file and subroutine-related variables mentioned above, the debugger also maintains various magical internal variables. @@ -172,7 +172,7 @@ equal to zero only if the line is not breakable. 
=item * -C<%DB::dbline>, is an alias for C<%{"::_<current_file"}>, which +C<%DB::dbline> is an alias for C<%{"::_<current_file"}>, which contains breakpoints and actions keyed by line number in the currently-selected file, either explicitly chosen with the debugger's C<f> command, or implicitly by flow of execution. @@ -184,7 +184,7 @@ C<"$break_condition\0$action">. =back -=head3 Debugger customization functions +=head3 Debugger Customization Functions Some functions are provided to simplify customization. @@ -227,7 +227,7 @@ information. For example, contrast this expression trace: Loading DB routines from perl5db.pl patch level 0.94 Emacs support available. - Enter h or `h h' for help. + Enter h or 'h h' for help. main::(-e:1): 0 DB<1> sub foo { 14 } @@ -257,8 +257,8 @@ with this one, once the C<o>ption C<frame=2> has been set: By way of demonstration, we present below a laborious listing resulting from setting your C<PERLDB_OPTS> environment variable to the value C<f=n N>, and running I<perl -d -V> from the command line. -Examples use various values of C<n> are shown to give you a feel -for the difference between settings. Long those it may be, this +Examples using various values of C<n> are shown to give you a feel +for the difference between settings. Long though it may be, this is not a complete listing, but only excerpts. =over 4 @@ -397,7 +397,7 @@ When a package is compiled, a line like this is printed with proper indentation. -=head1 Debugging regular expressions +=head1 Debugging Regular Expressions There are two ways to enable debugging output for regular expressions. @@ -405,13 +405,14 @@ If your perl is compiled with C<-DDEBUGGING>, you may use the B<-Dr> flag on the command line. Otherwise, one can C<use re 'debug'>, which has effects at -compile time and run time. It is not lexically scoped. +compile time and run time. Since Perl 5.9.5, this pragma is lexically +scoped. 
-=head2 Compile-time output +=head2 Compile-time Output The debugging output at compile time looks like this: - Compiling REx `[bc]d(ef*g)+h[ij]k$' + Compiling REx '[bc]d(ef*g)+h[ij]k$' size 45 Got 364 bytes for offset annotations. first at 1 rarest char g at 0 @@ -432,8 +433,8 @@ The debugging output at compile time looks like this: 42: EXACT <k>(44) 44: EOL(45) 45: END(0) - anchored `de' at 1 floating `gh' at 3..2147483647 (checking floating) - stclass `ANYOF[bc]' minlen 7 + anchored 'de' at 1 floating 'gh' at 3..2147483647 (checking floating) + stclass 'ANYOF[bc]' minlen 7 Offsets: [45] 1[4] 0[0] 0[0] 0[0] 0[0] 0[0] 0[0] 0[0] 0[0] 0[0] 0[0] 5[1] 0[0] 12[1] 0[0] 6[1] 0[0] 7[1] 0[0] 9[1] 8[1] 0[0] 10[1] 0[0] @@ -449,8 +450,8 @@ label I<id> of the first node that does a match. The - anchored `de' at 1 floating `gh' at 3..2147483647 (checking floating) - stclass `ANYOF[bc]' minlen 7 + anchored 'de' at 1 floating 'gh' at 3..2147483647 (checking floating) + stclass 'ANYOF[bc]' minlen 7 line (split into two lines above) contains optimizer information. In the example shown, the optimizer found that the match @@ -513,13 +514,13 @@ C<(??{ code })>. =item C<anchored(TYPE)> -If the pattern may match only at a handful of places, (with C<TYPE> +If the pattern may match only at a handful of places, with C<TYPE> being C<BOL>, C<MBOL>, or C<GPOS>. See the table below. =back If a substring is known to match at end-of-line only, it may be -followed by C<$>, as in C<floating `k'$>. +followed by C<$>, as in C<floating 'k'$>. The optimizer-specific information is used to avoid entering (a slow) regex engine on strings that will not definitely match. If the C<isall> flag @@ -531,119 +532,240 @@ form of the regex. Each line has format C< >I<id>: I<TYPE> I<OPTIONAL-INFO> (I<next-id>) -=head2 Types of nodes +=head2 Types of Nodes Here are the possible types, with short descriptions: - # TYPE arg-description [num-args] [longjump-len] DESCRIPTION - - # Exit points - END no End of program. 
- SUCCEED no Return from a subroutine, basically. - - # Anchors: - BOL no Match "" at beginning of line. - MBOL no Same, assuming multiline. - SBOL no Same, assuming singleline. - EOS no Match "" at end of string. - EOL no Match "" at end of line. - MEOL no Same, assuming multiline. - SEOL no Same, assuming singleline. - BOUND no Match "" at any word boundary - BOUNDL no Match "" at any word boundary - NBOUND no Match "" at any word non-boundary - NBOUNDL no Match "" at any word non-boundary - GPOS no Matches where last m//g left off. - - # [Special] alternatives - ANY no Match any one character (except newline). - SANY no Match any one character. - ANYOF sv Match character in (or not in) this class. - ALNUM no Match any alphanumeric character - ALNUML no Match any alphanumeric char in locale - NALNUM no Match any non-alphanumeric character - NALNUML no Match any non-alphanumeric char in locale - SPACE no Match any whitespace character - SPACEL no Match any whitespace char in locale - NSPACE no Match any non-whitespace character - NSPACEL no Match any non-whitespace char in locale - DIGIT no Match any numeric character - NDIGIT no Match any non-numeric character - - # BRANCH The set of branches constituting a single choice are hooked - # together with their "next" pointers, since precedence prevents - # anything being concatenated to any individual branch. The - # "next" pointer of the last BRANCH in a choice points to the - # thing following the whole choice. This is also where the - # final "next" pointer of each individual branch points; each - # branch starts with the operand node of a BRANCH node. - # - BRANCH node Match this alternative, or the next... - - # BACK Normal "next" pointers all implicitly point forward; BACK - # exists to make loop structures possible. - # not used - BACK no Match "", "next" ptr points backward. - - # Literals - EXACT sv Match this string (preceded by length). - EXACTF sv Match this string, folded (prec. by length). 
- EXACTFL sv Match this string, folded in locale (w/len). - - # Do nothing - NOTHING no Match empty string. - # A variant of above which delimits a group, thus stops optimizations - TAIL no Match empty string. Can jump here from outside. - - # STAR,PLUS '?', and complex '*' and '+', are implemented as circular - # BRANCH structures using BACK. Simple cases (one character - # per match) are implemented with STAR and PLUS for speed - # and to minimize recursive plunges. - # - STAR node Match this (simple) thing 0 or more times. - PLUS node Match this (simple) thing 1 or more times. - - CURLY sv 2 Match this simple thing {n,m} times. - CURLYN no 2 Match next-after-this simple thing - # {n,m} times, set parens. - CURLYM no 2 Match this medium-complex thing {n,m} times. - CURLYX sv 2 Match this complex thing {n,m} times. - - # This terminator creates a loop structure for CURLYX - WHILEM no Do curly processing and see if rest matches. - - # OPEN,CLOSE,GROUPP ...are numbered at compile time. - OPEN num 1 Mark this point in input as start of #n. - CLOSE num 1 Analogous to OPEN. - - REF num 1 Match some already matched string - REFF num 1 Match already matched string, folded - REFFL num 1 Match already matched string, folded in loc. - - # grouping assertions - IFMATCH off 1 2 Succeeds if the following matches. - UNLESSM off 1 2 Fails if the following matches. - SUSPEND off 1 1 "Independent" sub-regex. - IFTHEN off 1 1 Switch, should be preceded by switcher . - GROUPP num 1 Whether the group matched. - - # Support for long regex - LONGJMP off 1 1 Jump far away. - BRANCHJ off 1 1 BRANCH with long offset. - - # The heavy worker - EVAL evl 1 Execute some Perl code. - - # Modifiers - MINMOD no Next operator is not greedy. - LOGICAL no Next opcode should set the flag only. - - # This is not used yet - RENUM off 1 1 Group with independently numbered parens. - - # This is not really a node, but an optimized away piece of a "long" node. 
- # To simplify debugging output, we mark it as if it were a node - OPTIMIZED off Placeholder for dump. + # TYPE arg-description [num-args] [longjump-len] DESCRIPTION + + # Exit points + END no End of program. + SUCCEED no Return from a subroutine, basically. + + # Anchors: + + BOL no Match "" at beginning of line. + MBOL no Same, assuming multiline. + SBOL no Same, assuming singleline. + EOS no Match "" at end of string. + EOL no Match "" at end of line. + MEOL no Same, assuming multiline. + SEOL no Same, assuming singleline. + BOUND no Match "" at any word boundary using native charset + semantics for non-utf8 + BOUNDL no Match "" at any locale word boundary + BOUNDU no Match "" at any word boundary using Unicode semantics + BOUNDA no Match "" at any word boundary using ASCII semantics + NBOUND no Match "" at any word non-boundary using native charset + semantics for non-utf8 + NBOUNDL no Match "" at any locale word non-boundary + NBOUNDU no Match "" at any word non-boundary using Unicode semantics + NBOUNDA no Match "" at any word non-boundary using ASCII semantics + GPOS no Matches where last m//g left off. + + # [Special] alternatives: + + REG_ANY no Match any one character (except newline). + SANY no Match any one character. + CANY no Match any one byte. 
+ ANYOF sv Match character in (or not in) this class, single char + match only + ANYOFV sv Match character in (or not in) this class, can + match-multiple chars + ALNUM no Match any alphanumeric character using native charset + semantics for non-utf8 + ALNUML no Match any alphanumeric char in locale + ALNUMU no Match any alphanumeric char using Unicode semantics + ALNUMA no Match [A-Za-z_0-9] + NALNUM no Match any non-alphanumeric character using native charset + semantics for non-utf8 + NALNUML no Match any non-alphanumeric char in locale + NALNUMU no Match any non-alphanumeric char using Unicode semantics + NALNUMA no Match [^A-Za-z_0-9] + SPACE no Match any whitespace character using native charset + semantics for non-utf8 + SPACEL no Match any whitespace char in locale + SPACEU no Match any whitespace char using Unicode semantics + SPACEA no Match [ \t\n\f\r] + NSPACE no Match any non-whitespace character using native charset + semantics for non-utf8 + NSPACEL no Match any non-whitespace char in locale + NSPACEU no Match any non-whitespace char using Unicode semantics + NSPACEA no Match [^ \t\n\f\r] + DIGIT no Match any numeric character using native charset semantics + for non-utf8 + DIGITL no Match any numeric character in locale + DIGITA no Match [0-9] + NDIGIT no Match any non-numeric character using native charset + semantics for non-utf8 + NDIGITL no Match any non-numeric character in locale + NDIGITA no Match [^0-9] + CLUMP no Match any extended grapheme cluster sequence + + # Alternation + + # BRANCH The set of branches constituting a single choice are hooked + # together with their "next" pointers, since precedence prevents + # anything being concatenated to any individual branch. The + # "next" pointer of the last BRANCH in a choice points to the + # thing following the whole choice. This is also where the + # final "next" pointer of each individual branch points; each + # branch starts with the operand node of a BRANCH node. 
+ # + BRANCH node Match this alternative, or the next... + + # Back pointer + + # BACK Normal "next" pointers all implicitly point forward; BACK + # exists to make loop structures possible. + # not used + BACK no Match "", "next" ptr points backward. + + # Literals + + EXACT str Match this string (preceded by length). + EXACTF str Match this string, folded, native charset semantics for + non-utf8 (prec. by length). + EXACTFL str Match this string, folded in locale (w/len). + EXACTFU str Match this string, folded, Unicode semantics for non-utf8 + (prec. by length). + EXACTFA str Match this string, folded, Unicode semantics for non-utf8, + but no ASCII-range character matches outside ASCII (prec. + by length). + + # Do nothing types + + NOTHING no Match empty string. + # A variant of above which delimits a group, thus stops optimizations + TAIL no Match empty string. Can jump here from outside. + + # Loops + + # STAR,PLUS '?', and complex '*' and '+', are implemented as circular + # BRANCH structures using BACK. Simple cases (one character + # per match) are implemented with STAR and PLUS for speed + # and to minimize recursive plunges. + # + STAR node Match this (simple) thing 0 or more times. + PLUS node Match this (simple) thing 1 or more times. + + CURLY sv 2 Match this simple thing {n,m} times. + CURLYN no 2 Capture next-after-this simple thing + CURLYM no 2 Capture this medium-complex thing {n,m} times. + CURLYX sv 2 Match this complex thing {n,m} times. + + # This terminator creates a loop structure for CURLYX + WHILEM no Do curly processing and see if rest matches. + + # Buffer related + + # OPEN,CLOSE,GROUPP ...are numbered at compile time. + OPEN num 1 Mark this point in input as start of #n. + CLOSE num 1 Analogous to OPEN. + + REF num 1 Match some already matched string + REFF num 1 Match already matched string, folded using native charset + semantics for non-utf8 + REFFL num 1 Match already matched string, folded in loc. 
+ REFFU num 1 Match already matched string, folded using unicode + semantics for non-utf8 + REFFA num 1 Match already matched string, folded using unicode + semantics for non-utf8, no mixing ASCII, non-ASCII + + # Named references. Code in regcomp.c assumes that these all are after the + # numbered references + NREF no-sv 1 Match some already matched string + NREFF no-sv 1 Match already matched string, folded using native charset + semantics for non-utf8 + NREFFL no-sv 1 Match already matched string, folded in loc. + NREFFU num 1 Match already matched string, folded using unicode + semantics for non-utf8 + NREFFA num 1 Match already matched string, folded using unicode + semantics for non-utf8, no mixing ASCII, non-ASCII + + IFMATCH off 1 2 Succeeds if the following matches. + UNLESSM off 1 2 Fails if the following matches. + SUSPEND off 1 1 "Independent" sub-RE. + IFTHEN off 1 1 Switch, should be preceded by switcher. + GROUPP num 1 Whether the group matched. + + # Support for long RE + + LONGJMP off 1 1 Jump far away. + BRANCHJ off 1 1 BRANCH with long offset. + + # The heavy worker + + EVAL evl 1 Execute some Perl code. + + # Modifiers + + MINMOD no Next operator is not greedy. + LOGICAL no Next opcode should set the flag only. + + # This is not used yet + RENUM off 1 1 Group with independently numbered parens. + + # Trie Related + + # Behave the same as A|LIST|OF|WORDS would. The '..C' variants have + # inline charclass data (ascii only), the 'C' store it in the structure. + # NOTE: the relative order of the TRIE-like regops is significant + + TRIE trie 1 Match many EXACT(F[ALU]?)? at once. flags==type + TRIEC charclass Same as TRIE, but with embedded charclass data + + # For start classes, contains an added fail table. + AHOCORASICK trie 1 Aho Corasick stclass. 
flags==type + AHOCORASICKC charclass Same as AHOCORASICK, but with embedded charclass data + + # Regex Subroutines + GOSUB num/ofs 2L recurse to paren arg1 at (signed) ofs arg2 + GOSTART no recurse to start of pattern + + # Special conditionals + NGROUPP no-sv 1 Whether the group matched. + INSUBP num 1 Whether we are in a specific recurse. + DEFINEP none 1 Never execute directly. + + # Backtracking Verbs + ENDLIKE none Used only for the type field of verbs + OPFAIL none Same as (?!) + ACCEPT parno 1 Accepts the current matched string. + + + # Verbs With Arguments + VERB no-sv 1 Used only for the type field of verbs + PRUNE no-sv 1 Pattern fails at this startpoint if no-backtracking through this + MARKPOINT no-sv 1 Push the current location for rollback by cut. + SKIP no-sv 1 On failure skip forward (to the mark) before retrying + COMMIT no-sv 1 Pattern fails outright if backtracking through this + CUTGROUP no-sv 1 On failure go to the next alternation in the group + + # Control what to keep in $&. + KEEPS no $& begins here. + + # New charclass like patterns + LNBREAK none generic newline pattern + VERTWS none vertical whitespace (Perl 6) + NVERTWS none not vertical whitespace (Perl 6) + HORIZWS none horizontal whitespace (Perl 6) + NHORIZWS none not horizontal whitespace (Perl 6) + + FOLDCHAR codepoint 1 codepoint with tricky case folding properties. + + # SPECIAL REGOPS + + # This is not really a node, but an optimized away piece of a "long" node. + # To simplify debugging output, we mark it as if it were a node + OPTIMIZED off Placeholder for dump. + + # Special opcode with the property that no opcode in a compiled program + # will ever be of this type. Thus it can be used as a flag value that + # no other opcode has been seen. END is used similarly, in that an END + # node can't be optimized. So END implies "unoptimizable" and PSEUDO means + # "not seen anything to optimize yet". + PSEUDO off Pseudo opcode for internal use. 
=for unprinted-credits Next section M-J. Dominus (mjd-perl-patch+@plover.com) 20010421 @@ -675,7 +797,7 @@ is, it corresponds to the C<+> symbol in the precompiled regex. C<0[0]> items indicate that there is no corresponding node. -=head2 Run-time output +=head2 Run-time Output First of all, when doing a match, one may get no run-time output even if debugging is enabled. This means that the regex engine was never @@ -683,7 +805,7 @@ entered and that all of the job was therefore done by the optimizer. If the regex engine was entered, the output may look like this: - Matching `[bc]d(ef*g)+h[ij]k$' against `abcdefg__gh__' + Matching '[bc]d(ef*g)+h[ij]k$' against 'abcdefg__gh__' Setting an EVAL scope, savestack=3 2 <ab> <cdefg__gh_> | 1: ANYOF 3 <abc> <defg__gh_> | 11: EXACT <d> @@ -718,7 +840,7 @@ C< >I<STRING-OFFSET> <I<PRE-STRING>> <I<POST-STRING>> |I<ID>: I<TYPE> The I<TYPE> info is indented with respect to the backtracking level. Other incidental information appears interspersed within. -=head1 Debugging Perl memory usage +=head1 Debugging Perl Memory Usage Perl is a profligate wastrel when it comes to memory use. There is a saying that to estimate memory usage of Perl, assume a reasonable @@ -754,7 +876,7 @@ The B<-DL> command-line switch is obsolete since circa Perl 5.6.0 The switch was used to track Perl's memory allocations and possible memory leaks. These days the use of malloc debugging tools like F<Purify> or F<valgrind> is suggested instead. See also -L<perlhack/PERL_MEM_LOG>. +L<perlhacktips/PERL_MEM_LOG>. One way to find out how much memory is being used by Perl data structures is to install the Devel::Size module from CPAN: it gives @@ -763,7 +885,7 @@ structure. Please be mindful of the difference between the size() and total_size(). If Perl has been compiled using Perl's malloc you can analyze Perl -memory usage by setting the $ENV{PERL_DEBUG_MSTATS}. +memory usage by setting $ENV{PERL_DEBUG_MSTATS}. 
=head2 Using C<$ENV{PERL_DEBUG_MSTATS}> @@ -811,7 +933,7 @@ would have usable size 8188, and the memory footprint would be 8192. In a Perl built for debugging, some buckets may have negative usable size. This means that these buckets cannot (and will not) be used. For larger buckets, the memory footprint may be one page greater -than a power of 2. If so, case the corresponding power of two is +than a power of 2. If so, the corresponding power of two is printed in the C<APPROX> field above. =item Free/Used @@ -829,7 +951,7 @@ were free: 8 16 32 64 128 256 512 1024 2048 4096 8192 4 12 24 48 80 -With non-C<DEBUGGING> perl, the buckets starting from C<128> have +With a non-C<DEBUGGING> perl, the buckets starting from C<128> have a 4-byte overhead, and thus an 8192-long bucket may take up to 8188-byte allocations. diff --git a/gnu/usr.bin/perl/pod/perldebtut.pod b/gnu/usr.bin/perl/pod/perldebtut.pod index b10f9b4066e..cc4f5051e17 100644 --- a/gnu/usr.bin/perl/pod/perldebtut.pod +++ b/gnu/usr.bin/perl/pod/perldebtut.pod @@ -512,7 +512,7 @@ using the list 'L' command: 17: print "$out $deg\n"; break if (1) -Note that to delete a breakpoint you use 'd' or 'D'. +Note that to delete a breakpoint you use 'B'. Now we'll continue down into our subroutine, this time rather than by line number, we'll use the subroutine name, followed by the now familiar 'v': @@ -702,7 +702,6 @@ place to go), and of course, experiment. 
L<perldebug>, L<perldebguts>, L<perldiag>, -L<dprofpp>, L<perlrun> diff --git a/gnu/usr.bin/perl/pod/perldtrace.pod b/gnu/usr.bin/perl/pod/perldtrace.pod new file mode 100644 index 00000000000..39551e17490 --- /dev/null +++ b/gnu/usr.bin/perl/pod/perldtrace.pod @@ -0,0 +1,179 @@ +=head1 NAME + +perldtrace - Perl's support for DTrace + +=head1 SYNOPSIS + + # dtrace -Zn 'perl::sub-entry, perl::sub-return { trace(copyinstr(arg0)) }' + dtrace: description 'perl::sub-entry, perl::sub-return ' matched 10 probes + + # perl -E 'sub outer { inner(@_) } sub inner { say shift } outer("hello")' + hello + + (dtrace output) + CPU ID FUNCTION:NAME + 0 75915 Perl_pp_entersub:sub-entry BEGIN + 0 75915 Perl_pp_entersub:sub-entry import + 0 75922 Perl_pp_leavesub:sub-return import + 0 75922 Perl_pp_leavesub:sub-return BEGIN + 0 75915 Perl_pp_entersub:sub-entry outer + 0 75915 Perl_pp_entersub:sub-entry inner + 0 75922 Perl_pp_leavesub:sub-return inner + 0 75922 Perl_pp_leavesub:sub-return outer + +=head1 DESCRIPTION + +DTrace is a framework for comprehensive system- and application-level +tracing. Perl is a DTrace I<provider>, meaning it exposes several +I<probes> for instrumentation. You can use these in conjunction +with kernel-level probes, as well as probes from other providers +such as MySQL, in order to diagnose software defects, or even just +your application's bottlenecks. + +Perl must be compiled with the C<-Dusedtrace> option in order to +make use of the provided probes. While DTrace aims to have no +overhead when its instrumentation is not active, Perl's support +itself cannot uphold that guarantee, so it is built without DTrace +probes under most systems. One notable exception is that Mac OS X +ships a F</usr/bin/perl> with DTrace support enabled. + +=head1 HISTORY + +=over 4 + +=item 5.10.1 + +Perl's initial DTrace support was added, providing C<sub-entry> and +C<sub-return> probes. 
+ +=item 5.14.0 + +The C<sub-entry> and C<sub-return> probes gain a fourth argument: the +package name of the function. + +=item 5.16.0 + +The C<phase-change> probe was added. + +=back + +=head1 PROBES + +=over 4 + +=item sub-entry(SUBNAME, FILE, LINE, PACKAGE) + +Traces the entry of any subroutine. Note that all of the variables +refer to the subroutine that is being invoked; there is currently +no way to get ahold of any information about the subroutine's +I<caller> from a DTrace action. + + :*perl*::sub-entry { + printf("%s::%s entered at %s line %d\n", + copyinstr(arg3), copyinstr(arg0), copyinstr(arg1), arg2); + } + +=item sub-return(SUBNAME, FILE, LINE, PACKAGE) + +Traces the exit of any subroutine. Note that all of the variables +refer to the subroutine that is returning; there is currently no +way to get ahold of any information about the subroutine's I<caller> +from a DTrace action. + + :*perl*::sub-return { + printf("%s::%s returned at %s line %d\n", + copyinstr(arg3), copyinstr(arg0), copyinstr(arg1), arg2); + } + +=item phase-change(NEWPHASE, OLDPHASE) + +Traces changes to Perl's interpreter state. You can internalize this +as tracing changes to Perl's C<${^GLOBAL_PHASE}> variable, especially +since the values for C<NEWPHASE> and C<OLDPHASE> are the strings that +C<${^GLOBAL_PHASE}> reports. 
+ + :*perl*::phase-change { + printf("Phase changed from %s to %s\n", + copyinstr(arg1), copyinstr(arg0)); + } + +=back + +=head1 EXAMPLES + +=over 4 + +=item Most frequently called functions + + # dtrace -qZn 'sub-entry { @[strjoin(strjoin(copyinstr(arg3),"::"),copyinstr(arg0))] = count() } END {trunc(@, 10)}' + + Class::MOP::Attribute::slots 400 + Try::Tiny::catch 411 + Try::Tiny::try 411 + Class::MOP::Instance::inline_slot_access 451 + Class::MOP::Class::Immutable::Trait:::around 472 + Class::MOP::Mixin::AttributeCore::has_initializer 496 + Class::MOP::Method::Wrapped::__ANON__ 544 + Class::MOP::Package::_package_stash 737 + Class::MOP::Class::initialize 1128 + Class::MOP::get_metaclass_by_name 1204 + +=item Trace function calls + + # dtrace -qFZn 'sub-entry, sub-return { trace(copyinstr(arg0)) }' + + 0 -> Perl_pp_entersub BEGIN + 0 <- Perl_pp_leavesub BEGIN + 0 -> Perl_pp_entersub BEGIN + 0 -> Perl_pp_entersub import + 0 <- Perl_pp_leavesub import + 0 <- Perl_pp_leavesub BEGIN + 0 -> Perl_pp_entersub BEGIN + 0 -> Perl_pp_entersub dress + 0 <- Perl_pp_leavesub dress + 0 -> Perl_pp_entersub dirty + 0 <- Perl_pp_leavesub dirty + 0 -> Perl_pp_entersub whiten + 0 <- Perl_pp_leavesub whiten + 0 <- Perl_dounwind BEGIN + +=item Function calls during interpreter cleanup + + # dtrace -Zn 'phase-change /copyinstr(arg0) == "END"/ { self->ending = 1 } sub-entry /self->ending/ { trace(copyinstr(arg0)) }' + + CPU ID FUNCTION:NAME + 1 77214 Perl_pp_entersub:sub-entry END + 1 77214 Perl_pp_entersub:sub-entry END + 1 77214 Perl_pp_entersub:sub-entry cleanup + 1 77214 Perl_pp_entersub:sub-entry _force_writable + 1 77214 Perl_pp_entersub:sub-entry _force_writable + +=item System calls at compile time + + # dtrace -qZn 'phase-change /copyinstr(arg0) == "START"/ { self->interesting = 1 } phase-change /copyinstr(arg0) == "RUN"/ { self->interesting = 0 } syscall::: /self->interesting/ { @[probefunc] = count() } END { trunc(@, 3) }' + + lseek 310 + read 374 + stat64 1056 + +=back + 
+=head1 REFERENCES + +=over 4 + +=item DTrace User Guide + +L<http://download.oracle.com/docs/cd/E19082-01/819-3620/index.html> + +=item DTrace: Dynamic Tracing in Oracle Solaris, Mac OS X and FreeBSD + +L<http://www.amazon.com/DTrace-Dynamic-Tracing-Solaris-FreeBSD/dp/0132091518/> + +=back + +=head1 AUTHORS + +Shawn M Moore C<sartak@gmail.com> + +=cut diff --git a/gnu/usr.bin/perl/pod/perlebcdic.pod b/gnu/usr.bin/perl/pod/perlebcdic.pod index 28d47b9551d..ecd0676415f 100644 --- a/gnu/usr.bin/perl/pod/perlebcdic.pod +++ b/gnu/usr.bin/perl/pod/perlebcdic.pod @@ -1,3 +1,5 @@ +=encoding utf8 + =head1 NAME perlebcdic - Considerations for running Perl on EBCDIC platforms @@ -6,7 +8,7 @@ perlebcdic - Considerations for running Perl on EBCDIC platforms An exploration of some of the issues facing Perl programmers on EBCDIC based computers. We do not cover localization, -internationalization, or multi byte character set issues other +internationalization, or multi-byte character set issues other than some discussion of UTF-8 and UTF-EBCDIC. Portions that are still incomplete are marked with XXX. @@ -24,7 +26,7 @@ set of integers running from 0 to 127 (decimal) that imply character interpretation by the display and other systems of computers. The range 0..127 can be covered by setting the bits in a 7-bit binary -digit, hence the set is sometimes referred to as a "7-bit ASCII". +digit, hence the set is sometimes referred to as "7-bit ASCII". ASCII was described by the American National Standards Institute document ANSI X3.4-1986. It was also described by ISO 646:1991 (with localization for currency symbols). The full ASCII set is @@ -61,18 +63,16 @@ also known as CCSID 819 (or sometimes 0819 or even 00819). 
=head2 EBCDIC The Extended Binary Coded Decimal Interchange Code refers to a -large collection of single and multi byte coded character sets that are +large collection of single- and multi-byte coded character sets that are different from ASCII or ISO 8859-1 and are all slightly different from each other; they typically run on host computers. The EBCDIC encodings derive from -8 bit byte extensions of Hollerith punched card encodings. The layout on the +8-bit byte extensions of Hollerith punched card encodings. The layout on the cards was such that high bits were set for the upper and lower case alphabet characters [a-z] and [A-Z], but there were gaps within each Latin alphabet range. Some IBM EBCDIC character sets may be known by character code set -identification numbers (CCSID numbers) or code page numbers. Leading -zero digits in CCSID numbers within this document are insignificant. -E.g. CCSID 0037 may be referred to as 37 in places. +identification numbers (CCSID numbers) or code page numbers. Perl can be compiled on platforms that run any of three commonly used EBCDIC character sets, listed below. @@ -97,7 +97,7 @@ They are: Character code set ID 0037 is a mapping of the ASCII plus Latin-1 characters (i.e. ISO 8859-1) to an EBCDIC set. 0037 is used in North American English locales on the OS/400 operating system -that runs on AS/400 computers. CCSID 37 differs from ISO 8859-1 +that runs on AS/400 computers. CCSID 0037 differs from ISO 8859-1 in 237 places, in other words they agree on only 19 code point values. =head2 1047 @@ -124,8 +124,8 @@ The problem is: which code points to use for code points less than 256? In EBCDIC, for the low 256 the EBCDIC code points are used. This means that the equivalences - pack("U", ord($character)) eq $character - unpack("U", $character) == ord $character + pack("U", ord($character)) eq $character + unpack("U", $character) == ord $character will hold. 
(If Unicode code points were applied consistently over all the possible code points, pack("U",ord("A")) would in EBCDIC @@ -182,23 +182,23 @@ to translate from EBCDIC to Latin-1 code points. Encode knows about more EBCDIC character sets than Perl can currently be compiled to run on. - use Encode 'from_to'; + use Encode 'from_to'; - my %ebcdic = ( 176 => 'cp37', 95 => 'cp1047', 106 => 'posix-bc' ); + my %ebcdic = ( 176 => 'cp37', 95 => 'cp1047', 106 => 'posix-bc' ); - # $a is in EBCDIC code points - from_to($a, $ebcdic{ord '^'}, 'latin1'); - # $a is ISO 8859-1 code points + # $a is in EBCDIC code points + from_to($a, $ebcdic{ord '^'}, 'latin1'); + # $a is ISO 8859-1 code points and from Latin-1 code points to EBCDIC code points - use Encode 'from_to'; + use Encode 'from_to'; - my %ebcdic = ( 176 => 'cp37', 95 => 'cp1047', 106 => 'posix-bc' ); + my %ebcdic = ( 176 => 'cp37', 95 => 'cp1047', 106 => 'posix-bc' ); - # $a is ISO 8859-1 code points - from_to($a, 'latin1', $ebcdic{ord '^'}); - # $a is in EBCDIC code points + # $a is ISO 8859-1 code points + from_to($a, 'latin1', $ebcdic{ord '^'}); + # $a is in EBCDIC code points For doing I/O it is suggested that you use the autotranslating features of PerlIO, see L<perluniintro>. @@ -216,7 +216,7 @@ you to use different encodings per IO channel. 
For example you may use open($f, ">:encoding(utf8)", "test.utf8"); print $f "Hello World!\n"; -to get four files containing "Hello World!\n" in ASCII, CP 37 EBCDIC, +to get four files containing "Hello World!\n" in ASCII, CP 0037 EBCDIC, ISO 8859-1 (Latin-1) (in this example identical to ASCII since only ASCII characters were printed), and UTF-EBCDIC (in this example identical to normal EBCDIC since only characters @@ -236,10 +236,11 @@ extensions to ASCII have been labelled with character names roughly corresponding to I<The Unicode Standard, Version 3.0> albeit with substitutions such as s/LATIN// and s/VULGAR// in all cases, s/CAPITAL LETTER// in some cases, and s/SMALL LETTER ([A-Z])/\l$1/ -in some other cases (the C<charnames> pragma names unfortunately do -not list explicit names for the C0 or C1 control characters). The -"names" of the C1 control set (128..159 in ISO 8859-1) listed here are -somewhat arbitrary. The differences between the 0037 and 1047 sets are +in some other cases. The "names" of the controls listed here are +the Unicode Version 1 names, except for the few that don't have names, in which +case the names in the Wikipedia article were used +(L<http://en.wikipedia.org/wiki/C0_and_C1_control_codes>). +The differences between the 0037 and 1047 sets are flagged with ***. The differences between the 1047 and POSIX-BC sets are flagged with ###. All ord() numbers listed are decimal. 
If you would rather see this table listing octal values then run the table @@ -252,8 +253,9 @@ work with a pod2_other_format translation) through: =back - perl -ne 'if(/(.{33})(\d+)\s+(\d+)\s+(\d+)\s+(\d+)/)' \ - -e '{printf("%s%-9o%-9o%-9o%o\n",$1,$2,$3,$4,$5)}' perlebcdic.pod + perl -ne 'if(/(.{43})(\d+)\s+(\d+)\s+(\d+)\s+(\d+)/)' \ + -e '{printf("%s%-9.03o%-9.03o%-9.03o%.03o\n",$1,$2,$3,$4,$5)}' \ + perlebcdic.pod If you want to retain the UTF-x code points then in script form you might want to write: @@ -264,20 +266,25 @@ might want to write: =back - open(FH,"<perlebcdic.pod") or die "Could not open perlebcdic.pod: $!"; - while (<FH>) { - if (/(.{33})(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\.?(\d*)\s+(\d+)\.?(\d*)/) { - if ($7 ne '' && $9 ne '') { - printf("%s%-9o%-9o%-9o%-9o%-3o.%-5o%-3o.%o\n",$1,$2,$3,$4,$5,$6,$7,$8,$9); - } - elsif ($7 ne '') { - printf("%s%-9o%-9o%-9o%-9o%-3o.%-5o%o\n",$1,$2,$3,$4,$5,$6,$7,$8); - } - else { - printf("%s%-9o%-9o%-9o%-9o%-9o%o\n",$1,$2,$3,$4,$5,$6,$8); - } - } - } + open(FH,"<perlebcdic.pod") or die "Could not open perlebcdic.pod: $!"; + while (<FH>) { + if (/(.{43})(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\.?(\d*)\s+(\d+)\.?(\d*)/) + { + if ($7 ne '' && $9 ne '') { + printf( + "%s%-9.03o%-9.03o%-9.03o%-9.03o%-3o.%-5o%-3o.%.03o\n", + $1,$2,$3,$4,$5,$6,$7,$8,$9); + } + elsif ($7 ne '') { + printf("%s%-9.03o%-9.03o%-9.03o%-9.03o%-3o.%-5o%.03o\n", + $1,$2,$3,$4,$5,$6,$7,$8); + } + else { + printf("%s%-9.03o%-9.03o%-9.03o%-9.03o%-9.03o%.03o\n", + $1,$2,$3,$4,$5,$6,$8); + } + } + } If you would rather see this table listing hexadecimal values then run the table through: @@ -288,8 +295,9 @@ run the table through: =back - perl -ne 'if(/(.{33})(\d+)\s+(\d+)\s+(\d+)\s+(\d+)/)' \ - -e '{printf("%s%-9X%-9X%-9X%X\n",$1,$2,$3,$4,$5)}' perlebcdic.pod + perl -ne 'if(/(.{43})(\d+)\s+(\d+)\s+(\d+)\s+(\d+)/)' \ + -e '{printf("%s%-9.02X%-9.02X%-9.02X%.02X\n",$1,$2,$3,$4,$5)}' \ + perlebcdic.pod Or, in order to retain the UTF-x code points in 
hexadecimal: @@ -299,282 +307,286 @@ Or, in order to retain the UTF-x code points in hexadecimal: =back - open(FH,"<perlebcdic.pod") or die "Could not open perlebcdic.pod: $!"; - while (<FH>) { - if (/(.{33})(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\.?(\d*)\s+(\d+)\.?(\d*)/) { - if ($7 ne '' && $9 ne '') { - printf("%s%-9X%-9X%-9X%-9X%-2X.%-6X%-2X.%X\n",$1,$2,$3,$4,$5,$6,$7,$8,$9); - } - elsif ($7 ne '') { - printf("%s%-9X%-9X%-9X%-9X%-2X.%-6X%X\n",$1,$2,$3,$4,$5,$6,$7,$8); - } - else { - printf("%s%-9X%-9X%-9X%-9X%-9X%X\n",$1,$2,$3,$4,$5,$6,$8); - } - } - } - - - incomp- incomp- - 8859-1 lete lete - chr 0819 0037 1047 POSIX-BC UTF-8 UTF-EBCDIC - ------------------------------------------------------------------------------------ - <NULL> 0 0 0 0 0 0 - <START OF HEADING> 1 1 1 1 1 1 - <START OF TEXT> 2 2 2 2 2 2 - <END OF TEXT> 3 3 3 3 3 3 - <END OF TRANSMISSION> 4 55 55 55 4 55 - <ENQUIRY> 5 45 45 45 5 45 - <ACKNOWLEDGE> 6 46 46 46 6 46 - <BELL> 7 47 47 47 7 47 - <BACKSPACE> 8 22 22 22 8 22 - <HORIZONTAL TABULATION> 9 5 5 5 9 5 - <LINE FEED> 10 37 21 21 10 21 *** - <VERTICAL TABULATION> 11 11 11 11 11 11 - <FORM FEED> 12 12 12 12 12 12 - <CARRIAGE RETURN> 13 13 13 13 13 13 - <SHIFT OUT> 14 14 14 14 14 14 - <SHIFT IN> 15 15 15 15 15 15 - <DATA LINK ESCAPE> 16 16 16 16 16 16 - <DEVICE CONTROL ONE> 17 17 17 17 17 17 - <DEVICE CONTROL TWO> 18 18 18 18 18 18 - <DEVICE CONTROL THREE> 19 19 19 19 19 19 - <DEVICE CONTROL FOUR> 20 60 60 60 20 60 - <NEGATIVE ACKNOWLEDGE> 21 61 61 61 21 61 - <SYNCHRONOUS IDLE> 22 50 50 50 22 50 - <END OF TRANSMISSION BLOCK> 23 38 38 38 23 38 - <CANCEL> 24 24 24 24 24 24 - <END OF MEDIUM> 25 25 25 25 25 25 - <SUBSTITUTE> 26 63 63 63 26 63 - <ESCAPE> 27 39 39 39 27 39 - <FILE SEPARATOR> 28 28 28 28 28 28 - <GROUP SEPARATOR> 29 29 29 29 29 29 - <RECORD SEPARATOR> 30 30 30 30 30 30 - <UNIT SEPARATOR> 31 31 31 31 31 31 - <SPACE> 32 64 64 64 32 64 - ! 
33 90 90 90 33 90 - " 34 127 127 127 34 127 - # 35 123 123 123 35 123 - $ 36 91 91 91 36 91 - % 37 108 108 108 37 108 - & 38 80 80 80 38 80 - ' 39 125 125 125 39 125 - ( 40 77 77 77 40 77 - ) 41 93 93 93 41 93 - * 42 92 92 92 42 92 - + 43 78 78 78 43 78 - , 44 107 107 107 44 107 - - 45 96 96 96 45 96 - . 46 75 75 75 46 75 - / 47 97 97 97 47 97 - 0 48 240 240 240 48 240 - 1 49 241 241 241 49 241 - 2 50 242 242 242 50 242 - 3 51 243 243 243 51 243 - 4 52 244 244 244 52 244 - 5 53 245 245 245 53 245 - 6 54 246 246 246 54 246 - 7 55 247 247 247 55 247 - 8 56 248 248 248 56 248 - 9 57 249 249 249 57 249 - : 58 122 122 122 58 122 - ; 59 94 94 94 59 94 - < 60 76 76 76 60 76 - = 61 126 126 126 61 126 - > 62 110 110 110 62 110 - ? 63 111 111 111 63 111 - @ 64 124 124 124 64 124 - A 65 193 193 193 65 193 - B 66 194 194 194 66 194 - C 67 195 195 195 67 195 - D 68 196 196 196 68 196 - E 69 197 197 197 69 197 - F 70 198 198 198 70 198 - G 71 199 199 199 71 199 - H 72 200 200 200 72 200 - I 73 201 201 201 73 201 - J 74 209 209 209 74 209 - K 75 210 210 210 75 210 - L 76 211 211 211 76 211 - M 77 212 212 212 77 212 - N 78 213 213 213 78 213 - O 79 214 214 214 79 214 - P 80 215 215 215 80 215 - Q 81 216 216 216 81 216 - R 82 217 217 217 82 217 - S 83 226 226 226 83 226 - T 84 227 227 227 84 227 - U 85 228 228 228 85 228 - V 86 229 229 229 86 229 - W 87 230 230 230 87 230 - X 88 231 231 231 88 231 - Y 89 232 232 232 89 232 - Z 90 233 233 233 90 233 - [ 91 186 173 187 91 173 *** ### - \ 92 224 224 188 92 224 ### - ] 93 187 189 189 93 189 *** - ^ 94 176 95 106 94 95 *** ### - _ 95 109 109 109 95 109 - ` 96 121 121 74 96 121 ### - a 97 129 129 129 97 129 - b 98 130 130 130 98 130 - c 99 131 131 131 99 131 - d 100 132 132 132 100 132 - e 101 133 133 133 101 133 - f 102 134 134 134 102 134 - g 103 135 135 135 103 135 - h 104 136 136 136 104 136 - i 105 137 137 137 105 137 - j 106 145 145 145 106 145 - k 107 146 146 146 107 146 - l 108 147 147 147 108 147 - m 109 148 148 148 109 148 - n 
110 149 149 149 110 149 - o 111 150 150 150 111 150 - p 112 151 151 151 112 151 - q 113 152 152 152 113 152 - r 114 153 153 153 114 153 - s 115 162 162 162 115 162 - t 116 163 163 163 116 163 - u 117 164 164 164 117 164 - v 118 165 165 165 118 165 - w 119 166 166 166 119 166 - x 120 167 167 167 120 167 - y 121 168 168 168 121 168 - z 122 169 169 169 122 169 - { 123 192 192 251 123 192 ### - | 124 79 79 79 124 79 - } 125 208 208 253 125 208 ### - ~ 126 161 161 255 126 161 ### - <DELETE> 127 7 7 7 127 7 - <C1 0> 128 32 32 32 194.128 32 - <C1 1> 129 33 33 33 194.129 33 - <C1 2> 130 34 34 34 194.130 34 - <C1 3> 131 35 35 35 194.131 35 - <C1 4> 132 36 36 36 194.132 36 - <C1 5> 133 21 37 37 194.133 37 *** - <C1 6> 134 6 6 6 194.134 6 - <C1 7> 135 23 23 23 194.135 23 - <C1 8> 136 40 40 40 194.136 40 - <C1 9> 137 41 41 41 194.137 41 - <C1 10> 138 42 42 42 194.138 42 - <C1 11> 139 43 43 43 194.139 43 - <C1 12> 140 44 44 44 194.140 44 - <C1 13> 141 9 9 9 194.141 9 - <C1 14> 142 10 10 10 194.142 10 - <C1 15> 143 27 27 27 194.143 27 - <C1 16> 144 48 48 48 194.144 48 - <C1 17> 145 49 49 49 194.145 49 - <C1 18> 146 26 26 26 194.146 26 - <C1 19> 147 51 51 51 194.147 51 - <C1 20> 148 52 52 52 194.148 52 - <C1 21> 149 53 53 53 194.149 53 - <C1 22> 150 54 54 54 194.150 54 - <C1 23> 151 8 8 8 194.151 8 - <C1 24> 152 56 56 56 194.152 56 - <C1 25> 153 57 57 57 194.153 57 - <C1 26> 154 58 58 58 194.154 58 - <C1 27> 155 59 59 59 194.155 59 - <C1 28> 156 4 4 4 194.156 4 - <C1 29> 157 20 20 20 194.157 20 - <C1 30> 158 62 62 62 194.158 62 - <C1 31> 159 255 255 95 194.159 255 ### - <NON-BREAKING SPACE> 160 65 65 65 194.160 128.65 - <INVERTED EXCLAMATION MARK> 161 170 170 170 194.161 128.66 - <CENT SIGN> 162 74 74 176 194.162 128.67 ### - <POUND SIGN> 163 177 177 177 194.163 128.68 - <CURRENCY SIGN> 164 159 159 159 194.164 128.69 - <YEN SIGN> 165 178 178 178 194.165 128.70 - <BROKEN BAR> 166 106 106 208 194.166 128.71 ### - <SECTION SIGN> 167 181 181 181 194.167 128.72 - <DIAERESIS> 168 189 
187 121 194.168 128.73 *** ### - <COPYRIGHT SIGN> 169 180 180 180 194.169 128.74 - <FEMININE ORDINAL INDICATOR> 170 154 154 154 194.170 128.81 - <LEFT POINTING GUILLEMET> 171 138 138 138 194.171 128.82 - <NOT SIGN> 172 95 176 186 194.172 128.83 *** ### - <SOFT HYPHEN> 173 202 202 202 194.173 128.84 - <REGISTERED TRADE MARK SIGN> 174 175 175 175 194.174 128.85 - <MACRON> 175 188 188 161 194.175 128.86 ### - <DEGREE SIGN> 176 144 144 144 194.176 128.87 - <PLUS-OR-MINUS SIGN> 177 143 143 143 194.177 128.88 - <SUPERSCRIPT TWO> 178 234 234 234 194.178 128.89 - <SUPERSCRIPT THREE> 179 250 250 250 194.179 128.98 - <ACUTE ACCENT> 180 190 190 190 194.180 128.99 - <MICRO SIGN> 181 160 160 160 194.181 128.100 - <PARAGRAPH SIGN> 182 182 182 182 194.182 128.101 - <MIDDLE DOT> 183 179 179 179 194.183 128.102 - <CEDILLA> 184 157 157 157 194.184 128.103 - <SUPERSCRIPT ONE> 185 218 218 218 194.185 128.104 - <MASC. ORDINAL INDICATOR> 186 155 155 155 194.186 128.105 - <RIGHT POINTING GUILLEMET> 187 139 139 139 194.187 128.106 - <FRACTION ONE QUARTER> 188 183 183 183 194.188 128.112 - <FRACTION ONE HALF> 189 184 184 184 194.189 128.113 - <FRACTION THREE QUARTERS> 190 185 185 185 194.190 128.114 - <INVERTED QUESTION MARK> 191 171 171 171 194.191 128.115 - <A WITH GRAVE> 192 100 100 100 195.128 138.65 - <A WITH ACUTE> 193 101 101 101 195.129 138.66 - <A WITH CIRCUMFLEX> 194 98 98 98 195.130 138.67 - <A WITH TILDE> 195 102 102 102 195.131 138.68 - <A WITH DIAERESIS> 196 99 99 99 195.132 138.69 - <A WITH RING ABOVE> 197 103 103 103 195.133 138.70 - <CAPITAL LIGATURE AE> 198 158 158 158 195.134 138.71 - <C WITH CEDILLA> 199 104 104 104 195.135 138.72 - <E WITH GRAVE> 200 116 116 116 195.136 138.73 - <E WITH ACUTE> 201 113 113 113 195.137 138.74 - <E WITH CIRCUMFLEX> 202 114 114 114 195.138 138.81 - <E WITH DIAERESIS> 203 115 115 115 195.139 138.82 - <I WITH GRAVE> 204 120 120 120 195.140 138.83 - <I WITH ACUTE> 205 117 117 117 195.141 138.84 - <I WITH CIRCUMFLEX> 206 118 118 118 195.142 
138.85 - <I WITH DIAERESIS> 207 119 119 119 195.143 138.86 - <CAPITAL LETTER ETH> 208 172 172 172 195.144 138.87 - <N WITH TILDE> 209 105 105 105 195.145 138.88 - <O WITH GRAVE> 210 237 237 237 195.146 138.89 - <O WITH ACUTE> 211 238 238 238 195.147 138.98 - <O WITH CIRCUMFLEX> 212 235 235 235 195.148 138.99 - <O WITH TILDE> 213 239 239 239 195.149 138.100 - <O WITH DIAERESIS> 214 236 236 236 195.150 138.101 - <MULTIPLICATION SIGN> 215 191 191 191 195.151 138.102 - <O WITH STROKE> 216 128 128 128 195.152 138.103 - <U WITH GRAVE> 217 253 253 224 195.153 138.104 ### - <U WITH ACUTE> 218 254 254 254 195.154 138.105 - <U WITH CIRCUMFLEX> 219 251 251 221 195.155 138.106 ### - <U WITH DIAERESIS> 220 252 252 252 195.156 138.112 - <Y WITH ACUTE> 221 173 186 173 195.157 138.113 *** ### - <CAPITAL LETTER THORN> 222 174 174 174 195.158 138.114 - <SMALL LETTER SHARP S> 223 89 89 89 195.159 138.115 - <a WITH GRAVE> 224 68 68 68 195.160 139.65 - <a WITH ACUTE> 225 69 69 69 195.161 139.66 - <a WITH CIRCUMFLEX> 226 66 66 66 195.162 139.67 - <a WITH TILDE> 227 70 70 70 195.163 139.68 - <a WITH DIAERESIS> 228 67 67 67 195.164 139.69 - <a WITH RING ABOVE> 229 71 71 71 195.165 139.70 - <SMALL LIGATURE ae> 230 156 156 156 195.166 139.71 - <c WITH CEDILLA> 231 72 72 72 195.167 139.72 - <e WITH GRAVE> 232 84 84 84 195.168 139.73 - <e WITH ACUTE> 233 81 81 81 195.169 139.74 - <e WITH CIRCUMFLEX> 234 82 82 82 195.170 139.81 - <e WITH DIAERESIS> 235 83 83 83 195.171 139.82 - <i WITH GRAVE> 236 88 88 88 195.172 139.83 - <i WITH ACUTE> 237 85 85 85 195.173 139.84 - <i WITH CIRCUMFLEX> 238 86 86 86 195.174 139.85 - <i WITH DIAERESIS> 239 87 87 87 195.175 139.86 - <SMALL LETTER eth> 240 140 140 140 195.176 139.87 - <n WITH TILDE> 241 73 73 73 195.177 139.88 - <o WITH GRAVE> 242 205 205 205 195.178 139.89 - <o WITH ACUTE> 243 206 206 206 195.179 139.98 - <o WITH CIRCUMFLEX> 244 203 203 203 195.180 139.99 - <o WITH TILDE> 245 207 207 207 195.181 139.100 - <o WITH DIAERESIS> 246 204 204 204 
195.182 139.101 - <DIVISION SIGN> 247 225 225 225 195.183 139.102 - <o WITH STROKE> 248 112 112 112 195.184 139.103 - <u WITH GRAVE> 249 221 221 192 195.185 139.104 ### - <u WITH ACUTE> 250 222 222 222 195.186 139.105 - <u WITH CIRCUMFLEX> 251 219 219 219 195.187 139.106 - <u WITH DIAERESIS> 252 220 220 220 195.188 139.112 - <y WITH ACUTE> 253 141 141 141 195.189 139.113 - <SMALL LETTER thorn> 254 142 142 142 195.190 139.114 - <y WITH DIAERESIS> 255 223 223 223 195.191 139.115 + open(FH,"<perlebcdic.pod") or die "Could not open perlebcdic.pod: $!"; + while (<FH>) { + if (/(.{43})(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\.?(\d*)\s+(\d+)\.?(\d*)/) + { + if ($7 ne '' && $9 ne '') { + printf( + "%s%-9.02X%-9.02X%-9.02X%-9.02X%-2X.%-6.02X%02X.%02X\n", + $1,$2,$3,$4,$5,$6,$7,$8,$9); + } + elsif ($7 ne '') { + printf("%s%-9.02X%-9.02X%-9.02X%-9.02X%-2X.%-6.02X%02X\n", + $1,$2,$3,$4,$5,$6,$7,$8); + } + else { + printf("%s%-9.02X%-9.02X%-9.02X%-9.02X%-9.02X%02X\n", + $1,$2,$3,$4,$5,$6,$8); + } + } + } + + + ISO 8859-1 CCSID CCSID CCSID 1047 + chr CCSID 0819 0037 1047 POSIX-BC UTF-8 UTF-EBCDIC + ---------------------------------------------------------------------------------------------- + <NULL> 0 0 0 0 0 0 + <START OF HEADING> 1 1 1 1 1 1 + <START OF TEXT> 2 2 2 2 2 2 + <END OF TEXT> 3 3 3 3 3 3 + <END OF TRANSMISSION> 4 55 55 55 4 55 + <ENQUIRY> 5 45 45 45 5 45 + <ACKNOWLEDGE> 6 46 46 46 6 46 + <BELL> 7 47 47 47 7 47 + <BACKSPACE> 8 22 22 22 8 22 + <HORIZONTAL TABULATION> 9 5 5 5 9 5 + <LINE FEED> 10 37 21 21 10 21 *** + <VERTICAL TABULATION> 11 11 11 11 11 11 + <FORM FEED> 12 12 12 12 12 12 + <CARRIAGE RETURN> 13 13 13 13 13 13 + <SHIFT OUT> 14 14 14 14 14 14 + <SHIFT IN> 15 15 15 15 15 15 + <DATA LINK ESCAPE> 16 16 16 16 16 16 + <DEVICE CONTROL ONE> 17 17 17 17 17 17 + <DEVICE CONTROL TWO> 18 18 18 18 18 18 + <DEVICE CONTROL THREE> 19 19 19 19 19 19 + <DEVICE CONTROL FOUR> 20 60 60 60 20 60 + <NEGATIVE ACKNOWLEDGE> 21 61 61 61 21 61 + <SYNCHRONOUS IDLE> 22 50 50 50 22 50 + 
<END OF TRANSMISSION BLOCK> 23 38 38 38 23 38 + <CANCEL> 24 24 24 24 24 24 + <END OF MEDIUM> 25 25 25 25 25 25 + <SUBSTITUTE> 26 63 63 63 26 63 + <ESCAPE> 27 39 39 39 27 39 + <FILE SEPARATOR> 28 28 28 28 28 28 + <GROUP SEPARATOR> 29 29 29 29 29 29 + <RECORD SEPARATOR> 30 30 30 30 30 30 + <UNIT SEPARATOR> 31 31 31 31 31 31 + <SPACE> 32 64 64 64 32 64 + ! 33 90 90 90 33 90 + " 34 127 127 127 34 127 + # 35 123 123 123 35 123 + $ 36 91 91 91 36 91 + % 37 108 108 108 37 108 + & 38 80 80 80 38 80 + ' 39 125 125 125 39 125 + ( 40 77 77 77 40 77 + ) 41 93 93 93 41 93 + * 42 92 92 92 42 92 + + 43 78 78 78 43 78 + , 44 107 107 107 44 107 + - 45 96 96 96 45 96 + . 46 75 75 75 46 75 + / 47 97 97 97 47 97 + 0 48 240 240 240 48 240 + 1 49 241 241 241 49 241 + 2 50 242 242 242 50 242 + 3 51 243 243 243 51 243 + 4 52 244 244 244 52 244 + 5 53 245 245 245 53 245 + 6 54 246 246 246 54 246 + 7 55 247 247 247 55 247 + 8 56 248 248 248 56 248 + 9 57 249 249 249 57 249 + : 58 122 122 122 58 122 + ; 59 94 94 94 59 94 + < 60 76 76 76 60 76 + = 61 126 126 126 61 126 + > 62 110 110 110 62 110 + ? 
63 111 111 111 63 111 + @ 64 124 124 124 64 124 + A 65 193 193 193 65 193 + B 66 194 194 194 66 194 + C 67 195 195 195 67 195 + D 68 196 196 196 68 196 + E 69 197 197 197 69 197 + F 70 198 198 198 70 198 + G 71 199 199 199 71 199 + H 72 200 200 200 72 200 + I 73 201 201 201 73 201 + J 74 209 209 209 74 209 + K 75 210 210 210 75 210 + L 76 211 211 211 76 211 + M 77 212 212 212 77 212 + N 78 213 213 213 78 213 + O 79 214 214 214 79 214 + P 80 215 215 215 80 215 + Q 81 216 216 216 81 216 + R 82 217 217 217 82 217 + S 83 226 226 226 83 226 + T 84 227 227 227 84 227 + U 85 228 228 228 85 228 + V 86 229 229 229 86 229 + W 87 230 230 230 87 230 + X 88 231 231 231 88 231 + Y 89 232 232 232 89 232 + Z 90 233 233 233 90 233 + [ 91 186 173 187 91 173 *** ### + \ 92 224 224 188 92 224 ### + ] 93 187 189 189 93 189 *** + ^ 94 176 95 106 94 95 *** ### + _ 95 109 109 109 95 109 + ` 96 121 121 74 96 121 ### + a 97 129 129 129 97 129 + b 98 130 130 130 98 130 + c 99 131 131 131 99 131 + d 100 132 132 132 100 132 + e 101 133 133 133 101 133 + f 102 134 134 134 102 134 + g 103 135 135 135 103 135 + h 104 136 136 136 104 136 + i 105 137 137 137 105 137 + j 106 145 145 145 106 145 + k 107 146 146 146 107 146 + l 108 147 147 147 108 147 + m 109 148 148 148 109 148 + n 110 149 149 149 110 149 + o 111 150 150 150 111 150 + p 112 151 151 151 112 151 + q 113 152 152 152 113 152 + r 114 153 153 153 114 153 + s 115 162 162 162 115 162 + t 116 163 163 163 116 163 + u 117 164 164 164 117 164 + v 118 165 165 165 118 165 + w 119 166 166 166 119 166 + x 120 167 167 167 120 167 + y 121 168 168 168 121 168 + z 122 169 169 169 122 169 + { 123 192 192 251 123 192 ### + | 124 79 79 79 124 79 + } 125 208 208 253 125 208 ### + ~ 126 161 161 255 126 161 ### + <DELETE> 127 7 7 7 127 7 + <PADDING CHARACTER> 128 32 32 32 194.128 32 + <HIGH OCTET PRESET> 129 33 33 33 194.129 33 + <BREAK PERMITTED HERE> 130 34 34 34 194.130 34 + <NO BREAK HERE> 131 35 35 35 194.131 35 + <INDEX> 132 36 36 36 194.132 36 + <NEXT 
LINE> 133 21 37 37 194.133 37 *** + <START OF SELECTED AREA> 134 6 6 6 194.134 6 + <END OF SELECTED AREA> 135 23 23 23 194.135 23 + <CHARACTER TABULATION SET> 136 40 40 40 194.136 40 + <CHARACTER TABULATION WITH JUSTIFICATION> 137 41 41 41 194.137 41 + <LINE TABULATION SET> 138 42 42 42 194.138 42 + <PARTIAL LINE FORWARD> 139 43 43 43 194.139 43 + <PARTIAL LINE BACKWARD> 140 44 44 44 194.140 44 + <REVERSE LINE FEED> 141 9 9 9 194.141 9 + <SINGLE SHIFT TWO> 142 10 10 10 194.142 10 + <SINGLE SHIFT THREE> 143 27 27 27 194.143 27 + <DEVICE CONTROL STRING> 144 48 48 48 194.144 48 + <PRIVATE USE ONE> 145 49 49 49 194.145 49 + <PRIVATE USE TWO> 146 26 26 26 194.146 26 + <SET TRANSMIT STATE> 147 51 51 51 194.147 51 + <CANCEL CHARACTER> 148 52 52 52 194.148 52 + <MESSAGE WAITING> 149 53 53 53 194.149 53 + <START OF GUARDED AREA> 150 54 54 54 194.150 54 + <END OF GUARDED AREA> 151 8 8 8 194.151 8 + <START OF STRING> 152 56 56 56 194.152 56 + <SINGLE GRAPHIC CHARACTER INTRODUCER> 153 57 57 57 194.153 57 + <SINGLE CHARACTER INTRODUCER> 154 58 58 58 194.154 58 + <CONTROL SEQUENCE INTRODUCER> 155 59 59 59 194.155 59 + <STRING TERMINATOR> 156 4 4 4 194.156 4 + <OPERATING SYSTEM COMMAND> 157 20 20 20 194.157 20 + <PRIVACY MESSAGE> 158 62 62 62 194.158 62 + <APPLICATION PROGRAM COMMAND> 159 255 255 95 194.159 255 ### + <NON-BREAKING SPACE> 160 65 65 65 194.160 128.65 + <INVERTED EXCLAMATION MARK> 161 170 170 170 194.161 128.66 + <CENT SIGN> 162 74 74 176 194.162 128.67 ### + <POUND SIGN> 163 177 177 177 194.163 128.68 + <CURRENCY SIGN> 164 159 159 159 194.164 128.69 + <YEN SIGN> 165 178 178 178 194.165 128.70 + <BROKEN BAR> 166 106 106 208 194.166 128.71 ### + <SECTION SIGN> 167 181 181 181 194.167 128.72 + <DIAERESIS> 168 189 187 121 194.168 128.73 *** ### + <COPYRIGHT SIGN> 169 180 180 180 194.169 128.74 + <FEMININE ORDINAL INDICATOR> 170 154 154 154 194.170 128.81 + <LEFT POINTING GUILLEMET> 171 138 138 138 194.171 128.82 + <NOT SIGN> 172 95 176 186 194.172 128.83 *** ### + 
<SOFT HYPHEN> 173 202 202 202 194.173 128.84 + <REGISTERED TRADE MARK SIGN> 174 175 175 175 194.174 128.85 + <MACRON> 175 188 188 161 194.175 128.86 ### + <DEGREE SIGN> 176 144 144 144 194.176 128.87 + <PLUS-OR-MINUS SIGN> 177 143 143 143 194.177 128.88 + <SUPERSCRIPT TWO> 178 234 234 234 194.178 128.89 + <SUPERSCRIPT THREE> 179 250 250 250 194.179 128.98 + <ACUTE ACCENT> 180 190 190 190 194.180 128.99 + <MICRO SIGN> 181 160 160 160 194.181 128.100 + <PARAGRAPH SIGN> 182 182 182 182 194.182 128.101 + <MIDDLE DOT> 183 179 179 179 194.183 128.102 + <CEDILLA> 184 157 157 157 194.184 128.103 + <SUPERSCRIPT ONE> 185 218 218 218 194.185 128.104 + <MASC. ORDINAL INDICATOR> 186 155 155 155 194.186 128.105 + <RIGHT POINTING GUILLEMET> 187 139 139 139 194.187 128.106 + <FRACTION ONE QUARTER> 188 183 183 183 194.188 128.112 + <FRACTION ONE HALF> 189 184 184 184 194.189 128.113 + <FRACTION THREE QUARTERS> 190 185 185 185 194.190 128.114 + <INVERTED QUESTION MARK> 191 171 171 171 194.191 128.115 + <A WITH GRAVE> 192 100 100 100 195.128 138.65 + <A WITH ACUTE> 193 101 101 101 195.129 138.66 + <A WITH CIRCUMFLEX> 194 98 98 98 195.130 138.67 + <A WITH TILDE> 195 102 102 102 195.131 138.68 + <A WITH DIAERESIS> 196 99 99 99 195.132 138.69 + <A WITH RING ABOVE> 197 103 103 103 195.133 138.70 + <CAPITAL LIGATURE AE> 198 158 158 158 195.134 138.71 + <C WITH CEDILLA> 199 104 104 104 195.135 138.72 + <E WITH GRAVE> 200 116 116 116 195.136 138.73 + <E WITH ACUTE> 201 113 113 113 195.137 138.74 + <E WITH CIRCUMFLEX> 202 114 114 114 195.138 138.81 + <E WITH DIAERESIS> 203 115 115 115 195.139 138.82 + <I WITH GRAVE> 204 120 120 120 195.140 138.83 + <I WITH ACUTE> 205 117 117 117 195.141 138.84 + <I WITH CIRCUMFLEX> 206 118 118 118 195.142 138.85 + <I WITH DIAERESIS> 207 119 119 119 195.143 138.86 + <CAPITAL LETTER ETH> 208 172 172 172 195.144 138.87 + <N WITH TILDE> 209 105 105 105 195.145 138.88 + <O WITH GRAVE> 210 237 237 237 195.146 138.89 + <O WITH ACUTE> 211 238 238 238 195.147 138.98 
+ <O WITH CIRCUMFLEX> 212 235 235 235 195.148 138.99 + <O WITH TILDE> 213 239 239 239 195.149 138.100 + <O WITH DIAERESIS> 214 236 236 236 195.150 138.101 + <MULTIPLICATION SIGN> 215 191 191 191 195.151 138.102 + <O WITH STROKE> 216 128 128 128 195.152 138.103 + <U WITH GRAVE> 217 253 253 224 195.153 138.104 ### + <U WITH ACUTE> 218 254 254 254 195.154 138.105 + <U WITH CIRCUMFLEX> 219 251 251 221 195.155 138.106 ### + <U WITH DIAERESIS> 220 252 252 252 195.156 138.112 + <Y WITH ACUTE> 221 173 186 173 195.157 138.113 *** ### + <CAPITAL LETTER THORN> 222 174 174 174 195.158 138.114 + <SMALL LETTER SHARP S> 223 89 89 89 195.159 138.115 + <a WITH GRAVE> 224 68 68 68 195.160 139.65 + <a WITH ACUTE> 225 69 69 69 195.161 139.66 + <a WITH CIRCUMFLEX> 226 66 66 66 195.162 139.67 + <a WITH TILDE> 227 70 70 70 195.163 139.68 + <a WITH DIAERESIS> 228 67 67 67 195.164 139.69 + <a WITH RING ABOVE> 229 71 71 71 195.165 139.70 + <SMALL LIGATURE ae> 230 156 156 156 195.166 139.71 + <c WITH CEDILLA> 231 72 72 72 195.167 139.72 + <e WITH GRAVE> 232 84 84 84 195.168 139.73 + <e WITH ACUTE> 233 81 81 81 195.169 139.74 + <e WITH CIRCUMFLEX> 234 82 82 82 195.170 139.81 + <e WITH DIAERESIS> 235 83 83 83 195.171 139.82 + <i WITH GRAVE> 236 88 88 88 195.172 139.83 + <i WITH ACUTE> 237 85 85 85 195.173 139.84 + <i WITH CIRCUMFLEX> 238 86 86 86 195.174 139.85 + <i WITH DIAERESIS> 239 87 87 87 195.175 139.86 + <SMALL LETTER eth> 240 140 140 140 195.176 139.87 + <n WITH TILDE> 241 73 73 73 195.177 139.88 + <o WITH GRAVE> 242 205 205 205 195.178 139.89 + <o WITH ACUTE> 243 206 206 206 195.179 139.98 + <o WITH CIRCUMFLEX> 244 203 203 203 195.180 139.99 + <o WITH TILDE> 245 207 207 207 195.181 139.100 + <o WITH DIAERESIS> 246 204 204 204 195.182 139.101 + <DIVISION SIGN> 247 225 225 225 195.183 139.102 + <o WITH STROKE> 248 112 112 112 195.184 139.103 + <u WITH GRAVE> 249 221 221 192 195.185 139.104 ### + <u WITH ACUTE> 250 222 222 222 195.186 139.105 + <u WITH CIRCUMFLEX> 251 219 219 219 195.187 
139.106 + <u WITH DIAERESIS> 252 220 220 220 195.188 139.112 + <y WITH ACUTE> 253 141 141 141 195.189 139.113 + <SMALL LETTER thorn> 254 142 142 142 195.190 139.114 + <y WITH DIAERESIS> 255 223 223 223 195.191 139.115 If you would rather see the above table in CCSID 0037 order rather than ASCII + Latin-1 order then run the table through: @@ -585,14 +597,15 @@ ASCII + Latin-1 order then run the table through: =back - perl -ne 'if(/.{33}\d{1,3}\s{6,8}\d{1,3}\s{6,8}\d{1,3}\s{6,8}\d{1,3}/)'\ + perl \ + -ne 'if(/.{43}\d{1,3}\s{6,8}\d{1,3}\s{6,8}\d{1,3}\s{6,8}\d{1,3}/)'\ -e '{push(@l,$_)}' \ -e 'END{print map{$_->[0]}' \ -e ' sort{$a->[1] <=> $b->[1]}' \ - -e ' map{[$_,substr($_,42,3)]}@l;}' perlebcdic.pod + -e ' map{[$_,substr($_,52,3)]}@l;}' perlebcdic.pod -If you would rather see it in CCSID 1047 order then change the digit -42 in the last line to 51, like this: +If you would rather see it in CCSID 1047 order then change the number +52 in the last line to 61, like this: =over 4 @@ -600,14 +613,15 @@ If you would rather see it in CCSID 1047 order then change the digit =back - perl -ne 'if(/.{33}\d{1,3}\s{6,8}\d{1,3}\s{6,8}\d{1,3}\s{6,8}\d{1,3}/)'\ - -e '{push(@l,$_)}' \ - -e 'END{print map{$_->[0]}' \ - -e ' sort{$a->[1] <=> $b->[1]}' \ - -e ' map{[$_,substr($_,51,3)]}@l;}' perlebcdic.pod + perl \ + -ne 'if(/.{43}\d{1,3}\s{6,8}\d{1,3}\s{6,8}\d{1,3}\s{6,8}\d{1,3}/)'\ + -e '{push(@l,$_)}' \ + -e 'END{print map{$_->[0]}' \ + -e ' sort{$a->[1] <=> $b->[1]}' \ + -e ' map{[$_,substr($_,61,3)]}@l;}' perlebcdic.pod -If you would rather see it in POSIX-BC order then change the digit -51 in the last line to 60, like this: +If you would rather see it in POSIX-BC order then change the number +61 in the last line to 70, like this: =over 4 @@ -615,11 +629,12 @@ If you would rather see it in POSIX-BC order then change the digit =back - perl -ne 'if(/.{33}\d{1,3}\s{6,8}\d{1,3}\s{6,8}\d{1,3}\s{6,8}\d{1,3}/)'\ + perl \ + -ne 
'if(/.{43}\d{1,3}\s{6,8}\d{1,3}\s{6,8}\d{1,3}\s{6,8}\d{1,3}/)'\ -e '{push(@l,$_)}' \ -e 'END{print map{$_->[0]}' \ -e ' sort{$a->[1] <=> $b->[1]}' \ - -e ' map{[$_,substr($_,60,3)]}@l;}' perlebcdic.pod + -e ' map{[$_,substr($_,70,3)]}@l;}' perlebcdic.pod =head1 IDENTIFYING CHARACTER CODE SETS @@ -658,7 +673,7 @@ However, it would be unwise to write tests such as: Obviously the first of these will fail to distinguish most ASCII platforms from either a CCSID 0037, a 1047, or a POSIX-BC EBCDIC platform since "\r" eq chr(13) under all of those coded character sets. But note too that -because "\n" is chr(13) and "\r" is chr(10) on the MacIntosh (which is an +because "\n" is chr(13) and "\r" is chr(10) on the Macintosh (which is an ASCII platform) the second C<$is_ascii> test will lead to trouble there. To determine whether or not perl was built under an EBCDIC @@ -674,44 +689,49 @@ code page you can use the Config module like so: In order to convert a string of characters from one character set to another a simple list of numbers, such as in the right columns in the above table, along with perl's tr/// operator is all that is needed. -The data in the table are in ASCII order hence the EBCDIC columns -provide easy to use ASCII to EBCDIC operations that are also easily +The data in the table are in ASCII/Latin1 order, hence the EBCDIC columns +provide easy-to-use ASCII/Latin1 to EBCDIC operations that are also easily reversed. -For example, to convert ASCII to code page 037 take the output of the second -column from the output of recipe 0 (modified to add \\ characters) and use -it in tr/// like so: +For example, to convert ASCII/Latin1 to code page 037 take the output of the +second numbers column from the output of recipe 2 (modified to add '\' +characters) and use it in tr/// like so: $cp_037 = - '\000\001\002\003\234\011\206\177\227\215\216\013\014\015\016\017' . - '\020\021\022\023\235\205\010\207\030\031\222\217\034\035\036\037' . 
- '\200\201\202\203\204\012\027\033\210\211\212\213\214\005\006\007' . - '\220\221\026\223\224\225\226\004\230\231\232\233\024\025\236\032' . - '\040\240\342\344\340\341\343\345\347\361\242\056\074\050\053\174' . - '\046\351\352\353\350\355\356\357\354\337\041\044\052\051\073\254' . - '\055\057\302\304\300\301\303\305\307\321\246\054\045\137\076\077' . - '\370\311\312\313\310\315\316\317\314\140\072\043\100\047\075\042' . - '\330\141\142\143\144\145\146\147\150\151\253\273\360\375\376\261' . - '\260\152\153\154\155\156\157\160\161\162\252\272\346\270\306\244' . - '\265\176\163\164\165\166\167\170\171\172\241\277\320\335\336\256' . - '\136\243\245\267\251\247\266\274\275\276\133\135\257\250\264\327' . - '\173\101\102\103\104\105\106\107\110\111\255\364\366\362\363\365' . - '\175\112\113\114\115\116\117\120\121\122\271\373\374\371\372\377' . - '\134\367\123\124\125\126\127\130\131\132\262\324\326\322\323\325' . - '\060\061\062\063\064\065\066\067\070\071\263\333\334\331\332\237' ; + '\x00\x01\x02\x03\x37\x2D\x2E\x2F\x16\x05\x25\x0B\x0C\x0D\x0E\x0F' . + '\x10\x11\x12\x13\x3C\x3D\x32\x26\x18\x19\x3F\x27\x1C\x1D\x1E\x1F' . + '\x40\x5A\x7F\x7B\x5B\x6C\x50\x7D\x4D\x5D\x5C\x4E\x6B\x60\x4B\x61' . + '\xF0\xF1\xF2\xF3\xF4\xF5\xF6\xF7\xF8\xF9\x7A\x5E\x4C\x7E\x6E\x6F' . + '\x7C\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xD1\xD2\xD3\xD4\xD5\xD6' . + '\xD7\xD8\xD9\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xBA\xE0\xBB\xB0\x6D' . + '\x79\x81\x82\x83\x84\x85\x86\x87\x88\x89\x91\x92\x93\x94\x95\x96' . + '\x97\x98\x99\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xC0\x4F\xD0\xA1\x07' . + '\x20\x21\x22\x23\x24\x15\x06\x17\x28\x29\x2A\x2B\x2C\x09\x0A\x1B' . + '\x30\x31\x1A\x33\x34\x35\x36\x08\x38\x39\x3A\x3B\x04\x14\x3E\xFF' . + '\x41\xAA\x4A\xB1\x9F\xB2\x6A\xB5\xBD\xB4\x9A\x8A\x5F\xCA\xAF\xBC' . + '\x90\x8F\xEA\xFA\xBE\xA0\xB6\xB3\x9D\xDA\x9B\x8B\xB7\xB8\xB9\xAB' . + '\x64\x65\x62\x66\x63\x67\x9E\x68\x74\x71\x72\x73\x78\x75\x76\x77' . + '\xAC\x69\xED\xEE\xEB\xEF\xEC\xBF\x80\xFD\xFE\xFB\xFC\xAD\xAE\x59' . 
+ '\x44\x45\x42\x46\x43\x47\x9C\x48\x54\x51\x52\x53\x58\x55\x56\x57' . + '\x8C\x49\xCD\xCE\xCB\xCF\xCC\xE1\x70\xDD\xDE\xDB\xDC\x8D\x8E\xDF'; my $ebcdic_string = $ascii_string; - eval '$ebcdic_string =~ tr/' . $cp_037 . '/\000-\377/'; + eval '$ebcdic_string =~ tr/\000-\377/' . $cp_037 . '/'; To convert from EBCDIC 037 to ASCII just reverse the order of the tr/// arguments like so: my $ascii_string = $ebcdic_string; - eval '$ascii_string =~ tr/\000-\377/' . $cp_037 . '/'; + eval '$ascii_string =~ tr/' . $cp_037 . '/\000-\377/'; + +Similarly one could take the output of the third numbers column from recipe 2 +to obtain a C<$cp_1047> table. The fourth numbers column of the output from +recipe 2 could provide a C<$cp_posix_bc> table suitable for transcoding as +well. -Similarly one could take the output of the third column from recipe 0 to -obtain a C<$cp_1047> table. The fourth column of the output from recipe -0 could provide a C<$cp_posix_bc> table suitable for transcoding as well. +If you wanted to see the inverse tables, you would first have to sort on the +desired numbers column as in recipes 4, 5 or 6, then take the output of the +first numbers column. =head2 iconv @@ -730,11 +750,11 @@ or the inverse map: # OS/390 or z/OS example $ebcdic_data = `echo '$ascii_data'| iconv -f ISO8859-1 -t IBM-1047` -For other perl based conversion options see the Convert::* modules on CPAN. +For other perl-based conversion options see the Convert::* modules on CPAN. =head2 C RTL -The OS/390 and z/OS C run time libraries provide _atoe() and _etoa() functions. +The OS/390 and z/OS C run-time libraries provide _atoe() and _etoa() functions. =head1 OPERATOR DIFFERENCES @@ -758,58 +778,55 @@ an example adapted from the one in L<perlop>: An interesting property of the 32 C0 control characters in the ASCII table is that they can "literally" be constructed -as control characters in perl, e.g. C<(chr(0) eq "\c@")> -C<(chr(1) eq "\cA")>, and so on. 
Perl on EBCDIC platforms has been -ported to take "\c@" to chr(0) and "\cA" to chr(1) as well, but the +as control characters in perl, e.g. C<(chr(0)> eq C<\c@>)> +C<(chr(1)> eq C<\cA>)>, and so on. Perl on EBCDIC platforms has been +ported to take C<\c@> to chr(0) and C<\cA> to chr(1), etc. as well, but the thirty three characters that result depend on which code page you are -using. The table below uses the character names from the previous table -but with substitutions such as s/START OF/S.O./; s/END OF /E.O./; -s/TRANSMISSION/TRANS./; s/TABULATION/TAB./; s/VERTICAL/VERT./; -s/HORIZONTAL/HORIZ./; s/DEVICE CONTROL/D.C./; s/SEPARATOR/SEP./; -s/NEGATIVE ACKNOWLEDGE/NEG. ACK./;. The POSIX-BC and 1047 sets are +using. The table below uses the standard acronyms for the controls. +The POSIX-BC and 1047 sets are identical throughout this range and differ from the 0037 set at only one spot (21 decimal). Note that the C<LINE FEED> character -may be generated by "\cJ" on ASCII platforms but by "\cU" on 1047 or POSIX-BC +may be generated by C<\cJ> on ASCII platforms but by C<\cU> on 1047 or POSIX-BC platforms and cannot be generated as a C<"\c.letter."> control character on -0037 platforms. Note also that "\c\\" maps to two characters -not one. - - chr ord 8859-1 0037 1047 && POSIX-BC - ------------------------------------------------------------------------ - "\c?" 127 <DELETE> " " ***>< - "\c@" 0 <NULL> <NULL> <NULL> ***>< - "\cA" 1 <S.O. HEADING> <S.O. HEADING> <S.O. HEADING> - "\cB" 2 <S.O. TEXT> <S.O. TEXT> <S.O. TEXT> - "\cC" 3 <E.O. TEXT> <E.O. TEXT> <E.O. TEXT> - "\cD" 4 <E.O. TRANS.> <C1 28> <C1 28> - "\cE" 5 <ENQUIRY> <HORIZ. TAB.> <HORIZ. TAB.> - "\cF" 6 <ACKNOWLEDGE> <C1 6> <C1 6> - "\cG" 7 <BELL> <DELETE> <DELETE> - "\cH" 8 <BACKSPACE> <C1 23> <C1 23> - "\cI" 9 <HORIZ. TAB.> <C1 13> <C1 13> - "\cJ" 10 <LINE FEED> <C1 14> <C1 14> - "\cK" 11 <VERT. TAB.> <VERT. TAB.> <VERT. 
TAB.> - "\cL" 12 <FORM FEED> <FORM FEED> <FORM FEED> - "\cM" 13 <CARRIAGE RETURN> <CARRIAGE RETURN> <CARRIAGE RETURN> - "\cN" 14 <SHIFT OUT> <SHIFT OUT> <SHIFT OUT> - "\cO" 15 <SHIFT IN> <SHIFT IN> <SHIFT IN> - "\cP" 16 <DATA LINK ESCAPE> <DATA LINK ESCAPE> <DATA LINK ESCAPE> - "\cQ" 17 <D.C. ONE> <D.C. ONE> <D.C. ONE> - "\cR" 18 <D.C. TWO> <D.C. TWO> <D.C. TWO> - "\cS" 19 <D.C. THREE> <D.C. THREE> <D.C. THREE> - "\cT" 20 <D.C. FOUR> <C1 29> <C1 29> - "\cU" 21 <NEG. ACK.> <C1 5> <LINE FEED> *** - "\cV" 22 <SYNCHRONOUS IDLE> <BACKSPACE> <BACKSPACE> - "\cW" 23 <E.O. TRANS. BLOCK> <C1 7> <C1 7> - "\cX" 24 <CANCEL> <CANCEL> <CANCEL> - "\cY" 25 <E.O. MEDIUM> <E.O. MEDIUM> <E.O. MEDIUM> - "\cZ" 26 <SUBSTITUTE> <C1 18> <C1 18> - "\c[" 27 <ESCAPE> <C1 15> <C1 15> - "\c\\" 28 <FILE SEP.>\ <FILE SEP.>\ <FILE SEP.>\ - "\c]" 29 <GROUP SEP.> <GROUP SEP.> <GROUP SEP.> - "\c^" 30 <RECORD SEP.> <RECORD SEP.> <RECORD SEP.> ***>< - "\c_" 31 <UNIT SEP.> <UNIT SEP.> <UNIT SEP.> ***>< - +0037 platforms. Note also that C<\c\> cannot be the final element in a string +or regex, as it will absorb the terminator. But C<\c\I<X>> is a C<FILE +SEPARATOR> concatenated with I<X> for all I<X>. + + chr ord 8859-1 0037 1047 && POSIX-BC + ----------------------------------------------------------------------- + \c? 
127 <DEL> " " + \c@ 0 <NUL> <NUL> <NUL> + \cA 1 <SOH> <SOH> <SOH> + \cB 2 <STX> <STX> <STX> + \cC 3 <ETX> <ETX> <ETX> + \cD 4 <EOT> <ST> <ST> + \cE 5 <ENQ> <HT> <HT> + \cF 6 <ACK> <SSA> <SSA> + \cG 7 <BEL> <DEL> <DEL> + \cH 8 <BS> <EPA> <EPA> + \cI 9 <HT> <RI> <RI> + \cJ 10 <LF> <SS2> <SS2> + \cK 11 <VT> <VT> <VT> + \cL 12 <FF> <FF> <FF> + \cM 13 <CR> <CR> <CR> + \cN 14 <SO> <SO> <SO> + \cO 15 <SI> <SI> <SI> + \cP 16 <DLE> <DLE> <DLE> + \cQ 17 <DC1> <DC1> <DC1> + \cR 18 <DC2> <DC2> <DC2> + \cS 19 <DC3> <DC3> <DC3> + \cT 20 <DC4> <OSC> <OSC> + \cU 21 <NAK> <NEL> <LF> *** + \cV 22 <SYN> <BS> <BS> + \cW 23 <ETB> <ESA> <ESA> + \cX 24 <CAN> <CAN> <CAN> + \cY 25 <EOM> <EOM> <EOM> + \cZ 26 <SUB> <PU2> <PU2> + \c[ 27 <ESC> <SS3> <SS3> + \c\X 28 <FS>X <FS>X <FS>X + \c] 29 <GS> <GS> <GS> + \c^ 30 <RS> <RS> <RS> + \c_ 31 <US> <US> <US> =head1 FUNCTION DIFFERENCES @@ -856,7 +873,7 @@ recommend something similar to: Under the IBM OS/390 USS Web Server or WebSphere on z/OS for example you should instead write that as: - print "Content-type:\ttext/html\r\n\r\n"; # OK for DGW et alia + print "Content-type:\ttext/html\r\n\r\n"; # OK for DGW et al That is because the translation from EBCDIC to ASCII is done by the web server in this case (such code will not be appropriate for @@ -891,7 +908,7 @@ See the discussion of pack() above. =head1 REGULAR EXPRESSION DIFFERENCES -As of perl 5.005_03 the letter range regular expression such as +As of perl 5.005_03 the letter range regular expressions such as [A-Z] and [a-z] have been especially coded to not pick up gap characters. 
For example, characters such as E<ocirc> C<o WITH CIRCUMFLEX> that lie between I and J would not be matched by the @@ -948,7 +965,7 @@ four coded character sets discussed in this document is as follows: if (ord('^')==94) { # ascii return $char =~ /[\000-\037]/; } - if (ord('^')==176) { # 37 + if (ord('^')==176) { # 0037 return $char =~ /[\000-\003\067\055-\057\026\005\045\013-\023\074\075\062\046\030\031\077\047\034-\037]/; } if (ord('^')==95 || ord('^')==106) { # 1047 || posix-bc @@ -976,7 +993,7 @@ four coded character sets discussed in this document is as follows: if (ord('^')==94) { # ascii return $char =~ /[\200-\237]/; } - if (ord('^')==176) { # 37 + if (ord('^')==176) { # 0037 return $char =~ /[\040-\044\025\006\027\050-\054\011\012\033\060\061\032\063-\066\010\070-\073\040\024\076\377]/; } if (ord('^')==95) { # 1047 @@ -993,7 +1010,7 @@ four coded character sets discussed in this document is as follows: if (ord('^')==94) { # ascii return $char =~ /[\240-\377]/; } - if (ord('^')==176) { # 37 + if (ord('^')==176) { # 0037 return $char =~ /[\101\252\112\261\237\262\152\265\275\264\232\212\137\312\257\274\220\217\352\372\276\240\266\263\235\332\233\213\267\270\271\253\144\145\142\146\143\147\236\150\164\161-\163\170\165-\167\254\151\355\356\353\357\354\277\200\375\376\373\374\255\256\131\104\105\102\106\103\107\234\110\124\121-\123\130\125-\127\214\111\315\316\313\317\314\341\160\335\336\333\334\215\216\337]/; } @@ -1029,21 +1046,21 @@ output. =head1 SORTING -One big difference between ASCII based character sets and EBCDIC ones +One big difference between ASCII-based character sets and EBCDIC ones are the relative positions of upper and lower case letters and the -letters compared to the digits. If sorted on an ASCII based platform the -two letter abbreviation for a physician comes before the two letter -for drive, that is: +letters compared to the digits. 
If sorted on an ASCII-based platform the +two-letter abbreviation for a physician comes before the two-letter +abbreviation for drive; that is: - @sorted = sort(qw(Dr. dr.)); # @sorted holds ('Dr.','dr.') on ASCII, + @sorted = sort(qw(Dr. dr.)); # @sorted holds ('Dr.','dr.') on ASCII, # but ('dr.','Dr.') on EBCDIC -The property of lower case before uppercase letters in EBCDIC is +The property of lowercase before uppercase letters in EBCDIC is even carried to the Latin 1 EBCDIC pages such as 0037 and 1047. An example would be that E<Euml> C<E WITH DIAERESIS> (203) comes before E<euml> C<e WITH DIAERESIS> (235) on an ASCII platform, but the latter (83) comes before the former (115) on an EBCDIC platform. -(Astute readers will note that the upper case version of E<szlig> +(Astute readers will note that the uppercase version of E<szlig> C<SMALL LETTER SHARP S> is simply "SS" and that the upper case version of E<yuml> C<y WITH DIAERESIS> is not in the 0..255 range but it is at U+x0178 in Unicode, or C<"\x{178}"> in a Unicode enabled Perl). @@ -1059,7 +1076,7 @@ some user education. =head2 MONO CASE then sort data. -In order to minimize the expense of mono casing mixed test try to +In order to minimize the expense of mono casing mixed-case text, try to C<tr///> towards the character set case most employed within the data. If the data are primarily UPPERCASE non Latin 1 then apply tr/[a-z]/[A-Z]/ then sort(). If the data are primarily lowercase non Latin 1 then @@ -1074,7 +1091,7 @@ then sort(). Do note however that such Latin-1 manipulation does not address the E<yuml> C<y WITH DIAERESIS> character that will remain at code point 255 on ASCII platforms, but 223 on most EBCDIC platforms where it will sort to a place less than the EBCDIC numerals.
With a -Unicode enabled Perl you might try: +Unicode-enabled Perl you might try: tr/^?/\x{178}/; @@ -1217,7 +1234,7 @@ that the @e2a array is filled in appropriately: =head2 Quoted-Printable encoding and decoding -On ASCII encoded platforms it is possible to strip characters outside of +On ASCII-encoded platforms it is possible to strip characters outside of the printable set using: # This QP encoder works on ASCII only @@ -1255,14 +1272,14 @@ omitted for brevity): $string =~ s/=([0-9A-Fa-f][0-9A-Fa-f])/chr $a2e[hex $1]/ge; $string =~ s/=[\n\r]+$//; -=head2 Caesarian ciphers +=head2 Caesarean ciphers The practice of shifting an alphabet one or more characters for encipherment dates back thousands of years and was explicitly detailed by Gaius Julius Caesar in his B<Gallic Wars> text. A single alphabet shift is sometimes referred to as a rotation and the shift amount is given as a number $n after the string 'rot' or "rot$n". Rot0 and rot26 would designate identity maps -on the 26 letter English version of the Latin alphabet. Rot13 has the +on the 26-letter English version of the Latin alphabet. Rot13 has the interesting property that alternate subsequent invocations are identity maps (thus rot13 is its own non-trivial inverse in the group of 26 alphabet rotations). Hence the following is a rot13 encoder and decoder that will @@ -1284,16 +1301,16 @@ In one-liner form: To the extent that it is possible to write code that depends on hashing order there may be differences between hashes as stored -on an ASCII based platform and hashes stored on an EBCDIC based platform. +on an ASCII-based platform and hashes stored on an EBCDIC-based platform. XXX =head1 I18N AND L10N -Internationalization(I18N) and localization(L10N) are supported at least -in principle even on EBCDIC platforms. The details are system dependent +Internationalization (I18N) and localization (L10N) are supported at least +in principle even on EBCDIC platforms. 
The details are system-dependent and discussed under the L<perlebcdic/OS ISSUES> section below. -=head1 MULTI OCTET CHARACTER SETS +=head1 MULTI-OCTET CHARACTER SETS Perl may work with an internal UTF-EBCDIC encoding form for wide characters on EBCDIC platforms in a manner analogous to the way that it works with @@ -1303,7 +1320,7 @@ Legacy multi byte EBCDIC code pages XXX. =head1 OS ISSUES -There may be a few system dependent issues +There may be a few system-dependent issues of concern to EBCDIC Perl programmers. =head2 OS/400 @@ -1312,8 +1329,8 @@ of concern to EBCDIC Perl programmers. =item PASE -The PASE environment is runtime environment for OS/400 that can run -executables built for PowerPC AIX in OS/400, see L<perlos400>. PASE +The PASE environment is a runtime environment for OS/400 that can run +executables built for PowerPC AIX in OS/400; see L<perlos400>. PASE is ASCII-based, not EBCDIC-based as the ILE. =item IFS access @@ -1331,7 +1348,7 @@ Perl runs under Unix Systems Services or USS. =item chcp B<chcp> is supported as a shell utility for displaying and changing -one's code page. See also L<chcp>. +one's code page. See also L<chcp(1)>. =item dataset access @@ -1427,5 +1444,3 @@ Thanks also to Vickie Cooper, Philip Newton, William Raffloer, and Joe Smith. Trademarks, registered trademarks, service marks and registered service marks used in this document are the property of their respective owners. - - diff --git a/gnu/usr.bin/perl/pod/perlexperiment.pod b/gnu/usr.bin/perl/pod/perlexperiment.pod new file mode 100644 index 00000000000..f304120bc66 --- /dev/null +++ b/gnu/usr.bin/perl/pod/perlexperiment.pod @@ -0,0 +1,305 @@ +=head1 NAME + +perlexperiment - A listing of experimental features in Perl + +=head1 DESCRIPTION + +This document lists the current and past experimental features in the perl +core. Although all of these are documented with their appropriate topics, +this succinct listing gives you an overview and basic facts about their +status. 
+ +So far I've merely tried to find and list the experimental features and infer +their inception, versions, etc. There's a lot of speculation here. + +=head2 Current experiments + +=over 8 + +=item fork() emulation + +Introduced in Perl 5.6.1 + +See also L<perlfork> + +=item Weak references + +Introduced in Perl 5.6.0 + +=item Internal file glob + +Introduced in Perl 5.6.0 + +Accepted in XXX + +=item 64-bit support + +Introduced in Perl 5.005 + +Accepted in XXX + +=item die accepts a reference + +Introduced in Perl 5.005 + +Accepted in Perl XXX + +=item Unicode support + +Introduced in Perl 5.6.0 + +Accepted in Perl 5.8.0 XXX + +=item -Dusemultiplicity -Dusethreads + +Introduced in Perl 5.6.0 + +=item Long Doubles Still Don't Work In Solaris + +Introduced in Perl 5.7.0 + +=item Getopt::Long Options can now take multiple values at once (experimental) + +C<Getopt::Long> upgraded to version 2.35 + +Removed in Perl 5.8.8 + +=item 5.005-style threading + +Introduced in Perl 5.005 + +Removed in Perl 5.10 XXX + +=item Test::Harness::Straps + +Removed in Perl 5.10.1 + +=item perlcc + +Introduced in Perl 5.005 + +Removed in Perl 5.9.0 + +=item C<our> can now have an experimental optional attribute C<unique> + +Introduced in Perl 5.8.0 + +Deprecated in Perl 5.10.0 + +=item Assertions + +The C<-A> command line switch + +Introduced in Perl 5.9.0 + +Removed in Perl 5.9.5 + +=item Linux abstract Unix domain sockets + +Introduced in Perl 5.9.2 + +See also L<Socket> + +=item L<Pod::HTML2Pod|Pod::HTML2Pod> + +=item L<Pod::PXML|Pod::PXML> + +=item threads + +=item The <:pop> IO pseudolayer + +See also L<perlrun> + +=item The <:win32> IO pseudolayer + +See also L<perlrun> + +=item MLDBM + +See also L<perldsc> + +=item internal functions with M flag + +See also L<perlguts> + +=item lex_start API + +Introduced in Perl 5.13.7 + +=item internal API for C<%H> + +Introduced in Perl 5.13.7 + +See also C<cophh_> in L<perlapi>.
+ +=item av_create_and_push + +=item av_create_and_unshift_one + +=item av_create_and_unshift_one + +=item PL_keyword_plugin + +=item hv_iternext_flags + +=item lex_bufutf8 + +=item lex_discard_to + +=item lex_grow_linestr + +=item lex_next_chunk + +=item lex_peek_unichar + +=item lex_read_space + +=item lex_read_to + +=item lex_read_unichar + +=item lex_stuff_pv + +=item lex_stuff_pvn + +=item lex_stuff_pvs + +=item lex_stuff_sv + +=item lex_unstuff + +=item parse_fullstmt + +=item parse_stmtseq + +=item PL_parser-E<gt>bufend + +=item PL_parser-E<gt>bufptr + +=item PL_parser-E<gt>linestart + +=item PL_parser-E<gt>linestr + +=item Perl_signbit + +=item pad_findmy + +=item sv_utf8_decode + +=item sv_utf8_downgrade + +=item bytes_from_utf8 + +=item bytes_to_utf8 + +=item utf8_to_bytes + +=item DB module + +Introduced in Perl 5.6.0 + +See also L<perldebug>, L<perldebtut> + +=item The pseudo-hash data type + +Introduced in Perl 5.6.0 + +=item Lvalue subroutines + +Introduced in Perl 5.6.0 + +See also L<perlsub> + +=item There is an C<installhtml> target in the Makefile. + +=item Unicode in Perl on EBCDIC + +=item C<(?{code})> + +See also L<perlre> + +=item C<(??{ code })> + +See also L<perlre> + +=item Backtracking control verbs + +C<(*ACCEPT)> + +Introduced in: Perl 5.10 + +See also: L<perlre/"Special Backtracking Control Verbs"> + +=item Code expressions, conditional expressions, and independent expressions in regexes + +=item The C<\N> regex character class + +The C<\N> character class, not to be confused with the named character +sequence C<\N{NAME}>, denotes any non-newline character in a regular +expression. + +Introduced in: Perl 5.12 + +See also: + +=item gv_try_downgrade + +See also L<perlintern> + +=item Experimental Support for Sun Studio Compilers for Linux OS + +See also L<perllinux> + +=item Pluggable keywords + +See L<perlapi/PL_keyword_plugin> for the mechanism. 
+ +Introduced in: Perl 5.11.2 + +=back + +=head2 Accepted features + +These features were so wildly successful and played so well with others that +we decided to remove their experimental status and admit them as full, stable +features in the world of Perl, lavishing all the benefits and luxuries thereof. +They are also awarded +5 Stability and +3 Charisma. + +=over 8 + +=item (none yet identified) + +=back + +=head2 Removed features + +These features are no longer considered experimental and their functionality +has disappeared. It's your own fault if you wrote production programs using +these features after we explicitly told you not to (see L<perlpolicy>). + +=over 8 + +=item C<legacy> + +The experimental C<legacy> pragma was swallowed by the C<feature> pragma. + +Introduced in: 5.11.2 + +Removed in: 5.11.3 + +=back + +=head1 AUTHORS + +brian d foy C<< <brian.d.foy@gmail.com> >> + +=head1 COPYRIGHT + +Copyright 2010, brian d foy C<< <brian.d.foy@gmail.com> >> + +=head1 LICENSE + +You can use and redistribute this document under the same terms as Perl +itself. + +=cut diff --git a/gnu/usr.bin/perl/pod/perlfilter.pod b/gnu/usr.bin/perl/pod/perlfilter.pod index ca5cfd9fb2e..27061883c1f 100644 --- a/gnu/usr.bin/perl/pod/perlfilter.pod +++ b/gnu/usr.bin/perl/pod/perlfilter.pod @@ -217,7 +217,7 @@ difficult for the potential cracker. The most important: Write your decryption filter in C and statically link the decryption module into the Perl binary. For further tips to make life difficult for the potential cracker, see the file I<decrypt.pm> in the source filters -module. +distribution. =back @@ -234,7 +234,7 @@ The source filter distribution includes two modules that simplify this task: C<Filter::exec> and C<Filter::sh>. Both allow you to run any external executable. Both use a coprocess to control the flow of data into and out of the external executable. (For details on coprocesses, -see Stephens, W.R. "Advanced Programming in the UNIX Environment." 
+see Stevens, W.R., "Advanced Programming in the UNIX Environment." Addison-Wesley, ISBN 0-210-56317-7, pages 441-445.) The difference between them is that C<Filter::exec> spawns the external command directly, while C<Filter::sh> spawns a shell to execute the external @@ -388,9 +388,9 @@ Two special marker lines will bracket debugging code, like this: } ## DEBUG_END -When the C<DEBUG> environment variable exists, the filter ensures that -Perl parses only the code between the C<DEBUG_BEGIN> and C<DEBUG_END> -markers. That means that when C<DEBUG> does exist, the code above +The filter ensures that Perl parses the code between the C<DEBUG_BEGIN> +and C<DEBUG_END> markers only when the C<DEBUG> environment variable +exists. That means that when C<DEBUG> does exist, the code above should be passed through the filter unchanged. The marker lines can also be passed through as-is, because the Perl parser will see them as comment lines. When C<DEBUG> isn't set, we need a way to disable the diff --git a/gnu/usr.bin/perl/pod/perlfork.pod b/gnu/usr.bin/perl/pod/perlfork.pod index 48d65ed4c58..7729444c40b 100644 --- a/gnu/usr.bin/perl/pod/perlfork.pod +++ b/gnu/usr.bin/perl/pod/perlfork.pod @@ -37,7 +37,7 @@ thread that implements this child "process" as the pseudo-process. To the Perl program that called fork(), all this is designed to be transparent. The parent returns from the fork() with a pseudo-process -ID that can be subsequently used in any process manipulation functions; +ID that can be subsequently used in any process-manipulation functions; the child returns from the fork() with a value of C<0> to signify that it is the child pseudo-process. @@ -77,12 +77,26 @@ and return its status. =item kill() -kill() can be used to terminate a pseudo-process by passing it the ID returned -by fork(). This should not be used except under dire circumstances, because -the operating system may not guarantee integrity of the process resources -when a running thread is terminated.
Note that using kill() on a -pseudo-process() may typically cause memory leaks, because the thread that -implements the pseudo-process does not get a chance to clean up its resources. +C<kill('KILL', ...)> can be used to terminate a pseudo-process by +passing it the ID returned by fork(). The outcome of kill on a pseudo-process +is unpredictable and it should not be used except +under dire circumstances, because the operating system may not +guarantee integrity of the process resources when a running thread is +terminated. The process which implements the pseudo-processes can be blocked +and the Perl interpreter hangs. Note that using C<kill('KILL', ...)> on a +pseudo-process may typically cause memory leaks, because the thread +that implements the pseudo-process does not get a chance to clean up +its resources. + +C<kill('TERM', ...)> can also be used on pseudo-processes, but the +signal will not be delivered while the pseudo-process is blocked by a +system call, e.g. waiting for a socket to connect, or trying to read +from a socket with no data available. Starting in Perl 5.14 the +parent process will not wait for children to exit once they have been +signalled with C<kill('TERM', ...)> to avoid deadlock during process +exit. You will have to explicitly call waitpid() to make sure the +child has time to clean itself up, but you are then also responsible for ensuring +that the child is not blocking on I/O either. =item exec() @@ -137,11 +151,12 @@ to complete before they exit. This means that the parent and every pseudo-child created by it that is also a pseudo-parent will only exit after their pseudo-children have exited. -A way to mark a pseudo-processes as running detached from their parent (so -that the parent would not have to wait() for them if it doesn't want to) -will be provided in future.
+Starting with Perl 5.14 a parent will not wait() automatically +for any child that has been signalled with C<kill('TERM', ...)> +to avoid a deadlock in case the child is blocking on I/O and +never receives the signal. -=head2 CAVEATS AND LIMITATIONS +=head1 CAVEATS AND LIMITATIONS =over 8 @@ -184,9 +199,22 @@ On some operating systems, notably Solaris and Unixware, calling C<exit()> from a child process will flush and close open filehandles in the parent, thereby corrupting the filehandles. On these systems, calling C<_exit()> is suggested instead. C<_exit()> is available in Perl through the -C<POSIX> module. Please consult your systems manpages for more information +C<POSIX> module. Please consult your system's manpages for more information on this. +=item Open directory handles + +Perl will completely read from all open directory handles until they +reach the end of the stream. It will then seekdir() back to the +original location and all future readdir() requests will be fulfilled +from the cache buffer. That means that neither the directory handle held +by the parent process nor the one held by the child process will see +any changes made to the directory after the fork() call. + +Note that rewinddir() has a similar limitation on Windows and will not +force readdir() to read the directory again either. Only a newly +opened directory handle will reflect changes to the directory. + =item Forking pipe open() not yet implemented The C<open(FOO, "|-")> and C<open(BAR, "-|")> constructs are not yet @@ -281,6 +309,12 @@ are expected to be fixed for thread-safety. =back +=head1 PORTABILITY CAVEATS + +In portable Perl code, C<kill(9, $child)> must not be used on forked processes. +Killing a forked process is unsafe and has unpredictable results. +See L</kill()>, above.
+ =head1 BUGS =over 8 diff --git a/gnu/usr.bin/perl/pod/perlgit.pod b/gnu/usr.bin/perl/pod/perlgit.pod new file mode 100644 index 00000000000..1d2df2ed5e9 --- /dev/null +++ b/gnu/usr.bin/perl/pod/perlgit.pod @@ -0,0 +1,850 @@ +=encoding utf8 + +=for comment +Consistent formatting of this file is achieved with: + perl ./Porting/podtidy pod/perlgit.pod + +=head1 NAME + +perlgit - Detailed information about git and the Perl repository + +=head1 DESCRIPTION + +This document provides details on using git to develop Perl. If you are +just interested in working on a quick patch, see L<perlhack> first. +This document is intended for people who are regular contributors to +Perl, including those with write access to the git repository. + +=head1 CLONING THE REPOSITORY + +All of Perl's source code is kept centrally in a Git repository at +I<perl5.git.perl.org>. + +You can make a read-only clone of the repository by running: + + % git clone git://perl5.git.perl.org/perl.git perl + +This uses the git protocol (port 9418). + +If you cannot use the git protocol for firewall reasons, you can also +clone via http, though this is much slower: + + % git clone http://perl5.git.perl.org/perl.git perl + +=head1 WORKING WITH THE REPOSITORY + +Once you have changed into the repository directory, you can inspect +it. After a clone the repository will contain a single local branch, +which will be the current branch as well, as indicated by the asterisk. + + % git branch + * blead + +Using the -a switch to C<branch> will also show the remote tracking +branches in the repository: + + % git branch -a + * blead + origin/HEAD + origin/blead + ... + +The branches that begin with "origin" correspond to the "git remote" +that you cloned from (which is named "origin"). Each branch on the +remote will be exactly tracked by these branches. You should NEVER do +work on these remote tracking branches. You only ever do work in a +local branch. 
Local branches can be configured to automerge (on pull) +from a designated remote tracking branch. This is the case with the +default branch C<blead> which will be configured to merge from the +remote tracking branch C<origin/blead>. + +You can see recent commits: + + % git log + +And pull new changes from the repository, and update your local +repository (must be clean first) + + % git pull + +Assuming we are on the branch C<blead> immediately after a pull, this +command would be more or less equivalent to: + + % git fetch + % git merge origin/blead + +In fact if you want to update your local repository without touching +your working directory you do: + + % git fetch + +And if you want to update your remote-tracking branches for all defined +remotes simultaneously you can do + + % git remote update + +Neither of these last two commands will update your working directory, +however both will update the remote-tracking branches in your +repository. + +To make a local branch of a remote branch: + + % git checkout -b maint-5.10 origin/maint-5.10 + +To switch back to blead: + + % git checkout blead + +=head2 Finding out your status + +The most common git command you will use will probably be + + % git status + +This command will produce as output a description of the current state +of the repository, including modified files and unignored untracked +files, and in addition it will show things like what files have been +staged for the next commit, and usually some useful information about +how to change things. For instance the following: + + $ git status + # On branch blead + # Your branch is ahead of 'origin/blead' by 1 commit. + # + # Changes to be committed: + # (use "git reset HEAD <file>..." to unstage) + # + # modified: pod/perlgit.pod + # + # Changed but not updated: + # (use "git add <file>..." to update what will be committed) + # + # modified: pod/perlgit.pod + # + # Untracked files: + # (use "git add <file>..." 
to include in what will be committed) + # + # deliberate.untracked + +This shows that there were changes to this document staged for commit, +and that there were further changes in the working directory not yet +staged. It also shows that there was an untracked file in the working +directory, and, as you can see, shows how to change all of this. It also +shows that there is one commit on the working branch C<blead> which has +not been pushed to the C<origin> remote yet. B<NOTE>: this output +is also what you see as a template if you do not provide a message to +C<git commit>. + +=head2 Patch workflow + +First, please read L<perlhack> for details on hacking the Perl core. +That document covers many details on how to create a good patch. + +If you already have a Perl repository, you should ensure that you're on +the I<blead> branch, and your repository is up to date: + + % git checkout blead + % git pull + +It's preferable to patch against the latest blead version, since this +is where new development occurs for all changes other than critical bug +fixes. Critical bug fix patches should be made against the relevant +maint branches, or should be submitted with a note indicating all the +branches where the fix should be applied. + +Now that we have everything up to date, we need to create a temporary +new branch for these changes and switch into it: + + % git checkout -b orange + +which is the short form of + + % git branch orange + % git checkout orange + +Creating a topic branch makes it easier for the maintainers to rebase +or merge back into the master blead for a more linear history. If you +don't work on a topic branch the maintainer has to manually cherry pick +your changes onto blead before they can be applied. + +That'll get you scolded on perl5-porters, so don't do that. Be Awesome. + +Then make your changes.
For example, if Leon Brocard changes his name +to Orange Brocard, we should change his name in the AUTHORS file: + + % perl -pi -e 's{Leon Brocard}{Orange Brocard}' AUTHORS + +You can see what files are changed: + + % git status + # On branch orange + # Changes to be committed: + # (use "git reset HEAD <file>..." to unstage) + # + # modified: AUTHORS + # + +And you can see the changes: + + % git diff + diff --git a/AUTHORS b/AUTHORS + index 293dd70..722c93e 100644 + --- a/AUTHORS + +++ b/AUTHORS + @@ -541,7 +541,7 @@ Lars Hecking <lhecking@nmrc.ucc.ie> + Laszlo Molnar <laszlo.molnar@eth.ericsson.se> + Leif Huhn <leif@hale.dkstat.com> + Len Johnson <lenjay@ibm.net> + -Leon Brocard <acme@astray.com> + +Orange Brocard <acme@astray.com> + Les Peters <lpeters@aol.net> + Lesley Binks <lesley.binks@gmail.com> + Lincoln D. Stein <lstein@cshl.org> + +Now commit your change locally: + + % git commit -a -m 'Rename Leon Brocard to Orange Brocard' + Created commit 6196c1d: Rename Leon Brocard to Orange Brocard + 1 files changed, 1 insertions(+), 1 deletions(-) + +The C<-a> option is used to include all files that git tracks that you +have changed. If at this time, you only want to commit some of the +files you have worked on, you can omit the C<-a> and use the command +C<S<git add I<FILE ...>>> before doing the commit. C<S<git add +--interactive>> allows you to even just commit portions of files +instead of all the changes in them. + +The C<-m> option is used to specify the commit message. If you omit it, +git will open a text editor for you to compose the message +interactively. This is useful when the changes are more complex than +the sample given here, and, depending on the editor, to know that the +first line of the commit message doesn't exceed the 50 character legal +maximum. 
+ +Once you've finished writing your commit message and exited your +editor, git will write your change to disk and tell you something like +this: + + Created commit daf8e63: explain git status and stuff about remotes + 1 files changed, 83 insertions(+), 3 deletions(-) + +If you re-run C<git status>, you should see something like this: + + % git status + # On branch blead + # Your branch is ahead of 'origin/blead' by 2 commits. + # + # Untracked files: + # (use "git add <file>..." to include in what will be committed) + # + # deliberate.untracked + nothing added to commit but untracked files present (use "git add" to track) + +When in doubt, before you do anything else, check your status and read +it carefully, many questions are answered directly by the git status +output. + +You can examine your last commit with: + + % git show HEAD + +and if you are not happy with either the description or the patch +itself you can fix it up by editing the files once more and then issue: + + % git commit -a --amend + +Now you should create a patch file for all your local changes: + + % git format-patch -M origin.. + 0001-Rename-Leon-Brocard-to-Orange-Brocard.patch + +You should now send an email to +L<perlbug@perl.org|mailto:perlbug@perl.org> with a description of your +changes, and include this patch file as an attachment. In addition to +being tracked by RT, mail to perlbug will automatically be forwarded to +perl5-porters (with manual moderation, so please be patient). You +should only send patches to +L<perl5-porters@perl.org|mailto:perl5-porters@perl.org> directly if the +patch is not ready to be applied, but intended for discussion. + +See the next section for how to configure and use git to send these +emails for you. + +If you want to delete your temporary branch, you may do so with: + + % git checkout blead + % git branch -d orange + error: The branch 'orange' is not an ancestor of your current HEAD. + If you are sure you want to delete it, run 'git branch -D orange'. 
+ % git branch -D orange + Deleted branch orange. + +=head2 Committing your changes + +Assuming that you'd like to commit all the changes you've made as a +single atomic unit, run this command: + + % git commit -a + +(That C<-a> tells git to add every file you've changed to this commit. +New files aren't automatically added to your commit when you use +C<commit -a>. If you want to add files or to commit some, but not all of +your changes, have a look at the documentation for C<git add>.) + +Git will start up your favorite text editor, so that you can craft a +commit message for your change. See L<perlhack/Commit message> for more +information about what makes a good commit message. + +Once you've finished writing your commit message and exited your +editor, git will write your change to disk and tell you something like +this: + + Created commit daf8e63: explain git status and stuff about remotes + 1 files changed, 83 insertions(+), 3 deletions(-) + +If you re-run C<git status>, you should see something like this: + + % git status + # On branch blead + # Your branch is ahead of 'origin/blead' by 2 commits. + # + # Untracked files: + # (use "git add <file>..." to include in what will be committed) + # + # deliberate.untracked + nothing added to commit but untracked files present (use "git add" to track) + +When in doubt, before you do anything else, check your status and read +it carefully; many questions are answered directly by the git status +output. + +=head2 Using git to send patch emails + +Please read L<perlhack> first in order to figure out where your patches +should be sent.
+ +In your ~/git/perl repository, set the destination email to perl's bug +tracker: + + $ git config sendemail.to perlbug@perl.org + +Or maybe perl5-porters: + + $ git config sendemail.to perl5-porters@perl.org + +Then you can use git directly to send your patch emails: + + $ git send-email 0001-Rename-Leon-Brocard-to-Orange-Brocard.patch + +You may need to set some configuration variables for your particular +email service provider. For example, to set your global git config to +send email via a gmail account: + + $ git config --global sendemail.smtpserver smtp.gmail.com + $ git config --global sendemail.smtpssl 1 + $ git config --global sendemail.smtpuser YOURUSERNAME@gmail.com + +With this configuration, you will be prompted for your gmail password +when you run 'git send-email'. You can also configure +C<sendemail.smtppass> with your password if you don't care about having +your password in the .gitconfig file. + +=head2 A note on derived files + +Be aware that many files in the distribution are derivative--avoid +patching them, because git won't see the changes to them, and the build +process will overwrite them. Patch the originals instead. Most +utilities (like perldoc) are in this category, i.e. patch +F<utils/perldoc.PL> rather than F<utils/perldoc>. Similarly, don't +create patches for files under $src_root/ext from their copies found in +$install_root/lib. If you are unsure about the proper location of a +file that may have gotten copied while building the source +distribution, consult the C<MANIFEST>. + +=head2 Cleaning a working directory + +The command C<git clean> can with varying arguments be used as a +replacement for C<make clean>. + +To reset your working directory to a pristine condition you can do: + + % git clean -dxf + +However, be aware this will delete ALL untracked content. You can use + + % git clean -Xf + +to remove all ignored untracked files, such as build and test +byproduct, but leave any manually created files alone. 
+ +If you only want to cancel some uncommitted edits, you can use C<git +checkout> and give it a list of files to be reverted, or C<git checkout +-f> to revert them all. + +If you want to cancel one or several commits, you can use C<git reset>. + +=head2 Bisecting + +C<git> provides a built-in way to determine which commit should be blamed +for introducing a given bug. C<git bisect> performs a binary search of +history to locate the first failing commit. It is fast, powerful and +flexible, but requires some setup and to automate the process an auxiliary +shell script is needed. + +The core provides a wrapper program, F<Porting/bisect.pl>, which attempts to +simplify as much as possible, making bisecting as simple as running a Perl +one-liner. For example, if you want to know when this became an error: + + perl -e 'my $a := 2' + +you simply run this: + + .../Porting/bisect.pl -e 'my $a := 2;' + +Using C<bisect.pl>, with one command (and no other files) it's easy to find +out + +=over 4 + +=item * + +Which commit caused this example code to break? + +=item * + +Which commit caused this example code to start working? + +=item * + +Which commit added the first file to match this regex? + +=item * + +Which commit removed the last file to match this regex? + +=back + +usually without needing to know which versions of perl to use as start and +end revisions, as F<bisect.pl> automatically searches to find the earliest +stable version for which the test case passes. Run +C<Porting/bisect.pl --help> for the full documentation, including how to +set the C<Configure> and build time options. + +If you require more flexibility than F<Porting/bisect.pl> has to offer, you'll +need to run C<git bisect> yourself. It's most useful to use C<git bisect run> +to automate the building and testing of perl revisions. For this you'll need +a shell script for C<git> to call to test a particular revision. 
An example +script is F<Porting/bisect-example.sh>, which you should copy B<outside> of +the repository, as the bisect process will reset the state to a clean checkout +as it runs. The instructions below assume that you copied it as F<~/run> and +then edited it as appropriate. + +You first enter in bisect mode with: + + % git bisect start + +For example, if the bug is present on C<HEAD> but wasn't in 5.10.0, +C<git> will learn about this when you enter: + + % git bisect bad + % git bisect good perl-5.10.0 + Bisecting: 853 revisions left to test after this + +This results in checking out the median commit between C<HEAD> and +C<perl-5.10.0>. You can then run the bisecting process with: + + % git bisect run ~/run + +When the first bad commit is isolated, C<git bisect> will tell you so: + + ca4cfd28534303b82a216cfe83a1c80cbc3b9dc5 is first bad commit + commit ca4cfd28534303b82a216cfe83a1c80cbc3b9dc5 + Author: Dave Mitchell <davem@fdisolutions.com> + Date: Sat Feb 9 14:56:23 2008 +0000 + + [perl #49472] Attributes + Unknown Error + ... + + bisect run success + +You can peek into the bisecting process with C<git bisect log> and +C<git bisect visualize>. C<git bisect reset> will get you out of bisect +mode. + +Please note that the first C<good> state must be an ancestor of the +first C<bad> state. If you want to search for the commit that I<solved> +some bug, you have to negate your test case (i.e. exit with C<1> if OK +and C<0> if not) and still mark the lower bound as C<good> and the +upper as C<bad>. The "first bad commit" has then to be understood as +the "first commit where the bug is solved". + +C<git help bisect> has much more information on how you can tweak your +binary searches. + +=head1 Topic branches and rewriting history + +Individual committers should create topic branches under +B<yourname>/B<some_descriptive_name>. Other committers should check +with a topic branch's creator before making any change to it. 
+ +The simplest way to create a remote topic branch that works on all +versions of git is to push the current head as a new branch on the +remote, then check it out locally: + + $ branch="$yourname/$some_descriptive_name" + $ git push origin HEAD:$branch + $ git checkout -b $branch origin/$branch + +Users of git 1.7 or newer can do it in a more obvious manner: + + $ branch="$yourname/$some_descriptive_name" + $ git checkout -b $branch + $ git push origin -u $branch + +If you are not the creator of B<yourname>/B<some_descriptive_name>, you +might sometimes find that the original author has edited the branch's +history. There are lots of good reasons for this. Sometimes, an author +might simply be rebasing the branch onto a newer source point. +Sometimes, an author might have found an error in an early commit which +they wanted to fix before merging the branch to blead. + +Currently the master repository is configured to forbid +non-fast-forward merges. This means that the branches within can not be +rebased and pushed as a single step. + +The only way you will ever be allowed to rebase or modify the history +of a pushed branch is to delete it and push it as a new branch under +the same name. Please think carefully about doing this. It may be +better to sequentially rename your branches so that it is easier for +others working with you to cherry-pick their local changes onto the new +version. (XXX: needs explanation). + +If you want to rebase a personal topic branch, you will have to delete +your existing topic branch and push as a new version of it. 
You can do +this via the following formula (see the explanation about C<refspec>'s +in the git push documentation for details) after you have rebased your +branch: + + # first rebase + $ git checkout $user/$topic + $ git fetch + $ git rebase origin/blead + + # then "delete-and-push" + $ git push origin :$user/$topic + $ git push origin $user/$topic + +B<NOTE:> it is forbidden at the repository level to delete any of the +"primary" branches. That is any branch matching +C<m!^(blead|maint|perl)!>. Any attempt to do so will result in git +producing an error like this: + + $ git push origin :blead + *** It is forbidden to delete blead/maint branches in this repository + error: hooks/update exited with error code 1 + error: hook declined to update refs/heads/blead + To ssh://perl5.git.perl.org/perl + ! [remote rejected] blead (hook declined) + error: failed to push some refs to 'ssh://perl5.git.perl.org/perl' + +As a matter of policy we do B<not> edit the history of the blead and +maint-* branches. If a typo (or worse) sneaks into a commit to blead or +maint-*, we'll fix it in another commit. The only types of updates +allowed on these branches are "fast-forward's", where all history is +preserved. + +Annotated tags in the canonical perl.git repository will never be +deleted or modified. Think long and hard about whether you want to push +a local tag to perl.git before doing so. (Pushing unannotated tags is +not allowed.) + +=head2 Grafts + +The perl history contains one mistake which was not caught in the +conversion: a merge was recorded in the history between blead and +maint-5.10 where no merge actually occurred. Due to the nature of git, +this is now impossible to fix in the public repository. 
You can remove +this mis-merge locally by adding the following line to your +C<.git/info/grafts> file: + + 296f12bbbbaa06de9be9d09d3dcf8f4528898a49 434946e0cb7a32589ed92d18008aaa1d88515930 + +It is particularly important to have this graft line if any bisecting +is done in the area of the "merge" in question. + +=head1 WRITE ACCESS TO THE GIT REPOSITORY + +Once you have write access, you will need to modify the URL for the +origin remote to enable pushing. Edit F<.git/config> with the +git-config(1) command: + + % git config remote.origin.url ssh://perl5.git.perl.org/perl.git + +You can also set up your user name and e-mail address. Most people do +this once globally in their F<~/.gitconfig> by doing something like: + + % git config --global user.name "Ævar Arnfjörð Bjarmason" + % git config --global user.email avarab@gmail.com + +However if you'd like to override that just for perl then +execute something like the following in F<perl>: + + % git config user.email avar@cpan.org + +It is also possible to keep C<origin> as a git remote, and add a new +remote for ssh access: + + % git remote add camel perl5.git.perl.org:/perl.git + +This allows you to update your local repository by pulling from +C<origin>, which is faster and doesn't require you to authenticate, and +to push your changes back with the C<camel> remote: + + % git fetch camel + % git push camel + +The C<fetch> command just updates the C<camel> refs, as the objects +themselves should have been fetched when pulling from C<origin>. + +=head1 Accepting a patch + +If you have received a patch file generated using the above section, +you should try out the patch. 
+ +First we need to create a temporary new branch for these changes and +switch into it: + + % git checkout -b experimental + +Patches that were formatted by C<git format-patch> are applied with +C<git am>: + + % git am 0001-Rename-Leon-Brocard-to-Orange-Brocard.patch + Applying Rename Leon Brocard to Orange Brocard + +If just a raw diff is provided, it is also possible to use this two-step +process: + + % git apply bugfix.diff + % git commit -a -m "Some fixing" --author="That Guy <that.guy@internets.com>" + +Now we can inspect the change: + + % git show HEAD + commit b1b3dab48344cff6de4087efca3dbd63548ab5e2 + Author: Leon Brocard <acme@astray.com> + Date: Fri Dec 19 17:02:59 2008 +0000 + + Rename Leon Brocard to Orange Brocard + + diff --git a/AUTHORS b/AUTHORS + index 293dd70..722c93e 100644 + --- a/AUTHORS + +++ b/AUTHORS + @@ -541,7 +541,7 @@ Lars Hecking <lhecking@nmrc.ucc.ie> + Laszlo Molnar <laszlo.molnar@eth.ericsson.se> + Leif Huhn <leif@hale.dkstat.com> + Len Johnson <lenjay@ibm.net> + -Leon Brocard <acme@astray.com> + +Orange Brocard <acme@astray.com> + Les Peters <lpeters@aol.net> + Lesley Binks <lesley.binks@gmail.com> + Lincoln D. Stein <lstein@cshl.org> + +If you are a committer to Perl and you think the patch is good, you can +then merge it into blead then push it out to the main repository: + + % git checkout blead + % git merge experimental + % git push + +If you want to delete your temporary branch, you may do so with: + + % git checkout blead + % git branch -d experimental + error: The branch 'experimental' is not an ancestor of your current HEAD. + If you are sure you want to delete it, run 'git branch -D experimental'. + % git branch -D experimental + Deleted branch experimental. + +=head2 Committing to blead + +The 'blead' branch will become the next production release of Perl. 
+ +Before pushing I<any> local change to blead, it's incredibly important +that you do a few things, lest other committers come after you with +pitchforks and torches: + +=over + +=item * + +Make sure you have a good commit message. See L<perlhack/Commit +message> for details. + +=item * + +Run the test suite. You might not think that one typo fix would break a +test file. You'd be wrong. Here's an example of where not running the +suite caused problems. A patch was submitted that added a couple of +tests to an existing .t. It couldn't possibly affect anything else, so +no need to test beyond the single affected .t, right? But, the +submitter's email address had changed since the last of their +submissions, and this caused other tests to fail. Running the test +target given in the next item would have caught this problem. + +=item * + +If you don't run the full test suite, at least C<make test_porting>. +This will run basic sanity checks. To see which sanity checks, have a +look in F<t/porting>. + +=item * + +If you make any changes that affect miniperl or core routines that have +different code paths for miniperl, be sure to run C<make minitest>. +This will catch problems that even the full test suite will not catch +because it runs a subset of tests under miniperl rather than perl. + +=back + +=head3 On merging and rebasing + +Simple, one-off commits pushed to the 'blead' branch should be simple +commits that apply cleanly. In other words, you should make sure your +work is committed against the current position of blead, so that you can +push back to the master repository without merging. + +Sometimes, blead will move while you're building or testing your +changes. When this happens, your push will be rejected with a message +like this: + + To ssh://perl5.git.perl.org/perl.git + ! 
[rejected] blead -> blead (non-fast-forward) + error: failed to push some refs to 'ssh://perl5.git.perl.org/perl.git' + To prevent you from losing history, non-fast-forward updates were rejected + Merge the remote changes (e.g. 'git pull') before pushing again. See the + 'Note about fast-forwards' section of 'git push --help' for details. + +When this happens, you can just I<rebase> your work against the new +position of blead, like this (assuming your remote for the master +repository is "p5p"): + + $ git fetch p5p + $ git rebase p5p/blead + +You will see your commits being re-applied, and you will then be able to +push safely. More information about rebasing can be found in the +documentation for the git-rebase(1) command. + +For larger sets of commits that only make sense together, or that would +benefit from a summary of the set's purpose, you should use a merge +commit. You should perform your work on a L<topic branch|/Topic +branches and rewriting history>, which you should regularly rebase +against blead to ensure that your code is not broken by blead moving. +When you have finished your work, please perform a final rebase and +test. Linear history is something that gets lost with every +commit on blead, but a final rebase makes the history linear +again, making it easier for future maintainers to see what has +happened. Rebase as follows (assuming your work was on the +branch C<< committer/somework >>): + + $ git checkout committer/somework + $ git rebase blead + +Then you can merge it into master like this: + + $ git checkout blead + $ git merge --no-ff --no-commit committer/somework + $ git commit -a + +The switches above deserve explanation. C<--no-ff> indicates that even +if all your work can be applied linearly against blead, a merge commit +should still be prepared. This ensures that all your work will be shown +as a side branch, with all its commits merged into the mainstream blead +by the merge commit. 
+ +C<--no-commit> means that the merge commit will be I<prepared> but not +I<committed>. The commit is then actually performed when you run the +next command, which will bring up your editor to describe the commit. +Without C<--no-commit>, the commit would be made with nearly no useful +message, which would greatly diminish the value of the merge commit as a +placeholder for the work's description. + +When describing the merge commit, explain the purpose of the branch, and +keep in mind that this description will probably be used by the +eventual release engineer when reviewing the next perldelta document. + +=head2 Committing to maintenance versions + +Maintenance versions should only be altered to add critical bug fixes, +see L<perlpolicy>. + +To commit to a maintenance version of perl, you need to create a local +tracking branch: + + % git checkout --track -b maint-5.005 origin/maint-5.005 + +This creates a local branch named C<maint-5.005>, which tracks the +remote branch C<origin/maint-5.005>. Then you can pull, commit, merge +and push as before. + +You can also cherry-pick commits from blead and another branch, by +using the C<git cherry-pick> command. It is recommended to use the +B<-x> option to C<git cherry-pick> in order to record the SHA1 of the +original commit in the new commit message. + +Before pushing any change to a maint version, make sure you've +satisfied the steps in L</Committing to blead> above. + +=head2 Merging from a branch via GitHub + +While we don't encourage the submission of patches via GitHub, that +will still happen. Here is a guide to merging patches from a GitHub +repository. 
+ + % git remote add avar git://github.com/avar/perl.git + % git fetch avar + +Now you can see the differences between the branch and blead: + + % git diff avar/orange + +And you can see the commits: + + % git log avar/orange + +If you approve of a specific commit, you can cherry pick it: + + % git cherry-pick 0c24b290ae02b2ab3304f51d5e11e85eb3659eae + +Or you could just merge the whole branch if you like it all: + + % git merge avar/orange + +And then push back to the repository: + + % git push + +=head2 A note on camel and dromedary + +The committers have SSH access to the two servers that serve +C<perl5.git.perl.org>. One is C<perl5.git.perl.org> itself (I<camel>), +which is the 'master' repository. The second one is +C<users.perl5.git.perl.org> (I<dromedary>), which can be used for +general testing and development. Dromedary syncs the git tree from +camel every few minutes; you should not push there. Both machines also +have a full CPAN mirror in /srv/CPAN; please use this. To share files +with the general public, dromedary serves your ~/public_html/ as +C<http://users.perl5.git.perl.org/~yourlogin/> + +These hosts have fairly strict firewalls to the outside. Outgoing, only +rsync, ssh and git are allowed. For http and ftp, you can use +http://webproxy:3128 as proxy. Incoming, the firewall tries to detect +attacks and blocks IP addresses with suspicious activity. This +sometimes (but very rarely) has false positives and you might get +blocked. The quickest way to get unblocked is to notify the admins. + +These two boxes are owned, hosted, and operated by booking.com. You can +reach the sysadmins in #p5p on irc.perl.org or via mail to +C<perl5-porters@perl.org>. diff --git a/gnu/usr.bin/perl/pod/perlgpl.pod b/gnu/usr.bin/perl/pod/perlgpl.pod index de1791a95a8..82a8f5a9dd1 100644 --- a/gnu/usr.bin/perl/pod/perlgpl.pod +++ b/gnu/usr.bin/perl/pod/perlgpl.pod @@ -37,7 +37,8 @@ For the Perl Artistic License, see L<perlartistic>. 
Version 1, February 1989 Copyright (C) 1989 Free Software Foundation, Inc. - 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA + 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. @@ -250,8 +251,10 @@ For the Perl Artistic License, see L<perlartistic>. GNU General Public License for more details. You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software Foundation, - Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA + 02110-1301 USA + Also add information on how to contact you by electronic and paper mail. @@ -259,13 +262,13 @@ For the Perl Artistic License, see L<perlartistic>. when it starts in an interactive mode: Gnomovision version 69, Copyright (C) 19xx name of author - Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type 'show w'. This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. + under certain conditions; type 'show c' for details. - The hypothetical commands `show w' and `show c' should show the + The hypothetical commands 'show w' and 'show c' should show the appropriate parts of the General Public License. Of course, the - commands you use may be called something other than `show w' and `show + commands you use may be called something other than 'show w' and 'show c'; they could even be mouse-clicks or menu items--whatever suits your program. @@ -274,7 +277,7 @@ For the Perl Artistic License, see L<perlartistic>. necessary. 
Here a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the - program `Gnomovision' (a program to direct compilers to make passes + program 'Gnomovision' (a program to direct compilers to make passes at assemblers) written by James Hacker. <signature of Ty Coon>, 1 April 1989 @@ -283,5 +286,3 @@ For the Perl Artistic License, see L<perlartistic>. That's all there is to it! =cut - - diff --git a/gnu/usr.bin/perl/pod/perlhack.pod b/gnu/usr.bin/perl/pod/perlhack.pod index f4dac2ce510..63df5d5dfc3 100644 --- a/gnu/usr.bin/perl/pod/perlhack.pod +++ b/gnu/usr.bin/perl/pod/perlhack.pod @@ -1,1988 +1,854 @@ -=head1 NAME +=encoding utf8 -perlhack - How to hack at the Perl internals +=for comment +Consistent formatting of this file is achieved with: + perl ./Porting/podtidy pod/perlhack.pod -=head1 DESCRIPTION +=head1 NAME -This document attempts to explain how Perl development takes place, -and ends with some suggestions for people wanting to become bona fide -porters. +perlhack - How to hack on Perl -The perl5-porters mailing list is where the Perl standard distribution -is maintained and developed. The list can get anywhere from 10 to 150 -messages a day, depending on the heatedness of the debate. Most days -there are two or three patches, extensions, features, or bugs being -discussed at a time. +=head1 DESCRIPTION -A searchable archive of the list is at either: +This document explains how Perl development works. It includes details +about the Perl 5 Porters email list, the Perl repository, the Perlbug +bug tracker, patch guidelines, and commentary on Perl development +philosophy. - http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/ +=head1 SUPER QUICK PATCH GUIDE -or - - http://archive.develooper.com/perl5-porters@perl.org/ - -List subscribers (the porters themselves) come in several flavours. 
-Some are quiet curious lurkers, who rarely pitch in and instead watch -the ongoing development to ensure they're forewarned of new changes or -features in Perl. Some are representatives of vendors, who are there -to make sure that Perl continues to compile and work on their -platforms. Some patch any reported bug that they know how to fix, -some are actively patching their pet area (threads, Win32, the regexp -engine), while others seem to do nothing but complain. In other -words, it's your usual mix of technical people. - -Over this group of porters presides Larry Wall. He has the final word -in what does and does not change in the Perl language. Various -releases of Perl are shepherded by a "pumpking", a porter -responsible for gathering patches, deciding on a patch-by-patch, -feature-by-feature basis what will and will not go into the release. -For instance, Gurusamy Sarathy was the pumpking for the 5.6 release of -Perl, and Jarkko Hietaniemi was the pumpking for the 5.8 release, and -Rafael Garcia-Suarez holds the pumpking crown for the 5.10 release. - -In addition, various people are pumpkings for different things. For -instance, Andy Dougherty and Jarkko Hietaniemi did a grand job as the -I<Configure> pumpkin up till the 5.8 release. For the 5.10 release -H.Merijn Brand took over. - -Larry sees Perl development along the lines of the US government: -there's the Legislature (the porters), the Executive branch (the -pumpkings), and the Supreme Court (Larry). The legislature can -discuss and submit patches to the executive branch all they like, but -the executive branch is free to veto them. Rarely, the Supreme Court -will side with the executive branch over the legislature, or the -legislature over the executive branch. Mostly, however, the -legislature and the executive branch are supposed to get along and -work out their differences without impeachment or court cases. - -You might sometimes see reference to Rule 1 and Rule 2. 
Larry's power -as Supreme Court is expressed in The Rules: +If you just want to submit a single small patch like a pod fix, a test +for a bug, comment fixes, etc., it's easy! Here's how: =over 4 -=item 1 - -Larry is always by definition right about how Perl should behave. -This means he has final veto power on the core functionality. +=item * Check out the source repository -=item 2 +The perl source is in a git repository. You can clone the repository +with the following command: -Larry is allowed to change his mind about any matter at a later date, -regardless of whether he previously invoked Rule 1. + % git clone git://perl5.git.perl.org/perl.git perl -=back +=item * Make your change -Got that? Larry is always right, even when he was wrong. It's rare -to see either Rule exercised, but they are often alluded to. +Hack, hack, hack. -New features and extensions to the language are contentious, because -the criteria used by the pumpkings, Larry, and other porters to decide -which features should be implemented and incorporated are not codified -in a few small design goals as with some other languages. Instead, -the heuristics are flexible and often difficult to fathom. Here is -one person's list, roughly in decreasing order of importance, of -heuristics that new features have to be weighed against: +=item * Test your change -=over 4 +You can run all the tests with the following commands: -=item Does concept match the general goals of Perl? + % ./Configure -des -Dusedevel + % make test -These haven't been written anywhere in stone, but one approximation -is: +Keep hacking until the tests pass. - 1. Keep it fast, simple, and useful. - 2. Keep features/concepts as orthogonal as possible. - 3. No arbitrary limits (platforms, data sizes, cultures). - 4. Keep it open and exciting to use/patch/advocate Perl everywhere. - 5. Either assimilate new technologies, or build bridges to them. +=item * Commit your change -=item Where is the implementation? 
+Committing your work will save the change I<on your local system>: -All the talk in the world is useless without an implementation. In -almost every case, the person or people who argue for a new feature -will be expected to be the ones who implement it. Porters capable -of coding new features have their own agendas, and are not available -to implement your (possibly good) idea. + % git commit -a -m 'Commit message goes here' -=item Backwards compatibility +Make sure the commit message describes your change in a single +sentence. For example, "Fixed spelling errors in perlhack.pod". -It's a cardinal sin to break existing Perl programs. New warnings are -contentious--some say that a program that emits warnings is not -broken, while others say it is. Adding keywords has the potential to -break programs, changing the meaning of existing token sequences or -functions might break programs. +=item * Send your change to perlbug -=item Could it be a module instead? +The next step is to submit your patch to the Perl core ticket system +via email. -Perl 5 has extension mechanisms, modules and XS, specifically to avoid -the need to keep changing the Perl interpreter. You can write modules -that export functions, you can give those functions prototypes so they -can be called like built-in functions, you can even write XS code to -mess with the runtime data structures of the Perl interpreter if you -want to implement really complicated things. If it can be done in a -module instead of in the core, it's highly unlikely to be added. +Assuming your patch consists of a single git commit, the following +writes the file as a MIME attachment, and sends it with a meaningful +subject: -=item Is the feature generic enough? + % git format-patch -1 --attach + % perlbug -s "[PATCH] $(git log -1 --oneline HEAD)" -f 0001-*.patch -Is this something that only the submitter wants added to the language, -or would it be broadly useful? 
Sometimes, instead of adding a feature -with a tight focus, the porters might decide to wait until someone -implements the more generalized feature. For instance, instead of -implementing a "delayed evaluation" feature, the porters are waiting -for a macro system that would permit delayed evaluation and much more. +The perlbug program will ask you a few questions about your email +address and the patch you're submitting. Once you've answered them it +will submit your patch via email. -=item Does it potentially introduce new bugs? +=item * Thank you -Radical rewrites of large chunks of the Perl interpreter have the -potential to introduce new bugs. The smaller and more localized the -change, the better. +The porters appreciate the time you spent helping to make Perl better. +Thank you! -=item Does it preclude other desirable features? +=back -A patch is likely to be rejected if it closes off future avenues of -development. For instance, a patch that placed a true and final -interpretation on prototypes is likely to be rejected because there -are still options for the future of prototypes that haven't been -addressed. +=head1 BUG REPORTING -=item Is the implementation robust? +If you want to report a bug in Perl, you must use the F<perlbug> +command line tool. This tool will ensure that your bug report includes +all the relevant system and configuration information. -Good patches (tight code, complete, correct) stand more chance of -going in. Sloppy or incorrect patches might be placed on the back -burner until the pumpking has time to fix, or might be discarded -altogether without further notice. +To browse existing Perl bugs and patches, you can use the web interface +at L<http://rt.perl.org/>. -=item Is the implementation generic enough to be portable? +Please check the archive of the perl5-porters list (see below) and/or +the bug tracking system before submitting a bug report. Often, you'll +find that the bug has been reported already. 
-The worst patches make use of a system-specific features. It's highly -unlikely that non-portable additions to the Perl language will be -accepted. +You can log in to the bug tracking system and comment on existing bug +reports. If you have additional information regarding an existing bug, +please add it. This will help the porters fix the bug. -=item Is the implementation tested? +=head1 PERL 5 PORTERS -Patches which change behaviour (fixing bugs or introducing new features) -must include regression tests to verify that everything works as expected. -Without tests provided by the original author, how can anyone else changing -perl in the future be sure that they haven't unwittingly broken the behaviour -the patch implements? And without tests, how can the patch's author be -confident that his/her hard work put into the patch won't be accidentally -thrown away by someone in the future? +The perl5-porters (p5p) mailing list is where the Perl standard +distribution is maintained and developed. The people who maintain Perl +are also referred to as the "Perl 5 Porters", "p5p" or just the +"porters". -=item Is there enough documentation? +A searchable archive of the list is available at +L<http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/>. There is +also another archive at +L<http://archive.develooper.com/perl5-porters@perl.org/>. -Patches without documentation are probably ill-thought out or -incomplete. Nothing can be added without documentation, so submitting -a patch for the appropriate manpages as well as the source code is -always a good idea. +=head2 perl-changes mailing list -=item Is there another way to do it? +The perl5-changes mailing list receives a copy of each patch that gets +submitted to the maintenance and development branches of the perl +repository. See L<http://lists.perl.org/list/perl5-changes.html> for +subscription and archive information. 
-Larry said "Although the Perl Slogan is I<There's More Than One Way -to Do It>, I hesitate to make 10 ways to do something". This is a -tricky heuristic to navigate, though--one man's essential addition is -another man's pointless cruft. +=head2 #p5p on IRC -=item Does it create too much work? +Many porters are also active on the L<irc://irc.perl.org/#p5p> channel. +Feel free to join the channel and ask questions about hacking on the +Perl core. -Work for the pumpking, work for Perl programmers, work for module -authors, ... Perl is supposed to be easy. +=head1 GETTING THE PERL SOURCE -=item Patches speak louder than words +All of Perl's source code is kept centrally in a Git repository at +I<perl5.git.perl.org>. The repository contains many Perl revisions from +Perl 1 onwards and all the revisions from Perforce, the previous +version control system. -Working code is always preferred to pie-in-the-sky ideas. A patch to -add a feature stands a much higher chance of making it to the language -than does a random feature request, no matter how fervently argued the -request might be. This ties into "Will it be useful?", as the fact -that someone took the time to make the patch demonstrates a strong -desire for the feature. +For much more detail on using git with the Perl repository, please see +L<perlgit>. -=back +=head2 Read access via Git -If you're on the list, you might hear the word "core" bandied -around. It refers to the standard distribution. "Hacking on the -core" means you're changing the C source code to the Perl -interpreter. "A core module" is one that ships with Perl. +You will need a copy of Git for your computer. You can fetch a copy of +the repository using the git protocol: -=head2 Keeping in sync + % git clone git://perl5.git.perl.org/perl.git perl -The source code to the Perl interpreter, in its different versions, is -kept in a repository managed by the git revision control system. 
The -pumpkings and a few others have write access to the repository to check in -changes. +This clones the repository and makes a local copy in the F<perl> +directory. -How to clone and use the git perl repository is described in L<perlrepository>. +If you cannot use the git protocol for firewall reasons, you can also +clone via http, though this is much slower: -You can also choose to use rsync to get a copy of the current source tree -for the bleadperl branch and all maintenance branches: + % git clone http://perl5.git.perl.org/perl.git perl - $ rsync -avz rsync://perl5.git.perl.org/perl-current . - $ rsync -avz rsync://perl5.git.perl.org/perl-5.12.x . - $ rsync -avz rsync://perl5.git.perl.org/perl-5.10.x . - $ rsync -avz rsync://perl5.git.perl.org/perl-5.8.x . - $ rsync -avz rsync://perl5.git.perl.org/perl-5.6.x . - $ rsync -avz rsync://perl5.git.perl.org/perl-5.005xx . +=head2 Read access via the web -(Add the C<--delete> option to remove leftover files) +You may access the repository over the web. This allows you to browse +the tree, see recent commits, subscribe to RSS feeds for the changes, +search for particular commits and more. You may access it at +L<http://perl5.git.perl.org/perl.git>. A mirror of the repository is +found at L<http://github.com/mirrors/perl>. -To get a full list of the available sync points: +=head2 Read access via rsync - $ rsync perl5.git.perl.org:: +You can also choose to use rsync to get a copy of the current source +tree for the bleadperl branch and all maintenance branches: -You may also want to subscribe to the perl5-changes mailing list to -receive a copy of each patch that gets submitted to the maintenance -and development "branches" of the perl repository. See -http://lists.perl.org/ for subscription information. + % rsync -avz rsync://perl5.git.perl.org/perl-current . + % rsync -avz rsync://perl5.git.perl.org/perl-5.12.x . + % rsync -avz rsync://perl5.git.perl.org/perl-5.10.x . 
+ % rsync -avz rsync://perl5.git.perl.org/perl-5.8.x . + % rsync -avz rsync://perl5.git.perl.org/perl-5.6.x . + % rsync -avz rsync://perl5.git.perl.org/perl-5.005xx . -If you are a member of the perl5-porters mailing list, it is a good -thing to keep in touch with the most recent changes. If not only to -verify if what you would have posted as a bug report isn't already -solved in the most recent available perl development branch, also -known as perl-current, bleading edge perl, bleedperl or bleadperl. +(Add the C<--delete> option to remove leftover files.) -Needless to say, the source code in perl-current is usually in a perpetual -state of evolution. You should expect it to be very buggy. Do B<not> use -it for any purpose other than testing and development. +To get a full list of the available sync points: -=head2 Perlbug administration + % rsync perl5.git.perl.org:: -There is a single remote administrative interface for modifying bug status, -category, open issues etc. using the B<RT> bugtracker system, maintained -by Robert Spier. Become an administrator, and close any bugs you can get -your sticky mitts on: +=head2 Write access via git - http://bugs.perl.org/ +If you have a commit bit, please see L<perlgit> for more details on +using git. -To email the bug system administrators: +=head1 PATCHING PERL - "perlbug-admin" <perlbug-admin@perl.org> +If you're planning to do more extensive work than a single small fix, +we encourage you to read the documentation below. This will help you +focus your work and make your patches easier to incorporate into the +Perl source. =head2 Submitting patches -Always submit patches to I<perl5-porters@perl.org>. If you're -patching a core module and there's an author listed, send the author a -copy (see L<Patching a core module>). This lets other porters review -your patch, which catches a surprising number of errors in patches. -Please patch against the latest B<development> version. 
(e.g., even if -you're fixing a bug in the 5.8 track, patch against the C<blead> branch in -the git repository.) - -If changes are accepted, they are applied to the development branch. Then -the maintenance pumpking decides which of those patches is to be -backported to the maint branch. Only patches that survive the heat of the -development branch get applied to maintenance versions. - -Your patch should update the documentation and test suite. See -L<Writing a test>. If you have added or removed files in the distribution, -edit the MANIFEST file accordingly, sort the MANIFEST file using -C<make manisort>, and include those changes as part of your patch. - -Patching documentation also follows the same order: if accepted, a patch -is first applied to B<development>, and if relevant then it's backported -to B<maintenance>. (With an exception for some patches that document -behaviour that only appears in the maintenance branch, but which has -changed in the development version.) - -To report a bug in Perl, use the program I<perlbug> which comes with -Perl (if you can't get Perl to work, send mail to the address -I<perlbug@perl.org> or I<perlbug@perl.com>). Reporting bugs through -I<perlbug> feeds into the automated bug-tracking system, access to -which is provided through the web at http://rt.perl.org/rt3/ . It -often pays to check the archives of the perl5-porters mailing list to -see whether the bug you're reporting has been reported before, and if -so whether it was considered a bug. See above for the location of -the searchable archives. - -The CPAN testers ( http://testers.cpan.org/ ) are a group of -volunteers who test CPAN modules on a variety of platforms. Perl -Smokers ( http://www.nntp.perl.org/group/perl.daily-build and -http://www.nntp.perl.org/group/perl.daily-build.reports/ ) -automatically test Perl source releases on platforms with various -configurations. Both efforts welcome volunteers. 
In order to get -involved in smoke testing of the perl itself visit -L<http://search.cpan.org/dist/Test-Smoke>. In order to start smoke -testing CPAN modules visit L<http://search.cpan.org/dist/CPANPLUS-YACSmoke/> -or L<http://search.cpan.org/dist/minismokebox/> or -L<http://search.cpan.org/dist/CPAN-Reporter/>. +If you have a small patch to submit, please submit it via perlbug. You +can also send email directly to perlbug@perl.org. Please note that +messages sent to perlbug may be held in a moderation queue, so you +won't receive a response immediately. -It's a good idea to read and lurk for a while before chipping in. -That way you'll get to see the dynamic of the conversations, learn the -personalities of the players, and hopefully be better prepared to make -a useful contribution when do you speak up. +You'll know your submission has been processed when you receive an +email from our ticket tracking system. This email will give you a +ticket number. Once your patch has made it to the ticket tracking +system, it will also be sent to the perl5-porters@perl.org list. -If after all this you still think you want to join the perl5-porters -mailing list, send mail to I<perl5-porters-subscribe@perl.org>. To -unsubscribe, send mail to I<perl5-porters-unsubscribe@perl.org>. +Patches are reviewed and discussed on the p5p list. Simple, +uncontroversial patches will usually be applied without any discussion. +When the patch is applied, the ticket will be updated and you will +receive email. In addition, an email will be sent to the p5p list. -To hack on the Perl guts, you'll need to read the following things: +In other cases, the patch will need more work or discussion. That will +happen on the p5p list. -=over 3 +You are encouraged to participate in the discussion and advocate for +your patch. Sometimes your patch may get lost in the shuffle. It's +appropriate to send a reminder email to p5p if no action has been taken +in a month. 
Please remember that the Perl 5 developers are all +volunteers, and be polite. -=item L<perlguts> +Changes are always applied directly to the main development branch, +called "blead". Some patches may be backported to a maintenance branch. +If you think your patch is appropriate for the maintenance branch, +please explain why when you submit it. -This is of paramount importance, since it's the documentation of what -goes where in the Perl source. Read it over a couple of times and it -might start to make sense - don't worry if it doesn't yet, because the -best way to study it is to read it in conjunction with poking at Perl -source, and we'll do that later on. - -Gisle Aas's "illustrated perlguts", also known as I<illguts>, has very -helpful pictures: - -L<http://search.cpan.org/dist/illguts/> - -=item L<perlxstut> and L<perlxs> - -A working knowledge of XSUB programming is incredibly useful for core -hacking; XSUBs use techniques drawn from the PP code, the portion of the -guts that actually executes a Perl program. It's a lot gentler to learn -those techniques from simple examples and explanation than from the core -itself. - -=item L<perlapi> - -The documentation for the Perl API explains what some of the internal -functions do, as well as the many macros used in the source. - -=item F<Porting/pumpkin.pod> - -This is a collection of words of wisdom for a Perl porter; some of it is -only useful to the pumpkin holder, but most of it applies to anyone -wanting to go about Perl development. - -=item The perl5-porters FAQ +=head2 Getting your patch accepted -This should be available from http://dev.perl.org/perl5/docs/p5p-faq.html . -It contains hints on reading perl5-porters, information on how -perl5-porters works and how Perl development in general works. +If you are submitting a code patch there are several things that you +can do to help the Perl 5 Porters accept your patch. 
-=back +=head3 Patch style -=head2 Finding Your Way Around +If you used git to check out the Perl source, then using C<git +format-patch> will produce a patch in a style suitable for Perl. The +C<format-patch> command produces one patch file for each commit you +made. If you prefer to send a single patch for all commits, you can use +C<git diff>. -Perl maintenance can be split into a number of areas, and certain people -(pumpkins) will have responsibility for each area. These areas sometimes -correspond to files or directories in the source kit. Among the areas are: + % git checkout blead + % git pull + % git diff blead my-branch-name -=over 3 +This produces a patch based on the difference between blead and your +current branch. It's important to make sure that blead is up to date +before producing the diff, that's why we call C<git pull> first. -=item Core modules +We strongly recommend that you use git if possible. It will make your +life easier, and ours as well. -Modules shipped as part of the Perl core live in various subdirectories, where -two are dedicated to core-only modules, and two are for the dual-life modules -which live on CPAN and may be maintained separately with respect to the Perl -core: +However, if you're not using git, you can still produce a suitable +patch. You'll need a pristine copy of the Perl source to diff against. +The porters prefer unified diffs. Using GNU C<diff>, you can produce a +diff like this: - lib/ is for pure-Perl modules, which exist in the core only. + % diff -Npurd perl.pristine perl.mine - ext/ is for XS extensions, and modules with special Makefile.PL requirements, which exist in the core only. +Make sure that you C<make realclean> in your copy of Perl to remove any +build artifacts, or you may get a confusing result. - cpan/ is for dual-life modules, where the CPAN module is canonical (should be patched first). +=head3 Commit message - dist/ is for dual-life modules, where the blead source is canonical. 
+As you craft each patch you intend to submit to the Perl core, it's +important to write a good commit message. This is especially important +if your submission will consist of a series of commits. -For some dual-life modules it has not been discussed if the CPAN version or the -blead source is canonical. Until that is done, those modules should be in -F<cpan/>. +The first line of the commit message should be a short description +without a period. It should be no longer than the subject line of an +email, 50 characters being a good rule of thumb. -=item Tests +A lot of Git tools (Gitweb, GitHub, git log --pretty=oneline, ...) will +only display the first line (cut off at 50 characters) when presenting +commit summaries. -There are tests for nearly all the modules, built-ins and major bits -of functionality. Test files all have a .t suffix. Module tests live -in the F<lib/> and F<ext/> directories next to the module being -tested. Others live in F<t/>. See L<Writing a test> +The commit message should include a description of the problem that the +patch corrects or new functionality that the patch adds. -=item Documentation +As a general rule of thumb, your commit message should help a +programmer who knows the Perl core quickly understand what you were +trying to do, how you were trying to do it, and why the change matters +to Perl. -Documentation maintenance includes looking after everything in the -F<pod/> directory, (as well as contributing new documentation) and -the documentation to the modules in core. +=over 4 -=item Configure +=item * Why -The Configure process is the way we make Perl portable across the -myriad of operating systems it supports. Responsibility for the -Configure, build and installation process, as well as the overall -portability of the core code rests with the Configure pumpkin - -others help out with individual operating systems. +Your commit message should describe why the change you are making is +important. 
When someone looks at your change in six months or six +years, your intent should be clear. -The three files that fall under his/her responsibility are Configure, -config_h.SH, and Porting/Glossary (and a whole bunch of small related -files that are less important here). The Configure pumpkin decides how -patches to these are dealt with. Currently, the Configure pumpkin will -accept patches in most common formats, even directly to these files. -Other committers are allowed to commit to these files under the strict -condition that they will inform the Configure pumpkin, either on IRC -(if he/she happens to be around) or through (personal) e-mail. +If you're deprecating a feature with the intent of later simplifying +another bit of code, say so. If you're fixing a performance problem or +adding a new feature to support some other bit of the core, mention +that. -The files involved are the operating system directories, (F<win32/>, -F<os2/>, F<vms/> and so on) the shell scripts which generate F<config.h> -and F<Makefile>, as well as the metaconfig files which generate -F<Configure>. (metaconfig isn't included in the core distribution.) +=item * What -See http://perl5.git.perl.org/metaconfig.git/blob/HEAD:/README for a -description of the full process involved. +Your commit message should describe what part of the Perl core you're +changing and what you expect your patch to do. -=item Interpreter +=item * How -And of course, there's the core of the Perl interpreter itself. Let's -have a look at that in a little more detail. +While it's not necessary for documentation changes, new tests or +trivial patches, it's often worth explaining how your change works. +Even if it's clear to you today, it may not be clear to a porter next +month or next year. =back -Before we leave looking at the layout, though, don't forget that -F<MANIFEST> contains not only the file names in the Perl distribution, -but short descriptions of what's in them, too. 
For an overview of the -important files, try this: - - perl -lne 'print if /^[^\/]+\.[ch]\s+/' MANIFEST +A commit message isn't intended to take the place of comments in your +code. Commit messages should describe the change you made, while code +comments should describe the current state of the code. -=head2 Elements of the interpreter +If you've just implemented a new feature, complete with doc, tests and +well-commented code, a brief commit message will often suffice. If, +however, you've just changed a single character deep in the parser or +lexer, you might need to write a small novel to ensure that future +readers understand what you did and why you did it. -The work of the interpreter has two main stages: compiling the code -into the internal representation, or bytecode, and then executing it. -L<perlguts/Compiled code> explains exactly how the compilation stage -happens. +=head3 Comments, Comments, Comments -Here is a short breakdown of perl's operation: - -=over 3 +Be sure to adequately comment your code. While commenting every line is +unnecessary, anything that takes advantage of side effects of +operators, that creates changes that will be felt outside of the +function being patched, or that others may find confusing should be +documented. If you are going to err, it is better to err on the side of +adding too many comments than too few. -=item Startup +The best comments explain I<why> the code does what it does, not I<what +it does>. -The action begins in F<perlmain.c>. (or F<miniperlmain.c> for miniperl) -This is very high-level code, enough to fit on a single screen, and it -resembles the code found in L<perlembed>; most of the real action takes -place in F<perl.c> +=head3 Style -F<perlmain.c> is generated by L<writemain> from F<miniperlmain.c> at -make time, so you should make perl to follow this along. +In general, please follow the particular style of the code you are +patching. 
-First, F<perlmain.c> allocates some memory and constructs a Perl -interpreter, along these lines: - - 1 PERL_SYS_INIT3(&argc,&argv,&env); - 2 - 3 if (!PL_do_undump) { - 4 my_perl = perl_alloc(); - 5 if (!my_perl) - 6 exit(1); - 7 perl_construct(my_perl); - 8 PL_perl_destruct_level = 0; - 9 } - -Line 1 is a macro, and its definition is dependent on your operating -system. Line 3 references C<PL_do_undump>, a global variable - all -global variables in Perl start with C<PL_>. This tells you whether the -current running program was created with the C<-u> flag to perl and then -F<undump>, which means it's going to be false in any sane context. - -Line 4 calls a function in F<perl.c> to allocate memory for a Perl -interpreter. It's quite a simple function, and the guts of it looks like -this: - - my_perl = (PerlInterpreter*)PerlMem_malloc(sizeof(PerlInterpreter)); - -Here you see an example of Perl's system abstraction, which we'll see -later: C<PerlMem_malloc> is either your system's C<malloc>, or Perl's -own C<malloc> as defined in F<malloc.c> if you selected that option at -configure time. - -Next, in line 7, we construct the interpreter using perl_construct, -also in F<perl.c>; this sets up all the special variables that Perl -needs, the stacks, and so on. - -Now we pass Perl the command line options, and tell it to go: - - exitstatus = perl_parse(my_perl, xs_init, argc, argv, (char **)NULL); - if (!exitstatus) - perl_run(my_perl); - - exitstatus = perl_destruct(my_perl); - - perl_free(my_perl); - -C<perl_parse> is actually a wrapper around C<S_parse_body>, as defined -in F<perl.c>, which processes the command line options, sets up any -statically linked XS modules, opens the program and calls C<yyparse> to -parse it. - -=item Parsing - -The aim of this stage is to take the Perl source, and turn it into an op -tree. We'll see what one of those looks like later. Strictly speaking, -there's three things going on here. 
- -C<yyparse>, the parser, lives in F<perly.c>, although you're better off -reading the original YACC input in F<perly.y>. (Yes, Virginia, there -B<is> a YACC grammar for Perl!) The job of the parser is to take your -code and "understand" it, splitting it into sentences, deciding which -operands go with which operators and so on. - -The parser is nobly assisted by the lexer, which chunks up your input -into tokens, and decides what type of thing each token is: a variable -name, an operator, a bareword, a subroutine, a core function, and so on. -The main point of entry to the lexer is C<yylex>, and that and its -associated routines can be found in F<toke.c>. Perl isn't much like -other computer languages; it's highly context sensitive at times, it can -be tricky to work out what sort of token something is, or where a token -ends. As such, there's a lot of interplay between the tokeniser and the -parser, which can get pretty frightening if you're not used to it. - -As the parser understands a Perl program, it builds up a tree of -operations for the interpreter to perform during execution. The routines -which construct and link together the various operations are to be found -in F<op.c>, and will be examined later. - -=item Optimization - -Now the parsing stage is complete, and the finished tree represents -the operations that the Perl interpreter needs to perform to execute our -program. Next, Perl does a dry run over the tree looking for -optimisations: constant expressions such as C<3 + 4> will be computed -now, and the optimizer will also see if any multiple operations can be -replaced with a single one. For instance, to fetch the variable C<$foo>, -instead of grabbing the glob C<*foo> and looking at the scalar -component, the optimizer fiddles the op tree to use a function which -directly looks up the scalar in question. The main optimizer is C<peep> -in F<op.c>, and many ops have their own optimizing functions. 
- -=item Running - -Now we're finally ready to go: we have compiled Perl byte code, and all -that's left to do is run it. The actual execution is done by the -C<runops_standard> function in F<run.c>; more specifically, it's done by -these three innocent looking lines: - - while ((PL_op = CALL_FPTR(PL_op->op_ppaddr)(aTHX))) { - PERL_ASYNC_CHECK(); - } - -You may be more comfortable with the Perl version of that: - - PERL_ASYNC_CHECK() while $Perl::op = &{$Perl::op->{function}}; - -Well, maybe not. Anyway, each op contains a function pointer, which -stipulates the function which will actually carry out the operation. -This function will return the next op in the sequence - this allows for -things like C<if> which choose the next op dynamically at run time. -The C<PERL_ASYNC_CHECK> makes sure that things like signals interrupt -execution if required. - -The actual functions called are known as PP code, and they're spread -between four files: F<pp_hot.c> contains the "hot" code, which is most -often used and highly optimized, F<pp_sys.c> contains all the -system-specific functions, F<pp_ctl.c> contains the functions which -implement control structures (C<if>, C<while> and the like) and F<pp.c> -contains everything else. These are, if you like, the C code for Perl's -built-in functions and operators. - -Note that each C<pp_> function is expected to return a pointer to the next -op. Calls to perl subs (and eval blocks) are handled within the same -runops loop, and do not consume extra space on the C stack. For example, -C<pp_entersub> and C<pp_entertry> just push a C<CxSUB> or C<CxEVAL> block -struct onto the context stack which contain the address of the op -following the sub call or eval. They then return the first op of that sub -or eval block, and so execution continues of that sub or block. Later, a -C<pp_leavesub> or C<pp_leavetry> op pops the C<CxSUB> or C<CxEVAL>, -retrieves the return op from it, and returns it. 
- -=item Exception handling - -Perl's exception handling (i.e. C<die> etc.) is built on top of the low-level -C<setjmp()>/C<longjmp()> C-library functions. These basically provide a -way to capture the current PC and SP registers and later restore them; i.e. -a C<longjmp()> continues at the point in code where a previous C<setjmp()> -was done, with anything further up on the C stack being lost. This is why -code should always save values using C<SAVE_FOO> rather than in auto -variables. - -The perl core wraps C<setjmp()> etc in the macros C<JMPENV_PUSH> and -C<JMPENV_JUMP>. The basic rule of perl exceptions is that C<exit>, and -C<die> (in the absence of C<eval>) perform a C<JMPENV_JUMP(2)>, while -C<die> within C<eval> does a C<JMPENV_JUMP(3)>. - -At entry points to perl, such as C<perl_parse()>, C<perl_run()> and -C<call_sv(cv, G_EVAL)> each does a C<JMPENV_PUSH>, then enter a runops -loop or whatever, and handle possible exception returns. For a 2 return, -final cleanup is performed, such as popping stacks and calling C<CHECK> or -C<END> blocks. Amongst other things, this is how scope cleanup still -occurs during an C<exit>. - -If a C<die> can find a C<CxEVAL> block on the context stack, then the -stack is popped to that level and the return op in that block is assigned -to C<PL_restartop>; then a C<JMPENV_JUMP(3)> is performed. This normally -passes control back to the guard. In the case of C<perl_run> and -C<call_sv>, a non-null C<PL_restartop> triggers re-entry to the runops -loop. This is the normal way that C<die> or C<croak> is handled within an -C<eval>. - -Sometimes ops are executed within an inner runops loop, such as tie, sort -or overload code. In this case, something like - - sub FETCH { eval { die } } - -would cause a longjmp right back to the guard in C<perl_run>, popping both -runops loops, which is clearly incorrect.
One way to avoid this is for the -tie code to do a C<JMPENV_PUSH> before executing C<FETCH> in the inner -runops loop, but for efficiency reasons, perl in fact just sets a flag, -using C<CATCH_SET(TRUE)>. The C<pp_require>, C<pp_entereval> and -C<pp_entertry> ops check this flag, and if true, they call C<docatch>, -which does a C<JMPENV_PUSH> and starts a new runops level to execute the -code, rather than doing it on the current loop. - -As a further optimisation, on exit from the eval block in the C<FETCH>, -execution of the code following the block is still carried on in the inner -loop. When an exception is raised, C<docatch> compares the C<JMPENV> -level of the C<CxEVAL> with C<PL_top_env> and if they differ, just -re-throws the exception. In this way any inner loops get popped. - -Here's an example. - - 1: eval { tie @a, 'A' }; - 2: sub A::TIEARRAY { - 3: eval { die }; - 4: die; - 5: } - -To run this code, C<perl_run> is called, which does a C<JMPENV_PUSH> then -enters a runops loop. This loop executes the eval and tie ops on line 1, -with the eval pushing a C<CxEVAL> onto the context stack. - -The C<pp_tie> does a C<CATCH_SET(TRUE)>, then starts a second runops loop -to execute the body of C<TIEARRAY>. When it executes the entertry op on -line 3, C<CATCH_GET> is true, so C<pp_entertry> calls C<docatch> which -does a C<JMPENV_PUSH> and starts a third runops loop, which then executes -the die op. 
At this point the C call stack looks like this: - - Perl_pp_die - Perl_runops # third loop - S_docatch_body - S_docatch - Perl_pp_entertry - Perl_runops # second loop - S_call_body - Perl_call_sv - Perl_pp_tie - Perl_runops # first loop - S_run_body - perl_run - main - -and the context and data stacks, as shown by C<-Dstv>, look like: - - STACK 0: MAIN - CX 0: BLOCK => - CX 1: EVAL => AV() PV("A"\0) - retop=leave - STACK 1: MAGIC - CX 0: SUB => - retop=(null) - CX 1: EVAL => * - retop=nextstate - -The die pops the first C<CxEVAL> off the context stack, sets -C<PL_restartop> from it, does a C<JMPENV_JUMP(3)>, and control returns to -the top C<docatch>. This then starts another third-level runops level, -which executes the nextstate, pushmark and die ops on line 4. At the point -that the second C<pp_die> is called, the C call stack looks exactly like -that above, even though we are no longer within an inner eval; this is -because of the optimization mentioned earlier. However, the context stack -now looks like this, ie with the top CxEVAL popped: - - STACK 0: MAIN - CX 0: BLOCK => - CX 1: EVAL => AV() PV("A"\0) - retop=leave - STACK 1: MAGIC - CX 0: SUB => - retop=(null) - -The die on line 4 pops the context stack back down to the CxEVAL, leaving -it as: - - STACK 0: MAIN - CX 0: BLOCK => - -As usual, C<PL_restartop> is extracted from the C<CxEVAL>, and a -C<JMPENV_JUMP(3)> done, which pops the C stack back to the docatch: - - S_docatch - Perl_pp_entertry - Perl_runops # second loop - S_call_body - Perl_call_sv - Perl_pp_tie - Perl_runops # first loop - S_run_body - perl_run - main - -In this case, because the C<JMPENV> level recorded in the C<CxEVAL> -differs from the current one, C<docatch> just does a C<JMPENV_JUMP(3)> -and the C stack unwinds to: - - perl_run - main - -Because C<PL_restartop> is non-null, C<run_body> starts a new runops loop -and execution continues. 
- -=back +In particular, follow these general guidelines for patching Perl +sources: -=head2 Internal Variable Types - -You should by now have had a look at L<perlguts>, which tells you about -Perl's internal variable types: SVs, HVs, AVs and the rest. If not, do -that now. - -These variables are used not only to represent Perl-space variables, but -also any constants in the code, as well as some structures completely -internal to Perl. The symbol table, for instance, is an ordinary Perl -hash. Your code is represented by an SV as it's read into the parser; -any program files you call are opened via ordinary Perl filehandles, and -so on. - -The core L<Devel::Peek|Devel::Peek> module lets us examine SVs from a -Perl program. Let's see, for instance, how Perl treats the constant -C<"hello">. - - % perl -MDevel::Peek -e 'Dump("hello")' - 1 SV = PV(0xa041450) at 0xa04ecbc - 2 REFCNT = 1 - 3 FLAGS = (POK,READONLY,pPOK) - 4 PV = 0xa0484e0 "hello"\0 - 5 CUR = 5 - 6 LEN = 6 - -Reading C<Devel::Peek> output takes a bit of practise, so let's go -through it line by line. - -Line 1 tells us we're looking at an SV which lives at C<0xa04ecbc> in -memory. SVs themselves are very simple structures, but they contain a -pointer to a more complex structure. In this case, it's a PV, a -structure which holds a string value, at location C<0xa041450>. Line 2 -is the reference count; there are no other references to this data, so -it's 1. - -Line 3 are the flags for this SV - it's OK to use it as a PV, it's a -read-only SV (because it's a constant) and the data is a PV internally. -Next we've got the contents of the string, starting at location -C<0xa0484e0>. - -Line 5 gives us the current length of the string - note that this does -B<not> include the null terminator. Line 6 is not the length of the -string, but the length of the currently allocated buffer; as the string -grows, Perl automatically extends the available storage via a routine -called C<SvGROW>. 
- -You can get at any of these quantities from C very easily; just add -C<Sv> to the name of the field shown in the snippet, and you've got a -macro which will return the value: C<SvCUR(sv)> returns the current -length of the string, C<SvREFCNT(sv)> returns the reference count, -C<SvPV(sv, len)> returns the string itself with its length, and so on. -More macros to manipulate these properties can be found in L<perlguts>. - -Let's take an example of manipulating a PV, from C<sv_catpvn>, in F<sv.c> - - 1 void - 2 Perl_sv_catpvn(pTHX_ register SV *sv, register const char *ptr, register STRLEN len) - 3 { - 4 STRLEN tlen; - 5 char *junk; - - 6 junk = SvPV_force(sv, tlen); - 7 SvGROW(sv, tlen + len + 1); - 8 if (ptr == junk) - 9 ptr = SvPVX(sv); - 10 Move(ptr,SvPVX(sv)+tlen,len,char); - 11 SvCUR(sv) += len; - 12 *SvEND(sv) = '\0'; - 13 (void)SvPOK_only_UTF8(sv); /* validate pointer */ - 14 SvTAINT(sv); - 15 } - -This is a function which adds a string, C<ptr>, of length C<len> onto -the end of the PV stored in C<sv>. The first thing we do in line 6 is -make sure that the SV B<has> a valid PV, by calling the C<SvPV_force> -macro to force a PV. As a side effect, C<tlen> gets set to the current -length of the PV, and the PV itself is returned to C<junk>. - -In line 7, we make sure that the SV will have enough room to accommodate -the old string, the new string and the null terminator. If C<LEN> isn't -big enough, C<SvGROW> will reallocate space for us. - -Now, if C<junk> is the same as the string we're trying to add, we can -grab the string directly from the SV; C<SvPVX> is the address of the PV -in the SV. - -Line 10 does the actual catenation: the C<Move> macro moves a chunk of -memory around: we move the string C<ptr> to the end of the PV - that's -the start of the PV plus its current length. We're moving C<len> bytes -of type C<char>. After doing so, we need to tell Perl we've extended the -string, by altering C<CUR> to reflect the new length.
C<SvEND> is a -macro which gives us the end of the string, so that needs to be a -C<"\0">. - -Line 13 manipulates the flags; since we've changed the PV, any IV or NV -values will no longer be valid: if we have C<$a=10; $a.="6";> we don't -want to use the old IV of 10. C<SvPOK_only_utf8> is a special UTF-8-aware -version of C<SvPOK_only>, a macro which turns off the IOK and NOK flags -and turns on POK. The final C<SvTAINT> is a macro which launders tainted -data if taint mode is turned on. - -AVs and HVs are more complicated, but SVs are by far the most common -variable type being thrown around. Having seen something of how we -manipulate these, let's go on and look at how the op tree is -constructed. - -=head2 Op Trees - -First, what is the op tree, anyway? The op tree is the parsed -representation of your program, as we saw in our section on parsing, and -it's the sequence of operations that Perl goes through to execute your -program, as we saw in L</Running>. - -An op is a fundamental operation that Perl can perform: all the built-in -functions and operators are ops, and there are a series of ops which -deal with concepts the interpreter needs internally - entering and -leaving a block, ending a statement, fetching a variable, and so on. - -The op tree is connected in two ways: you can imagine that there are two -"routes" through it, two orders in which you can traverse the tree. -First, parse order reflects how the parser understood the code, and -secondly, execution order tells perl what order to perform the -operations in. - -The easiest way to examine the op tree is to stop Perl after it has -finished parsing, and get it to dump out the tree. This is exactly what -the compiler backends L<B::Terse|B::Terse>, L<B::Concise|B::Concise> -and L<B::Debug|B::Debug> do. 
- -Let's have a look at how Perl sees C<$a = $b + $c>: - - % perl -MO=Terse -e '$a=$b+$c' - 1 LISTOP (0x8179888) leave - 2 OP (0x81798b0) enter - 3 COP (0x8179850) nextstate - 4 BINOP (0x8179828) sassign - 5 BINOP (0x8179800) add [1] - 6 UNOP (0x81796e0) null [15] - 7 SVOP (0x80fafe0) gvsv GV (0x80fa4cc) *b - 8 UNOP (0x81797e0) null [15] - 9 SVOP (0x8179700) gvsv GV (0x80efeb0) *c - 10 UNOP (0x816b4f0) null [15] - 11 SVOP (0x816dcf0) gvsv GV (0x80fa460) *a - -Let's start in the middle, at line 4. This is a BINOP, a binary -operator, which is at location C<0x8179828>. The specific operator in -question is C<sassign> - scalar assignment - and you can find the code -which implements it in the function C<pp_sassign> in F<pp_hot.c>. As a -binary operator, it has two children: the add operator, providing the -result of C<$b+$c>, is uppermost on line 5, and the left hand side is on -line 10. - -Line 10 is the null op: this does exactly nothing. What is that doing -there? If you see the null op, it's a sign that something has been -optimized away after parsing. As we mentioned in L</Optimization>, -the optimization stage sometimes converts two operations into one, for -example when fetching a scalar variable. When this happens, instead of -rewriting the op tree and cleaning up the dangling pointers, it's easier -just to replace the redundant operation with the null op. Originally, -the tree would have looked like this: - - 10 SVOP (0x816b4f0) rv2sv [15] - 11 SVOP (0x816dcf0) gv GV (0x80fa460) *a - -That is, fetch the C<a> entry from the main symbol table, and then look -at the scalar component of it: C<gvsv> (C<pp_gvsv> into F<pp_hot.c>) -happens to do both these things. - -The right hand side, starting at line 5 is similar to what we've just -seen: we have the C<add> op (C<pp_add> also in F<pp_hot.c>) add together -two C<gvsv>s. - -Now, what's this about? 
- - 1 LISTOP (0x8179888) leave - 2 OP (0x81798b0) enter - 3 COP (0x8179850) nextstate - -C<enter> and C<leave> are scoping ops, and their job is to perform any -housekeeping every time you enter and leave a block: lexical variables -are tidied up, unreferenced variables are destroyed, and so on. Every -program will have those first three lines: C<leave> is a list, and its -children are all the statements in the block. Statements are delimited -by C<nextstate>, so a block is a collection of C<nextstate> ops, with -the ops to be performed for each statement being the children of -C<nextstate>. C<enter> is a single op which functions as a marker. - -That's how Perl parsed the program, from top to bottom: - - Program - | - Statement - | - = - / \ - / \ - $a + - / \ - $b $c - -However, it's impossible to B<perform> the operations in this order: -you have to find the values of C<$b> and C<$c> before you add them -together, for instance. So, the other thread that runs through the op -tree is the execution order: each op has a field C<op_next> which points -to the next op to be run, so following these pointers tells us how perl -executes the code. We can traverse the tree in this order using -the C<exec> option to C<B::Terse>: - - % perl -MO=Terse,exec -e '$a=$b+$c' - 1 OP (0x8179928) enter - 2 COP (0x81798c8) nextstate - 3 SVOP (0x81796c8) gvsv GV (0x80fa4d4) *b - 4 SVOP (0x8179798) gvsv GV (0x80efeb0) *c - 5 BINOP (0x8179878) add [1] - 6 SVOP (0x816dd38) gvsv GV (0x80fa468) *a - 7 BINOP (0x81798a0) sassign - 8 LISTOP (0x8179900) leave - -This probably makes more sense for a human: enter a block, start a -statement. Get the values of C<$b> and C<$c>, and add them together. -Find C<$a>, and assign one to the other. Then leave. - -The way Perl builds up these op trees in the parsing process can be -unravelled by examining F<perly.y>, the YACC grammar. 
Let's take the -piece we need to construct the tree for C<$a = $b + $c> - - 1 term : term ASSIGNOP term - 2 { $$ = newASSIGNOP(OPf_STACKED, $1, $2, $3); } - 3 | term ADDOP term - 4 { $$ = newBINOP($2, 0, scalar($1), scalar($3)); } - -If you're not used to reading BNF grammars, this is how it works: You're -fed certain things by the tokeniser, which generally end up in upper -case. Here, C<ADDOP>, is provided when the tokeniser sees C<+> in your -code. C<ASSIGNOP> is provided when C<=> is used for assigning. These are -"terminal symbols", because you can't get any simpler than them. - -The grammar, lines one and three of the snippet above, tells you how to -build up more complex forms. These complex forms, "non-terminal symbols" -are generally placed in lower case. C<term> here is a non-terminal -symbol, representing a single expression. - -The grammar gives you the following rule: you can make the thing on the -left of the colon if you see all the things on the right in sequence. -This is called a "reduction", and the aim of parsing is to completely -reduce the input. There are several different ways you can perform a -reduction, separated by vertical bars: so, C<term> followed by C<=> -followed by C<term> makes a C<term>, and C<term> followed by C<+> -followed by C<term> can also make a C<term>. - -So, if you see two terms with an C<=> or C<+>, between them, you can -turn them into a single expression. When you do this, you execute the -code in the block on the next line: if you see C<=>, you'll do the code -in line 2. If you see C<+>, you'll do the code in line 4. It's this code -which contributes to the op tree. - - | term ADDOP term - { $$ = newBINOP($2, 0, scalar($1), scalar($3)); } - -What this does is creates a new binary op, and feeds it a number of -variables. The variables refer to the tokens: C<$1> is the first token in -the input, C<$2> the second, and so on - think regular expression -backreferences. C<$$> is the op returned from this reduction. 
So, we -call C<newBINOP> to create a new binary operator. The first parameter to -C<newBINOP>, a function in F<op.c>, is the op type. It's an addition -operator, so we want the type to be C<ADDOP>. We could specify this -directly, but it's right there as the second token in the input, so we -use C<$2>. The second parameter is the op's flags: 0 means "nothing -special". Then the things to add: the left and right hand side of our -expression, in scalar context. - -=head2 Stacks - -When perl executes something like C<addop>, how does it pass on its -results to the next op? The answer is, through the use of stacks. Perl -has a number of stacks to store things it's currently working on, and -we'll look at the three most important ones here. - -=over 3 - -=item Argument stack - -Arguments are passed to PP code and returned from PP code using the -argument stack, C<ST>. The typical way to handle arguments is to pop -them off the stack, deal with them how you wish, and then push the result -back onto the stack. This is how, for instance, the cosine operator -works: - - NV value; - value = POPn; - value = Perl_cos(value); - XPUSHn(value); - -We'll see a more tricky example of this when we consider Perl's macros -below. C<POPn> gives you the NV (floating point value) of the top SV on -the stack: the C<$x> in C<cos($x)>. Then we compute the cosine, and push -the result back as an NV. The C<X> in C<XPUSHn> means that the stack -should be extended if necessary - it can't be necessary here, because we -know there's room for one more item on the stack, since we've just -removed one! The C<XPUSH*> macros at least guarantee safety. - -Alternatively, you can fiddle with the stack directly: C<SP> gives you -the first element in your portion of the stack, and C<TOP*> gives you -the top SV/IV/NV/etc. on the stack. So, for instance, to do unary -negation of an integer: - - SETi(-TOPi); - -Just set the integer value of the top stack entry to its negation. 
- -Argument stack manipulation in the core is exactly the same as it is in -XSUBs - see L<perlxstut>, L<perlxs> and L<perlguts> for a longer -description of the macros used in stack manipulation. - -=item Mark stack - -I say "your portion of the stack" above because PP code doesn't -necessarily get the whole stack to itself: if your function calls -another function, you'll only want to expose the arguments aimed for the -called function, and not (necessarily) let it get at your own data. The -way we do this is to have a "virtual" bottom-of-stack, exposed to each -function. The mark stack keeps bookmarks to locations in the argument -stack usable by each function. For instance, when dealing with a tied -variable, (internally, something with "P" magic) Perl has to call -methods for accesses to the tied variables. However, we need to separate -the arguments exposed to the method to the argument exposed to the -original function - the store or fetch or whatever it may be. Here's -roughly how the tied C<push> is implemented; see C<av_push> in F<av.c>: - - 1 PUSHMARK(SP); - 2 EXTEND(SP,2); - 3 PUSHs(SvTIED_obj((SV*)av, mg)); - 4 PUSHs(val); - 5 PUTBACK; - 6 ENTER; - 7 call_method("PUSH", G_SCALAR|G_DISCARD); - 8 LEAVE; - -Let's examine the whole implementation, for practice: - - 1 PUSHMARK(SP); - -Push the current state of the stack pointer onto the mark stack. This is -so that when we've finished adding items to the argument stack, Perl -knows how many things we've added recently. - - 2 EXTEND(SP,2); - 3 PUSHs(SvTIED_obj((SV*)av, mg)); - 4 PUSHs(val); - -We're going to add two more items onto the argument stack: when you have -a tied array, the C<PUSH> subroutine receives the object and the value -to be pushed, and that's exactly what we have here - the tied object, -retrieved with C<SvTIED_obj>, and the value, the SV C<val>. 
- - 5 PUTBACK; - -Next we tell Perl to update the global stack pointer from our internal -variable: C<dSP> only gave us a local copy, not a reference to the global. - - 6 ENTER; - 7 call_method("PUSH", G_SCALAR|G_DISCARD); - 8 LEAVE; - -C<ENTER> and C<LEAVE> localise a block of code - they make sure that all -variables are tidied up, everything that has been localised gets -its previous value returned, and so on. Think of them as the C<{> and -C<}> of a Perl block. - -To actually do the magic method call, we have to call a subroutine in -Perl space: C<call_method> takes care of that, and it's described in -L<perlcall>. We call the C<PUSH> method in scalar context, and we're -going to discard its return value. The call_method() function -removes the top element of the mark stack, so there is nothing for -the caller to clean up. - -=item Save stack - -C doesn't have a concept of local scope, so perl provides one. We've -seen that C<ENTER> and C<LEAVE> are used as scoping braces; the save -stack implements the C equivalent of, for example: - - { - local $foo = 42; - ... - } - -See L<perlguts/Localising Changes> for how to use the save stack. - -=back +=over 4 -=head2 Millions of Macros +=item * -One thing you'll notice about the Perl source is that it's full of -macros. Some have called the pervasive use of macros the hardest thing -to understand, others find it adds to clarity. Let's take an example, -the code which implements the addition operator: +8-wide tabs (no exceptions!) - 1 PP(pp_add) - 2 { - 3 dSP; dATARGET; tryAMAGICbin(add,opASSIGN); - 4 { - 5 dPOPTOPnnrl_ul; - 6 SETn( left + right ); - 7 RETURN; - 8 } - 9 } +=item * -Every line here (apart from the braces, of course) contains a macro. The -first line sets up the function declaration as Perl expects for PP code; -line 3 sets up variable declarations for the argument stack and the -target, the return value of the operation. 
Finally, it tries to see if -the addition operation is overloaded; if so, the appropriate subroutine -is called. +4-wide indents for code, 2-wide indents for nested CPP #defines -Line 5 is another variable declaration - all variable declarations start -with C<d> - which pops from the top of the argument stack two NVs (hence -C<nn>) and puts them into the variables C<right> and C<left>, hence the -C<rl>. These are the two operands to the addition operator. Next, we -call C<SETn> to set the NV of the return value to the result of adding -the two values. This done, we return - the C<RETURN> macro makes sure -that our return value is properly handled, and we pass the next operator -to run back to the main run loop. +=item * -Most of these macros are explained in L<perlapi>, and some of the more -important ones are explained in L<perlxs> as well. Pay special attention -to L<perlguts/Background and PERL_IMPLICIT_CONTEXT> for information on -the C<[pad]THX_?> macros. +Try hard not to exceed 79-columns -=head2 The .i Targets +=item * -You can expand the macros in a F<foo.c> file by saying +ANSI C prototypes - make foo.i +=item * -which will expand the macros using cpp. Don't be scared by the results. +Uncuddled elses and "K&R" style for indenting control constructs -=head1 SOURCE CODE STATIC ANALYSIS +=item * -Various tools exist for analysing C source code B<statically>, as -opposed to B<dynamically>, that is, without executing the code. -It is possible to detect resource leaks, undefined behaviour, type -mismatches, portability problems, code paths that would cause illegal -memory accesses, and other similar problems by just parsing the C code -and looking at the resulting graph, what does it tell about the -execution and data flows. As a matter of fact, this is exactly -how C compilers know to give warnings about dubious code. 
+No C++ style (//) comments -=head2 lint, splint +=item * -The good old C code quality inspector, C<lint>, is available in -several platforms, but please be aware that there are several -different implementations of it by different vendors, which means that -the flags are not identical across different platforms. +Mark places that need to be revisited with XXX (and revisit often!) -There is a lint variant called C<splint> (Secure Programming Lint) -available from http://www.splint.org/ that should compile on any -Unix-like platform. +=item * -There are C<lint> and <splint> targets in Makefile, but you may have -to diddle with the flags (see above). +Opening brace lines up with "if" when conditional spans multiple lines; +should be at end-of-line otherwise -=head2 Coverity +=item * -Coverity (http://www.coverity.com/) is a product similar to lint and -as a testbed for their product they periodically check several open -source projects, and they give out accounts to open source developers -to the defect databases. +In function definitions, name starts in column 0 (return value is on +previous line) -=head2 cpd (cut-and-paste detector) +=item * -The cpd tool detects cut-and-paste coding. If one instance of the -cut-and-pasted code changes, all the other spots should probably be -changed, too. Therefore such code should probably be turned into a -subroutine or a macro. +Single space after keywords that are followed by parens, no space +between function name and following paren -cpd (http://pmd.sourceforge.net/cpd.html) is part of the pmd project -(http://pmd.sourceforge.net/). pmd was originally written for static -analysis of Java code, but later the cpd part of it was extended to -parse also C and C++. +=item * -Download the pmd-bin-X.Y.zip () from the SourceForge site, extract the -pmd-X.Y.jar from it, and then run that on source code thusly: +Avoid assignments in conditionals, but if they're unavoidable, use +extra paren, e.g. "if (a && (b = c)) ..." 
- java -cp pmd-X.Y.jar net.sourceforge.pmd.cpd.CPD --minimum-tokens 100 --files /some/where/src --language c > cpd.txt +=item * -You may run into memory limits, in which case you should use the -Xmx option: +"return foo;" rather than "return(foo);" - java -Xmx512M ... +=item * -=head2 gcc warnings +"if (!foo) ..." rather than "if (foo == FALSE) ..." etc. -Though much can be written about the inconsistency and coverage -problems of gcc warnings (like C<-Wall> not meaning "all the -warnings", or some common portability problems not being covered by -C<-Wall>, or C<-ansi> and C<-pedantic> both being a poorly defined -collection of warnings, and so forth), gcc is still a useful tool in -keeping our coding nose clean. +=back -The C<-Wall> is by default on. +=head3 Test suite -The C<-ansi> (and its sidekick, C<-pedantic>) would be nice to be on -always, but unfortunately they are not safe on all platforms, they can -for example cause fatal conflicts with the system headers (Solaris -being a prime example). If Configure C<-Dgccansipedantic> is used, -the C<cflags> frontend selects C<-ansi -pedantic> for the platforms -where they are known to be safe. +If your patch changes code (rather than just changing documentation), +you should also include one or more test cases which illustrate the bug +you're fixing or validate the new functionality you're adding. In +general, you should update an existing test file rather than create a +new one. -Starting from Perl 5.9.4 the following extra flags are added: +Your test suite additions should generally follow these guidelines +(courtesy of Gurusamy Sarathy <gsar@activestate.com>): =over 4 =item * -C<-Wendif-labels> +Know what you're testing. Read the docs, and the source. =item * -C<-Wextra> +Tend to fail, not succeed. =item * -C<-Wdeclaration-after-statement> - -=back - -The following flags would be nice to have but they would first need -their own Augean stablemaster: - -=over 4 +Interpret results strictly. 
=item * -C<-Wpointer-arith> +Use unrelated features (this will flush out bizarre interactions). =item * -C<-Wshadow> +Use non-standard idioms (otherwise you are not testing TIMTOWTDI). =item * -C<-Wstrict-prototypes> - -=back - -The C<-Wtraditional> is another example of the annoying tendency of -gcc to bundle a lot of warnings under one switch (it would be -impossible to deploy in practice because it would complain a lot) but -it does contain some warnings that would be beneficial to have available -on their own, such as the warning about string constants inside macros -containing the macro arguments: this behaved differently pre-ANSI -than it does in ANSI, and some C compilers are still in transition, -AIX being an example. - -=head2 Warnings of other C compilers +Avoid using hardcoded test numbers whenever possible (the EXPECTED/GOT +found in t/op/tie.t is much more maintainable, and gives better failure +reports). -Other C compilers (yes, there B<are> other C compilers than gcc) often -have their "strict ANSI" or "strict ANSI with some portability extensions" -modes on, like for example the Sun Workshop has its C<-Xa> mode on -(though implicitly), or the DEC (these days, HP...) has its C<-std1> -mode on. +=item * -=head2 DEBUGGING +Give meaningful error messages when a test fails. -You can compile a special debugging version of Perl, which allows you -to use the C<-D> option of Perl to tell more about what Perl is doing. -But sometimes there is no alternative than to dive in with a debugger, -either to see the stack trace of a core dump (very useful in a bug -report), or trying to figure out what went wrong before the core dump -happened, or how did we end up having wrong or unexpected results. +=item * -=head2 Poking at Perl +Avoid using qx// and system() unless you are testing for them. If you +do use them, make sure that you cover _all_ perl platforms. 
-To really poke around with Perl, you'll probably want to build Perl for -debugging, like this: +=item * - ./Configure -d -D optimize=-g - make +Unlink any temporary files you create. -C<-g> is a flag to the C compiler to have it produce debugging -information which will allow us to step through a running program, -and to see in which C function we are at (without the debugging -information we might see only the numerical addresses of the functions, -which is not very helpful). +=item * -F<Configure> will also turn on the C<DEBUGGING> compilation symbol which -enables all the internal debugging code in Perl. There are a whole bunch -of things you can debug with this: L<perlrun> lists them all, and the -best way to find out about them is to play about with them. The most -useful options are probably +Promote unforeseen warnings to errors with $SIG{__WARN__}. - l Context (loop) stack processing - t Trace execution - o Method and overloading resolution - c String/numeric conversions +=item * -Some of the functionality of the debugging code can be achieved using XS -modules. +Be sure to use the libraries and modules shipped with the version being +tested, not those that were already installed. - -Dr => use re 'debug' - -Dx => use O 'Debug' +=item * -=head2 Using a source-level debugger +Add comments to the code explaining what you are testing for. -If the debugging output of C<-D> doesn't help you, it's time to step -through perl's execution with a source-level debugger. +=item * -=over 3 +Make updating the '1..42' string unnecessary. Or make sure that you +update it. =item * -We'll use C<gdb> for our examples here; the principles will apply to -any debugger (many vendors call their debugger C<dbx>), but check the -manual of the one you're using. +Test _all_ behaviors of a given operator, library, or function. -=back +Test all optional arguments. -To fire up the debugger, type +Test return values in various contexts (boolean, scalar, list, lvalue). 
- gdb ./perl +Use both global and lexical variables. -Or if you have a core dump: +Don't forget the exceptional, pathological cases. - gdb ./perl core +=back -You'll want to do that in your Perl source tree so the debugger can read -the source code. You should see the copyright message, followed by the -prompt. +=head2 Patching a core module - (gdb) +This works just like patching anything else, with one extra +consideration. -C<help> will get you into the documentation, but here are the most -useful commands: +Modules in the F<cpan/> directory of the source tree are maintained +outside of the Perl core. When the author updates the module, the +updates are simply copied into the core. See that module's +documentation or its listing on L<http://search.cpan.org/> for more +information on reporting bugs and submitting patches. -=over 3 +In most cases, patches to modules in F<cpan/> should be sent upstream +and should not be applied to the Perl core individually. If a patch to +a file in F<cpan/> absolutely cannot wait for the fix to be made +upstream, released to CPAN and copied to blead, you must add (or +update) a C<CUSTOMIZED> entry in the F<"Porting/Maintainers.pl"> file +to flag that a local modification has been made. See +F<"Porting/Maintainers.pl"> for more details. -=item run [args] +In contrast, modules in the F<dist/> directory are maintained in the +core. -Run the program with the given arguments. +=head2 Updating perldelta -=item break function_name +For changes significant enough to warrant a F<pod/perldelta.pod> entry, +the porters will greatly appreciate it if you submit a delta entry +along with your actual change. Significant changes include, but are not +limited to: -=item break source.c:xxx +=over 4 -Tells the debugger that we'll want to pause execution when we reach -either the named function (but see L<perlguts/Internal Functions>!) or the given -line in the named source file. 
+=item * -=item step +Adding, deprecating, or removing core features -Steps through the program a line at a time. +=item * -=item next +Adding, deprecating, removing, or upgrading core or dual-life modules -Steps through the program a line at a time, without descending into -functions. +=item * -=item continue +Adding new core tests -Run until the next breakpoint. +=item * -=item finish +Fixing security issues and user-visible bugs in the core -Run until the end of the current function, then stop again. +=item * -=item 'enter' +Changes that might break existing code, either on the perl or C level -Just pressing Enter will do the most recent operation again - it's a -blessing when stepping through miles of source code. +=item * -=item print +Significant performance improvements -Execute the given C code and print its results. B<WARNING>: Perl makes -heavy use of macros, and F<gdb> does not necessarily support macros -(see later L</"gdb macro support">). You'll have to substitute them -yourself, or to invoke cpp on the source code files -(see L</"The .i Targets">) -So, for instance, you can't say +=item * - print SvPV_nolen(sv) +Adding, removing, or significantly changing documentation in the +F<pod/> directory -but you have to say +=item * - print Perl_sv_2pv_nolen(sv) +Important platform-specific changes =back -You may find it helpful to have a "macro dictionary", which you can -produce by saying C<cpp -dM perl.c | sort>. Even then, F<cpp> won't -recursively apply those macros for you. - -=head2 gdb macro support - -Recent versions of F<gdb> have fairly good macro support, but -in order to use it you'll need to compile perl with macro definitions -included in the debugging information. Using F<gcc> version 3.1, this -means configuring with C<-Doptimize=-g3>. Other compilers might use a -different switch (if they support debugging macros at all). 
- -=head2 Dumping Perl Data Structures - -One way to get around this macro hell is to use the dumping functions in -F<dump.c>; these work a little like an internal -L<Devel::Peek|Devel::Peek>, but they also cover OPs and other structures -that you can't get at from Perl. Let's take an example. We'll use the -C<$a = $b + $c> we used before, but give it a bit of context: -C<$b = "6XXXX"; $c = 2.3;>. Where's a good place to stop and poke around? - -What about C<pp_add>, the function we examined earlier to implement the -C<+> operator: - - (gdb) break Perl_pp_add - Breakpoint 1 at 0x46249f: file pp_hot.c, line 309. - -Notice we use C<Perl_pp_add> and not C<pp_add> - see L<perlguts/Internal Functions>. -With the breakpoint in place, we can run our program: - - (gdb) run -e '$b = "6XXXX"; $c = 2.3; $a = $b + $c' - -Lots of junk will go past as gdb reads in the relevant source files and -libraries, and then: - - Breakpoint 1, Perl_pp_add () at pp_hot.c:309 - 309 dSP; dATARGET; tryAMAGICbin(add,opASSIGN); - (gdb) step - 311 dPOPTOPnnrl_ul; - (gdb) - -We looked at this bit of code before, and we said that C<dPOPTOPnnrl_ul> -arranges for two C<NV>s to be placed into C<left> and C<right> - let's -slightly expand it: - - #define dPOPTOPnnrl_ul NV right = POPn; \ - SV *leftsv = TOPs; \ - NV left = USE_LEFT(leftsv) ? SvNV(leftsv) : 0.0 - -C<POPn> takes the SV from the top of the stack and obtains its NV either -directly (if C<SvNOK> is set) or by calling the C<sv_2nv> function. -C<TOPs> takes the next SV from the top of the stack - yes, C<POPn> uses -C<TOPs> - but doesn't remove it. We then use C<SvNV> to get the NV from -C<leftsv> in the same way as before - yes, C<POPn> uses C<SvNV>. - -Since we don't have an NV for C<$b>, we'll have to use C<sv_2nv> to -convert it. 
If we step again, we'll find ourselves there: - - Perl_sv_2nv (sv=0xa0675d0) at sv.c:1669 - 1669 if (!sv) - (gdb) - -We can now use C<Perl_sv_dump> to investigate the SV: - - SV = PV(0xa057cc0) at 0xa0675d0 - REFCNT = 1 - FLAGS = (POK,pPOK) - PV = 0xa06a510 "6XXXX"\0 - CUR = 5 - LEN = 6 - $1 = void - -We know we're going to get C<6> from this, so let's finish the -subroutine: - - (gdb) finish - Run till exit from #0 Perl_sv_2nv (sv=0xa0675d0) at sv.c:1671 - 0x462669 in Perl_pp_add () at pp_hot.c:311 - 311 dPOPTOPnnrl_ul; - -We can also dump out this op: the current op is always stored in -C<PL_op>, and we can dump it with C<Perl_op_dump>. This'll give us -similar output to L<B::Debug|B::Debug>. - - { - 13 TYPE = add ===> 14 - TARG = 1 - FLAGS = (SCALAR,KIDS) - { - TYPE = null ===> (12) - (was rv2sv) - FLAGS = (SCALAR,KIDS) - { - 11 TYPE = gvsv ===> 12 - FLAGS = (SCALAR) - GV = main::b - } - } - -# finish this later # - -=head2 Patching - -All right, we've now had a look at how to navigate the Perl sources and -some things you'll need to know when fiddling with them. Let's now get -on and create a simple patch. Here's something Larry suggested: if a -C<U> is the first active format during a C<pack>, (for example, -C<pack "U3C8", @stuff>) then the resulting string should be treated as -UTF-8 encoded. - -If you are working with a git clone of the Perl repository, you will want to -create a branch for your changes. This will make creating a proper patch much -simpler. See the L<perlrepository> for details on how to do this. - -How do we prepare to fix this up? First we locate the code in question - -the C<pack> happens at runtime, so it's going to be in one of the F<pp> -files. Sure enough, C<pp_pack> is in F<pp.c>. Since we're going to be -altering this file, let's copy it to F<pp.c~>. - -[Well, it was in F<pp.c> when this tutorial was written. 
It has now been -split off with C<pp_unpack> to its own file, F<pp_pack.c>] - -Now let's look over C<pp_pack>: we take a pattern into C<pat>, and then -loop over the pattern, taking each format character in turn into -C<datum_type>. Then for each possible format character, we swallow up -the other arguments in the pattern (a field width, an asterisk, and so -on) and convert the next chunk input into the specified format, adding -it onto the output SV C<cat>. - -How do we know if the C<U> is the first format in the C<pat>? Well, if -we have a pointer to the start of C<pat> then, if we see a C<U> we can -test whether we're still at the start of the string. So, here's where -C<pat> is set up: - - STRLEN fromlen; - register char *pat = SvPVx(*++MARK, fromlen); - register char *patend = pat + fromlen; - register I32 len; - I32 datumtype; - SV *fromstr; - -We'll have another string pointer in there: - - STRLEN fromlen; - register char *pat = SvPVx(*++MARK, fromlen); - register char *patend = pat + fromlen; - + char *patcopy; - register I32 len; - I32 datumtype; - SV *fromstr; - -And just before we start the loop, we'll set C<patcopy> to be the start -of C<pat>: - - items = SP - MARK; - MARK++; - sv_setpvn(cat, "", 0); - + patcopy = pat; - while (pat < patend) { - -Now if we see a C<U> which was at the start of the string, we turn on -the C<UTF8> flag for the output SV, C<cat>: - - + if (datumtype == 'U' && pat==patcopy+1) - + SvUTF8_on(cat); - if (datumtype == '#') { - while (pat < patend && *pat != '\n') - pat++; - -Remember that it has to be C<patcopy+1> because the first character of -the string is the C<U> which has been swallowed into C<datumtype!> - -Oops, we forgot one thing: what if there are spaces at the start of the -pattern? C<pack(" U*", @stuff)> will have C<U> as the first active -character, even though it's not the first thing in the pattern. 
In this -case, we have to advance C<patcopy> along with C<pat> when we see spaces: - - if (isSPACE(datumtype)) - continue; - -needs to become - - if (isSPACE(datumtype)) { - patcopy++; - continue; - } - -OK. That's the C part done. Now we must do two additional things before -this patch is ready to go: we've changed the behaviour of Perl, and so -we must document that change. We must also provide some more regression -tests to make sure our patch works and doesn't create a bug somewhere -else along the line. - -The regression tests for each operator live in F<t/op/>, and so we -make a copy of F<t/op/pack.t> to F<t/op/pack.t~>. Now we can add our -tests to the end. First, we'll test that the C<U> does indeed create -Unicode strings. - -t/op/pack.t has a sensible ok() function, but if it didn't we could -use the one from t/test.pl. - - require './test.pl'; - plan( tests => 159 ); - -so instead of this: +Please make sure you add the perldelta entry to the right section +within F<pod/perldelta.pod>. More information on how to write good +perldelta entries is available in the C<Style> section of +F<Porting/how_to_write_a_perldelta.pod>. - print 'not ' unless "1.20.300.4000" eq sprintf "%vd", pack("U*",1,20,300,4000); - print "ok $test\n"; $test++; +=head2 What makes for a good patch? -we can write the more sensible (see L<Test::More> for a full -explanation of is() and other testing functions). +New features and extensions to the language can be contentious. There +is no specific set of criteria which determine what features get added, +but here are some questions to consider when developing a patch: - is( "1.20.300.4000", sprintf "%vd", pack("U*",1,20,300,4000), - "U* produces Unicode" ); +=head3 Does the concept match the general goals of Perl? 
-Now we'll test that we got that space-at-the-beginning business right: +Our goals include, but are not limited to: - is( "1.20.300.4000", sprintf "%vd", pack(" U*",1,20,300,4000), - " with spaces at the beginning" ); - -And finally we'll test that we don't make Unicode strings if C<U> is B<not> -the first active format: - - isnt( v1.20.300.4000, sprintf "%vd", pack("C0U*",1,20,300,4000), - "U* not first isn't Unicode" ); - -Mustn't forget to change the number of tests which appears at the top, -or else the automated tester will get confused. This will either look -like this: - - print "1..156\n"; - -or this: +=over 4 - plan( tests => 156 ); +=item 1. -We now compile up Perl, and run it through the test suite. Our new -tests pass, hooray! +Keep it fast, simple, and useful. -Finally, the documentation. The job is never done until the paperwork is -over, so let's describe the change we've just made. The relevant place -is F<pod/perlfunc.pod>; again, we make a copy, and then we'll insert -this text in the description of C<pack>: +=item 2. - =item * +Keep features/concepts as orthogonal as possible. - If the pattern begins with a C<U>, the resulting string will be treated - as UTF-8-encoded Unicode. You can force UTF-8 encoding on in a string - with an initial C<U0>, and the bytes that follow will be interpreted as - Unicode characters. If you don't want this to happen, you can begin your - pattern with C<C0> (or anything else) to force Perl not to UTF-8 encode your - string, and then follow this with a C<U*> somewhere in your pattern. +=item 3. -=head2 Patching a core module +No arbitrary limits (platforms, data sizes, cultures). -This works just like patching anything else, with an extra -consideration. Many core modules also live on CPAN. If this is so, -patch the CPAN version instead of the core and send the patch off to -the module maintainer (with a copy to p5p). 
This will help the module -maintainer keep the CPAN version in sync with the core version without -constantly scanning p5p. - -The list of maintainers of core modules is usefully documented in -F<Porting/Maintainers.pl>. - -=head2 Adding a new function to the core - -If, as part of a patch to fix a bug, or just because you have an -especially good idea, you decide to add a new function to the core, -discuss your ideas on p5p well before you start work. It may be that -someone else has already attempted to do what you are considering and -can give lots of good advice or even provide you with bits of code -that they already started (but never finished). - -You have to follow all of the advice given above for patching. It is -extremely important to test any addition thoroughly and add new tests -to explore all boundary conditions that your new function is expected -to handle. If your new function is used only by one module (e.g. toke), -then it should probably be named S_your_function (for static); on the -other hand, if you expect it to accessible from other functions in -Perl, you should name it Perl_your_function. See L<perlguts/Internal Functions> -for more details. +=item 4. -The location of any new code is also an important consideration. Don't -just create a new top level .c file and put your code there; you would -have to make changes to Configure (so the Makefile is created properly), -as well as possibly lots of include files. This is strictly pumpking -business. +Keep it open and exciting to use/patch/advocate Perl everywhere. -It is better to add your function to one of the existing top level -source code files, but your choice is complicated by the nature of -the Perl distribution. Only the files that are marked as compiled -static are located in the perl executable. Everything else is located -in the shared library (or DLL if you are running under WIN32). 
So, -for example, if a function was only used by functions located in -toke.c, then your code can go in toke.c. If, however, you want to call -the function from universal.c, then you should put your code in another -location, for example util.c. +=item 5. -In addition to writing your c-code, you will need to create an -appropriate entry in embed.pl describing your function, then run -'make regen_headers' to create the entries in the numerous header -files that perl needs to compile correctly. See L<perlguts/Internal Functions> -for information on the various options that you can set in embed.pl. -You will forget to do this a few (or many) times and you will get -warnings during the compilation phase. Make sure that you mention -this when you post your patch to P5P; the pumpking needs to know this. +Either assimilate new technologies, or build bridges to them. -When you write your new code, please be conscious of existing code -conventions used in the perl source files. See L<perlstyle> for -details. Although most of the guidelines discussed seem to focus on -Perl code, rather than c, they all apply (except when they don't ;). -Also see I<perlrepository> for lots of details about both formatting and -submitting patches of your changes. +=back -Lastly, TEST TEST TEST TEST TEST any code before posting to p5p. -Test on as many platforms as you can find. Test as many perl -Configure options as you can (e.g. MULTIPLICITY). If you have -profiling or memory tools, see L<EXTERNAL TOOLS FOR DEBUGGING PERL> -below for how to use them to further test your code. Remember that -most of the people on P5P are doing this on their own time and -don't have the time to debug your code. +=head3 Where is the implementation? -=head2 Writing a test +All the talk in the world is useless without an implementation. In +almost every case, the person or people who argue for a new feature +will be expected to be the ones who implement it. 
Porters capable of +coding new features have their own agendas, and are not available to +implement your (possibly good) idea. -Every module and built-in function has an associated test file (or -should...). If you add or change functionality, you have to write a -test. If you fix a bug, you have to write a test so that bug never -comes back. If you alter the docs, it would be nice to test what the -new documentation says. +=head3 Backwards compatibility -In short, if you submit a patch you probably also have to patch the -tests. +It's a cardinal sin to break existing Perl programs. New warnings can +be contentious--some say that a program that emits warnings is not +broken, while others say it is. Adding keywords has the potential to +break programs, changing the meaning of existing token sequences or +functions might break programs. -For modules, the test file is right next to the module itself. -F<lib/strict.t> tests F<lib/strict.pm>. This is a recent innovation, -so there are some snags (and it would be wonderful for you to brush -them out), but it basically works that way. Everything else lives in -F<t/>. +The Perl 5 core includes mechanisms to help porters make backwards +incompatible changes more compatible such as the L<feature> and +L<deprecate> modules. Please use them when appropriate. -If you add a new test directory under F<t/>, it is imperative that you -add that directory to F<t/HARNESS> and F<t/TEST>. +=head3 Could it be a module instead? -=over 3 +Perl 5 has extension mechanisms, modules and XS, specifically to avoid +the need to keep changing the Perl interpreter. You can write modules +that export functions, you can give those functions prototypes so they +can be called like built-in functions, you can even write XS code to +mess with the runtime data structures of the Perl interpreter if you +want to implement really complicated things. 
-=item F<t/base/> +Whenever possible, new features should be prototyped in a CPAN module +before they will be considered for the core. -Testing of the absolute basic functionality of Perl. Things like -C<if>, basic file reads and writes, simple regexes, etc. These are -run first in the test suite and if any of them fail, something is -I<really> broken. +=head3 Is the feature generic enough? -=item F<t/cmd/> +Is this something that only the submitter wants added to the language, +or is it broadly useful? Sometimes, instead of adding a feature with a +tight focus, the porters might decide to wait until someone implements +the more generalized feature. -These test the basic control structures, C<if/else>, C<while>, -subroutines, etc. +=head3 Does it potentially introduce new bugs? -=item F<t/comp/> +Radical rewrites of large chunks of the Perl interpreter have the +potential to introduce new bugs. -Tests basic issues of how Perl parses and compiles itself. +=head3 How big is it? -=item F<t/io/> +The smaller and more localized the change, the better. Similarly, a +series of small patches is greatly preferred over a single large patch. -Tests for built-in IO functions, including command line arguments. +=head3 Does it preclude other desirable features? -=item F<t/lib/> +A patch is likely to be rejected if it closes off future avenues of +development. For instance, a patch that placed a true and final +interpretation on prototypes is likely to be rejected because there are +still options for the future of prototypes that haven't been addressed. -The old home for the module tests, you shouldn't put anything new in -here. There are still some bits and pieces hanging around in here -that need to be moved. Perhaps you could move them? Thanks! +=head3 Is the implementation robust? -=item F<t/mro/> +Good patches (tight code, complete, correct) stand more chance of going +in. 
Sloppy or incorrect patches might be placed on the back burner +until the pumpking has time to fix, or might be discarded altogether +without further notice. -Tests for perl's method resolution order implementations -(see L<mro>). +=head3 Is the implementation generic enough to be portable? -=item F<t/op/> +The worst patches make use of system-specific features. It's highly +unlikely that non-portable additions to the Perl language will be +accepted. -Tests for perl's built in functions that don't fit into any of the -other directories. +=head3 Is the implementation tested? -=item F<t/re/> +Patches which change behaviour (fixing bugs or introducing new +features) must include regression tests to verify that everything works +as expected. -Tests for regex related functions or behaviour. (These used to live -in t/op). +Without tests provided by the original author, how can anyone else +changing perl in the future be sure that they haven't unwittingly +broken the behaviour the patch implements? And without tests, how can +the patch's author be confident that his/her hard work put into the +patch won't be accidentally thrown away by someone in the future? -=item F<t/run/> +=head3 Is there enough documentation? -Testing features of how perl actually runs, including exit codes and -handling of PERL* environment variables. +Patches without documentation are probably ill-thought out or +incomplete. No features can be added or changed without documentation, +so submitting a patch for the appropriate pod docs as well as the +source code is important. -=item F<t/uni/> +=head3 Is there another way to do it? -Tests for the core support of Unicode. +Larry said "Although the Perl Slogan is I<There's More Than One Way to +Do It>, I hesitate to make 10 ways to do something". This is a tricky +heuristic to navigate, though--one man's essential addition is another +man's pointless cruft. -=item F<t/win32/> +=head3 Does it create too much work? -Windows-specific tests. 
+Work for the pumpking, work for Perl programmers, work for module +authors, ... Perl is supposed to be easy. -=item F<t/x2p> +=head3 Patches speak louder than words -A test suite for the s2p converter. +Working code is always preferred to pie-in-the-sky ideas. A patch to +add a feature stands a much higher chance of making it to the language +than does a random feature request, no matter how fervently argued the +request might be. This ties into "Will it be useful?", as the fact that +someone took the time to make the patch demonstrates a strong desire +for the feature. -=back +=head1 TESTING The core uses the same testing style as the rest of Perl, a simple "ok/not ok" run through Test::Harness, but there are a few special considerations. -There are three ways to write a test in the core. Test::More, -t/test.pl and ad hoc C<print $test ? "ok 42\n" : "not ok 42\n">. The +There are three ways to write a test in the core. L<Test::More>, +F<t/test.pl> and ad hoc C<print $test ? "ok 42\n" : "not ok 42\n">. The decision of which to use depends on what part of the test suite you're -working on. This is a measure to prevent a high-level failure (such -as Config.pm breaking) from causing basic functionality tests to fail. -If you write your own test, use the L<Test Anything Protocol|TAP>. +working on. This is a measure to prevent a high-level failure (such as +Config.pm breaking) from causing basic functionality tests to fail. + +The F<t/test.pl> library provides some of the features of +L<Test::More>, but avoids loading most modules and uses as few core +features as possible. + +If you write your own test, use the L<Test Anything +Protocol|http://testanything.org>. =over 4 -=item t/base t/comp +=item * F<t/base> and F<t/comp> Since we don't know if require works, or even subroutines, use ad hoc -tests for these two. Step carefully to avoid using the feature being +tests for these two. Step carefully to avoid using the feature being tested. 
-=item t/cmd t/run t/io t/op +=item * F<t/cmd>, F<t/run>, F<t/io> and F<t/op> Now that basic require() and subroutines are tested, you can use the -t/test.pl library which emulates the important features of Test::More -while using a minimum of core features. +F<t/test.pl> library. -You can also conditionally use certain libraries like Config, but be +You can also use certain libraries like Config conditionally, but be sure to skip the test gracefully if it's not there. -=item t/lib ext lib +=item * Everything else -Now that the core of Perl is tested, Test::More can be used. You can -also use the full suite of core modules in the tests. +Now that the core of Perl is tested, L<Test::More> can and should be +used. You can also use the full suite of core modules in the tests. =back -When you say "make test" Perl uses the F<t/TEST> program to run the -test suite (except under Win32 where it uses F<t/harness> instead.) -All tests are run from the F<t/> directory, B<not> the directory -which contains the test. This causes some problems with the tests -in F<lib/>, so here's some opportunity for some patching. +When you say "make test", Perl uses the F<t/TEST> program to run the +test suite (except under Win32 where it uses F<t/harness> instead). All +tests are run from the F<t/> directory, B<not> the directory which +contains the test. This causes some problems with the tests in F<lib/>, +so here's some opportunity for some patching. -You must be triply conscious of cross-platform concerns. This usually -boils down to using File::Spec and avoiding things like C<fork()> and -C<system()> unless absolutely necessary. +You must be triply conscious of cross-platform concerns. This usually +boils down to using L<File::Spec> and avoiding things like C<fork()> +and C<system()> unless absolutely necessary. 
-=head2 Special Make Test Targets +=head2 Special C<make test> targets There are various special make targets that can be used to test Perl -slightly differently than the standard "test" target. Not all of them -are expected to give a 100% success rate. Many of them have several +slightly differently than the standard "test" target. Not all of them are +expected to give a 100% success rate. Many of them have several aliases, and many of them are not available on certain operating systems. =over 4 -=item coretest +=item * test_porting + +This runs some basic sanity tests on the source tree and helps catch +basic errors before you submit a patch. + +=item * coretest Run F<perl> on all core tests (F<t/*> and F<lib/[a-z]*> pragma tests). (Not available on Win32) -=item test.deparse +=item * test.deparse -Run all the tests through B::Deparse. Not all tests will succeed. +Run all the tests through L<B::Deparse>. Not all tests will succeed. (Not available on Win32) -=item test.taintwarn +=item * test.taintwarn -Run all tests with the B<-t> command-line switch. Not all tests -are expected to succeed (until they're specifically fixed, of course). +Run all tests with the B<-t> command-line switch. Not all tests are +expected to succeed (until they're specifically fixed, of course). (Not available on Win32) -=item minitest +=item * minitest Run F<miniperl> on F<t/base>, F<t/comp>, F<t/cmd>, F<t/run>, F<t/io>, F<t/op>, F<t/uni> and F<t/mro> tests. -=item test.valgrind check.valgrind utest.valgrind ucheck.valgrind +=item * test.valgrind check.valgrind utest.valgrind ucheck.valgrind (Only in Linux) Run all the tests using the memory leak + naughty -memory access tool "valgrind". The log files will be named +memory access tool "valgrind". The log files will be named F<testname.valgrind>. -=item test.third check.third utest.third ucheck.third - -(Only in Tru64) Run all the tests using the memory leak + naughty -memory access tool "Third Degree". 
The log files will be named -F<perl.3log.testname>. +=item * test.torture torturetest -=item test.torture torturetest - -Run all the usual tests and some extra tests. As of Perl 5.8.0 the +Run all the usual tests and some extra tests. As of Perl 5.8.0, the only extra tests are Abigail's JAPHs, F<t/japh/abigail.t>. You can also run the torture test with F<t/harness> by giving C<-torture> argument to F<t/harness>. -=item utest ucheck test.utf8 check.utf8 +=item * utest ucheck test.utf8 check.utf8 -Run all the tests with -Mutf8. Not all tests will succeed. +Run all the tests with -Mutf8. Not all tests will succeed. (Not available on Win32) -=item minitest.utf16 test.utf16 +=item * minitest.utf16 test.utf16 Runs the tests with UTF-16 encoded scripts, encoded with different versions of this encoding. -C<make utest.utf16> runs the test suite with a combination of C<-utf8> and -C<-utf16> arguments to F<t/TEST>. +C<make utest.utf16> runs the test suite with a combination of C<-utf8> +and C<-utf16> arguments to F<t/TEST>. (Not available on Win32) -=item test_harness +=item * test_harness -Run the test suite with the F<t/harness> controlling program, instead of -F<t/TEST>. F<t/harness> is more sophisticated, and uses the +Run the test suite with the F<t/harness> controlling program, instead +of F<t/TEST>. F<t/harness> is more sophisticated, and uses the L<Test::Harness> module, thus using this test target supposes that perl mostly works. The main advantage for our purposes is that it prints a detailed summary of failed tests at the end. Also, unlike F<t/TEST>, it doesn't redirect stderr to stdout. -Note that under Win32 F<t/harness> is always used instead of F<t/TEST>, so -there is no special "test_harness" target. +Note that under Win32 F<t/harness> is always used instead of F<t/TEST>, +so there is no special "test_harness" target. -Under Win32's "test" target you may use the TEST_SWITCHES and TEST_FILES -environment variables to control the behaviour of F<t/harness>. 
This means -you can say +Under Win32's "test" target you may use the TEST_SWITCHES and +TEST_FILES environment variables to control the behaviour of +F<t/harness>. This means you can say nmake test TEST_FILES="op/*.t" nmake test TEST_SWITCHES="-torture" TEST_FILES="op/*.t" -=item Parallel tests +=item * test-notty test_notty + +Sets PERL_SKIP_TTY_TEST to true before running normal test. + +=back + +=head2 Parallel tests The core distribution can now run its regression tests in parallel on -Unix-like platforms. Instead of running C<make test>, set C<TEST_JOBS> in -your environment to the number of tests to run in parallel, and run +Unix-like platforms. Instead of running C<make test>, set C<TEST_JOBS> +in your environment to the number of tests to run in parallel, and run C<make test_harness>. On a Bourne-like shell, this can be done as TEST_JOBS=3 make test_harness # Run 3 tests in parallel -An environment variable is used, rather than parallel make itself, because -L<TAP::Harness> needs to be able to schedule individual non-conflicting test -scripts itself, and there is no standard interface to C<make> utilities to -interact with their job schedulers. +An environment variable is used, rather than parallel make itself, +because L<TAP::Harness> needs to be able to schedule individual +non-conflicting test scripts itself, and there is no standard interface +to C<make> utilities to interact with their job schedulers. -Note that currently some test scripts may fail when run in parallel (most -notably C<ext/IO/t/io_dir.t>). If necessary run just the failing scripts -again sequentially and see if the failures go away. -=item test-notty test_notty - -Sets PERL_SKIP_TTY_TEST to true before running normal test. - -=back +Note that currently some test scripts may fail when run in parallel +(most notably F<ext/IO/t/io_dir.t>). If necessary, run just the failing +scripts again sequentially and see if the failures go away. 
=head2 Running tests by hand -You can run part of the test suite by hand by using one the following -commands from the F<t/> directory : +You can run part of the test suite by hand by using one of the +following commands from the F<t/> directory: ./perl -I../lib TEST list-of-.t-files @@ -1990,71 +856,73 @@ or ./perl -I../lib harness list-of-.t-files -(if you don't specify test scripts, the whole test suite will be run.) +(If you don't specify test scripts, the whole test suite will be run.) -=head3 Using t/harness for testing +=head2 Using F<t/harness> for testing -If you use C<harness> for testing you have several command line options -available to you. The arguments are as follows, and are in the order -that they must appear if used together. +If you use C<harness> for testing, you have several command line +options available to you. The arguments are as follows, and are in the +order that they must appear if used together. harness -v -torture -re=pattern LIST OF FILES TO TEST harness -v -torture -re LIST OF PATTERNS TO MATCH -If C<LIST OF FILES TO TEST> is omitted the file list is obtained from +If C<LIST OF FILES TO TEST> is omitted, the file list is obtained from the manifest. The file list may include shell wildcards which will be expanded out. =over 4 -=item -v +=item * -v Run the tests under verbose mode so you can see what tests were run, and debug output. -=item -torture +=item * -torture Run the torture tests as well as the normal set. -=item -re=PATTERN +=item * -re=PATTERN -Filter the file list so that all the test files run match PATTERN. -Note that this form is distinct from the B<-re LIST OF PATTERNS> form below +Filter the file list so that all the test files run match PATTERN. Note +that this form is distinct from the B<-re LIST OF PATTERNS> form below in that it allows the file list to be provided as well. -=item -re LIST OF PATTERNS +=item * -re LIST OF PATTERNS Filter the file list so that all the test files run match -/(LIST|OF|PATTERNS)/. 
Note that with this form the patterns -are joined by '|' and you cannot supply a list of files, instead -the test files are obtained from the MANIFEST. +/(LIST|OF|PATTERNS)/. Note that with this form the patterns are joined +by '|' and you cannot supply a list of files, instead the test files +are obtained from the MANIFEST. =back You can run an individual test by a command similar to - ./perl -I../lib patho/to/foo.t + ./perl -I../lib path/to/foo.t except that the harnesses set up some environment variables that may -affect the execution of the test : +affect the execution of the test: =over 4 -=item PERL_CORE=1 +=item * PERL_CORE=1 -indicates that we're running this test part of the perl core test suite. -This is useful for modules that have a dual life on CPAN. +indicates that we're running this test as part of the perl core test +suite. This is useful for modules that have a dual life on CPAN. -=item PERL_DESTRUCT_LEVEL=2 +=item * PERL_DESTRUCT_LEVEL=2 -is set to 2 if it isn't set already (see L</PERL_DESTRUCT_LEVEL>) +is set to 2 if it isn't set already (see +L<perlhacktips/PERL_DESTRUCT_LEVEL>). -=item PERL +=item * PERL -(used only by F<t/TEST>) if set, overrides the path to the perl executable -that should be used to run the tests (the default being F<./perl>). +(used only by F<t/TEST>) if set, overrides the path to the perl +executable that should be used to run the tests (the default being +F<./perl>). -=item PERL_SKIP_TTY_TEST +=item * PERL_SKIP_TTY_TEST if set, tells to skip the tests that need a terminal. It's actually set automatically by the Makefile, but can also be forced artificially by @@ -2066,1331 +934,211 @@ running 'make test_notty'. =over 4 -=item PERL_TEST_Net_Ping +=item * PERL_TEST_Net_Ping -Setting this variable runs all the Net::Ping modules tests, -otherwise some tests that interact with the outside world are skipped. -See L<perl58delta>. 
+Setting this variable runs all the Net::Ping modules tests, otherwise +some tests that interact with the outside world are skipped. See +L<perl58delta>. -=item PERL_TEST_NOVREXX +=item * PERL_TEST_NOVREXX Setting this variable skips the vrexx.t tests for OS2::REXX. -=item PERL_TEST_NUMCONVERTS +=item * PERL_TEST_NUMCONVERTS This sets a variable in op/numconvert.t. =back -See also the documentation for the Test and Test::Harness modules, -for more environment variables that affect testing. - -=head2 Common problems when patching Perl source code - -Perl source plays by ANSI C89 rules: no C99 (or C++) extensions. In -some cases we have to take pre-ANSI requirements into consideration. -You don't care about some particular platform having broken Perl? -I hear there is still a strong demand for J2EE programmers. - -=head2 Perl environment problems - -=over 4 - -=item * - -Not compiling with threading - -Compiling with threading (-Duseithreads) completely rewrites -the function prototypes of Perl. You better try your changes -with that. Related to this is the difference between "Perl_-less" -and "Perl_-ly" APIs, for example: - - Perl_sv_setiv(aTHX_ ...); - sv_setiv(...); - -The first one explicitly passes in the context, which is needed for e.g. -threaded builds. The second one does that implicitly; do not get them -mixed. If you are not passing in a aTHX_, you will need to do a dTHX -(or a dVAR) as the first thing in the function. - -See L<perlguts/"How multiple interpreters and concurrency are supported"> -for further discussion about context. - -=item * - -Not compiling with -DDEBUGGING - -The DEBUGGING define exposes more code to the compiler, -therefore more ways for things to go wrong. You should try it. - -=item * - -Introducing (non-read-only) globals - -Do not introduce any modifiable globals, truly global or file static. -They are bad form and complicate multithreading and other forms of -concurrency. 
The right way is to introduce them as new interpreter -variables, see F<intrpvar.h> (at the very end for binary compatibility). - -Introducing read-only (const) globals is okay, as long as you verify -with e.g. C<nm libperl.a|egrep -v ' [TURtr] '> (if your C<nm> has -BSD-style output) that the data you added really is read-only. -(If it is, it shouldn't show up in the output of that command.) - -If you want to have static strings, make them constant: - - static const char etc[] = "..."; - -If you want to have arrays of constant strings, note carefully -the right combination of C<const>s: - - static const char * const yippee[] = - {"hi", "ho", "silver"}; - -There is a way to completely hide any modifiable globals (they are all -moved to heap), the compilation setting C<-DPERL_GLOBAL_STRUCT_PRIVATE>. -It is not normally used, but can be used for testing, read more -about it in L<perlguts/"Background and PERL_IMPLICIT_CONTEXT">. - -=item * - -Not exporting your new function - -Some platforms (Win32, AIX, VMS, OS/2, to name a few) require any -function that is part of the public API (the shared Perl library) -to be explicitly marked as exported. See the discussion about -F<embed.pl> in L<perlguts>. - -=item * - -Exporting your new function - -The new shiny result of either genuine new functionality or your -arduous refactoring is now ready and correctly exported. So what -could possibly go wrong? - -Maybe simply that your function did not need to be exported in the -first place. Perl has a long and not so glorious history of exporting -functions that it should not have. - -If the function is used only inside one source code file, make it -static. See the discussion about F<embed.pl> in L<perlguts>. - -If the function is used across several files, but intended only for -Perl's internal use (and this should be the common case), do not -export it to the public API. See the discussion about F<embed.pl> -in L<perlguts>. 
- -=back - -=head2 Portability problems - -The following are common causes of compilation and/or execution -failures, not common to Perl as such. The C FAQ is good bedtime -reading. Please test your changes with as many C compilers and -platforms as possible; we will, anyway, and it's nice to save -oneself from public embarrassment. - -If using gcc, you can add the C<-std=c89> option which will hopefully -catch most of these unportabilities. (However it might also catch -incompatibilities in your system's header files.) - -Use the Configure C<-Dgccansipedantic> flag to enable the gcc -C<-ansi -pedantic> flags which enforce stricter ANSI rules. - -If using the C<gcc -Wall> note that not all the possible warnings -(like C<-Wunitialized>) are given unless you also compile with C<-O>. - -Note that if using gcc, starting from Perl 5.9.5 the Perl core source -code files (the ones at the top level of the source code distribution, -but not e.g. the extensions under ext/) are automatically compiled -with as many as possible of the C<-std=c89>, C<-ansi>, C<-pedantic>, -and a selection of C<-W> flags (see cflags.SH). - -Also study L<perlport> carefully to avoid any bad assumptions -about the operating system, filesystems, and so forth. - -You may once in a while try a "make microperl" to see whether we -can still compile Perl with just the bare minimum of interfaces. -(See README.micro.) - -Do not assume an operating system indicates a certain compiler. - -=over 4 - -=item * - -Casting pointers to integers or casting integers to pointers - - void castaway(U8* p) - { - IV i = p; - -or - - void castaway(U8* p) - { - IV i = (IV)p; - -Both are bad, and broken, and unportable. Use the PTR2IV() -macro that does it right. (Likewise, there are PTR2UV(), PTR2NV(), -INT2PTR(), and NUM2PTR().) 
- -=item * - -Casting between data function pointers and data pointers - -Technically speaking casting between function pointers and data -pointers is unportable and undefined, but practically speaking -it seems to work, but you should use the FPTR2DPTR() and DPTR2FPTR() -macros. Sometimes you can also play games with unions. - -=item * - -Assuming sizeof(int) == sizeof(long) - -There are platforms where longs are 64 bits, and platforms where ints -are 64 bits, and while we are out to shock you, even platforms where -shorts are 64 bits. This is all legal according to the C standard. -(In other words, "long long" is not a portable way to specify 64 bits, -and "long long" is not even guaranteed to be any wider than "long".) - -Instead, use the definitions IV, UV, IVSIZE, I32SIZE, and so forth. -Avoid things like I32 because they are B<not> guaranteed to be -I<exactly> 32 bits, they are I<at least> 32 bits, nor are they -guaranteed to be B<int> or B<long>. If you really explicitly need -64-bit variables, use I64 and U64, but only if guarded by HAS_QUAD. - -=item * - -Assuming one can dereference any type of pointer for any type of data - - char *p = ...; - long pony = *p; /* BAD */ - -Many platforms, quite rightly so, will give you a core dump instead -of a pony if the p happens not be correctly aligned. - -=item * - -Lvalue casts - - (int)*p = ...; /* BAD */ - -Simply not portable. Get your lvalue to be of the right type, -or maybe use temporary variables, or dirty tricks with unions. 
- -=item * - -Assume B<anything> about structs (especially the ones you -don't control, like the ones coming from the system headers) - -=over 8 - -=item * - -That a certain field exists in a struct - -=item * - -That no other fields exist besides the ones you know of - -=item * - -That a field is of certain signedness, sizeof, or type - -=item * - -That the fields are in a certain order - -=over 8 - -=item * - -While C guarantees the ordering specified in the struct definition, -between different platforms the definitions might differ - -=back - -=item * - -That the sizeof(struct) or the alignments are the same everywhere - -=over 8 - -=item * - -There might be padding bytes between the fields to align the fields - -the bytes can be anything - -=item * - -Structs are required to be aligned to the maximum alignment required -by the fields - which for native types is for usually equivalent to -sizeof() of the field - -=back - -=back - -=item * - -Assuming the character set is ASCIIish - -Perl can compile and run under EBCDIC platforms. See L<perlebcdic>. -This is transparent for the most part, but because the character sets -differ, you shouldn't use numeric (decimal, octal, nor hex) constants -to refer to characters. You can safely say 'A', but not 0x41. -You can safely say '\n', but not \012. -If a character doesn't have a trivial input form, you can -create a #define for it in both C<utfebcdic.h> and C<utf8.h>, so that -it resolves to different values depending on the character set being used. -(There are three different EBCDIC character sets defined in C<utfebcdic.h>, -so it might be best to insert the #define three times in that file.) - -Also, the range 'A' - 'Z' in ASCII is an unbroken sequence of 26 upper case -alphabetic characters. That is not true in EBCDIC. Nor for 'a' to 'z'. -But '0' - '9' is an unbroken range in both systems. Don't assume anything -about other ranges. 
- -Many of the comments in the existing code ignore the possibility of EBCDIC, -and may be wrong therefore, even if the code works. -This is actually a tribute to the successful transparent insertion of being -able to handle EBCDIC without having to change pre-existing code. - -UTF-8 and UTF-EBCDIC are two different encodings used to represent Unicode -code points as sequences of bytes. Macros -with the same names (but different definitions) -in C<utf8.h> and C<utfebcdic.h> -are used to allow the calling code to think that there is only one such -encoding. -This is almost always referred to as C<utf8>, but it means the EBCDIC version -as well. Again, comments in the code may well be wrong even if the code itself -is right. -For example, the concept of C<invariant characters> differs between ASCII and -EBCDIC. -On ASCII platforms, only characters that do not have the high-order -bit set (i.e. whose ordinals are strict ASCII, 0 - 127) -are invariant, and the documentation and comments in the code -may assume that, -often referring to something like, say, C<hibit>. -The situation differs and is not so simple on EBCDIC machines, but as long as -the code itself uses the C<NATIVE_IS_INVARIANT()> macro appropriately, it -works, even if the comments are wrong. - -=item * - -Assuming the character set is just ASCII - -ASCII is a 7 bit encoding, but bytes have 8 bits in them. The 128 extra -characters have different meanings depending on the locale. Absent a locale, -currently these extra characters are generally considered to be unassigned, -and this has presented some problems. -This is being changed starting in 5.12 so that these characters will -be considered to be Latin-1 (ISO-8859-1). - -=item * - -Mixing #define and #ifdef +See also the documentation for the Test and Test::Harness modules, for +more environment variables that affect testing. - #define BURGLE(x) ... \ - #ifdef BURGLE_OLD_STYLE /* BAD */ - ... do it the old way ... \ - #else - ... do it the new way ... 
\ - #endif +=head1 MORE READING FOR GUTS HACKERS -You cannot portably "stack" cpp directives. For example in the above -you need two separate BURGLE() #defines, one for each #ifdef branch. - -=item * - -Adding non-comment stuff after #endif or #else - - #ifdef SNOSH - ... - #else !SNOSH /* BAD */ - ... - #endif SNOSH /* BAD */ - -The #endif and #else cannot portably have anything non-comment after -them. If you want to document what is going (which is a good idea -especially if the branches are long), use (C) comments: - - #ifdef SNOSH - ... - #else /* !SNOSH */ - ... - #endif /* SNOSH */ - -The gcc option C<-Wendif-labels> warns about the bad variant -(by default on starting from Perl 5.9.4). - -=item * - -Having a comma after the last element of an enum list - - enum color { - CERULEAN, - CHARTREUSE, - CINNABAR, /* BAD */ - }; - -is not portable. Leave out the last comma. - -Also note that whether enums are implicitly morphable to ints -varies between compilers, you might need to (int). - -=item * - -Using //-comments - - // This function bamfoodles the zorklator. /* BAD */ - -That is C99 or C++. Perl is C89. Using the //-comments is silently -allowed by many C compilers but cranking up the ANSI C89 strictness -(which we like to do) causes the compilation to fail. - -=item * - -Mixing declarations and code - - void zorklator() - { - int n = 3; - set_zorkmids(n); /* BAD */ - int q = 4; - -That is C99 or C++. Some C compilers allow that, but you shouldn't. - -The gcc option C<-Wdeclaration-after-statements> scans for such problems -(by default on starting from Perl 5.9.4). - -=item * - -Introducing variables inside for() - - for(int i = ...; ...; ...) { /* BAD */ - -That is C99 or C++. While it would indeed be awfully nice to have that -also in C89, to limit the scope of the loop variable, alas, we cannot. - -=item * - -Mixing signed char pointers with unsigned char pointers - - int foo(char *s) { ... } - ... - unsigned char *t = ...; /* Or U8* t = ... 
*/ - foo(t); /* BAD */ - -While this is legal practice, it is certainly dubious, and downright -fatal in at least one platform: for example VMS cc considers this a -fatal error. One cause for people often making this mistake is that a -"naked char" and therefore dereferencing a "naked char pointer" have -an undefined signedness: it depends on the compiler and the flags of -the compiler and the underlying platform whether the result is signed -or unsigned. For this very same reason using a 'char' as an array -index is bad. - -=item * - -Macros that have string constants and their arguments as substrings of -the string constants - - #define FOO(n) printf("number = %d\n", n) /* BAD */ - FOO(10); - -Pre-ANSI semantics for that was equivalent to - - printf("10umber = %d\10"); - -which is probably not what you were expecting. Unfortunately at least -one reasonably common and modern C compiler does "real backward -compatibility" here, in AIX that is what still happens even though the -rest of the AIX compiler is very happily C89. - -=item * - -Using printf formats for non-basic C types - - IV i = ...; - printf("i = %d\n", i); /* BAD */ - -While this might by accident work in some platform (where IV happens -to be an C<int>), in general it cannot. IV might be something larger. -Even worse the situation is with more specific types (defined by Perl's -configuration step in F<config.h>): - - Uid_t who = ...; - printf("who = %d\n", who); /* BAD */ - -The problem here is that Uid_t might be not only not C<int>-wide -but it might also be unsigned, in which case large uids would be -printed as negative values. - -There is no simple solution to this because of printf()'s limited -intelligence, but for many types the right format is available as -with either 'f' or '_f' suffix, for example: - - IVdf /* IV in decimal */ - UVxf /* UV is hexadecimal */ - - printf("i = %"IVdf"\n", i); /* The IVdf is a string constant. 
*/ - - Uid_t_f /* Uid_t in decimal */ - - printf("who = %"Uid_t_f"\n", who); - -Or you can try casting to a "wide enough" type: - - printf("i = %"IVdf"\n", (IV)something_very_small_and_signed); - -Also remember that the C<%p> format really does require a void pointer: - - U8* p = ...; - printf("p = %p\n", (void*)p); - -The gcc option C<-Wformat> scans for such problems. - -=item * - -Blindly using variadic macros - -gcc has had them for a while with its own syntax, and C99 brought -them with a standardized syntax. Don't use the former, and use -the latter only if the HAS_C99_VARIADIC_MACROS is defined. - -=item * - -Blindly passing va_list - -Not all platforms support passing va_list to further varargs (stdarg) -functions. The right thing to do is to copy the va_list using the -Perl_va_copy() if the NEED_VA_COPY is defined. - -=item * - -Using gcc statement expressions - - val = ({...;...;...}); /* BAD */ - -While a nice extension, it's not portable. The Perl code does -admittedly use them if available to gain some extra speed -(essentially as a funky form of inlining), but you shouldn't. - -=item * - -Binding together several statements in a macro - -Use the macros STMT_START and STMT_END. - - STMT_START { - ... - } STMT_END - -=item * - -Testing for operating systems or versions when should be testing for features - - #ifdef __FOONIX__ /* BAD */ - foo = quux(); - #endif - -Unless you know with 100% certainty that quux() is only ever available -for the "Foonix" operating system B<and> that is available B<and> -correctly working for B<all> past, present, B<and> future versions of -"Foonix", the above is very wrong. This is more correct (though still -not perfect, because the below is a compile-time check): - - #ifdef HAS_QUUX - foo = quux(); - #endif - -How does the HAS_QUUX become defined where it needs to be? 
Well, if -Foonix happens to be Unixy enough to be able to run the Configure -script, and Configure has been taught about detecting and testing -quux(), the HAS_QUUX will be correctly defined. In other platforms, -the corresponding configuration step will hopefully do the same. - -In a pinch, if you cannot wait for Configure to be educated, -or if you have a good hunch of where quux() might be available, -you can temporarily try the following: - - #if (defined(__FOONIX__) || defined(__BARNIX__)) - # define HAS_QUUX - #endif - - ... - - #ifdef HAS_QUUX - foo = quux(); - #endif - -But in any case, try to keep the features and operating systems separate. - -=back - -=head2 Problematic System Interfaces - -=over 4 - -=item * - -malloc(0), realloc(0), calloc(0, 0) are non-portable. To be portable -allocate at least one byte. (In general you should rarely need to -work at this low level, but instead use the various malloc wrappers.) - -=item * - -snprintf() - the return type is unportable. Use my_snprintf() instead. - -=back - -=head2 Security problems - -Last but not least, here are various tips for safer coding. - -=over 4 - -=item * - -Do not use gets() - -Or we will publicly ridicule you. Seriously. - -=item * - -Do not use strcpy() or strcat() or strncpy() or strncat() - -Use my_strlcpy() and my_strlcat() instead: they either use the native -implementation, or Perl's own implementation (borrowed from the public -domain implementation of INN). - -=item * - -Do not use sprintf() or vsprintf() - -If you really want just plain byte strings, use my_snprintf() -and my_vsnprintf() instead, which will try to use snprintf() and -vsnprintf() if those safer APIs are available. If you want something -fancier than a plain byte string, use SVs and Perl_sv_catpvf(). - -=back - -=head1 EXTERNAL TOOLS FOR DEBUGGING PERL - -Sometimes it helps to use external tools while debugging and -testing Perl. 
This section tries to guide you through using -some common testing and debugging tools with Perl. This is -meant as a guide to interfacing these tools with Perl, not -as any kind of guide to the use of the tools themselves. - -B<NOTE 1>: Running under memory debuggers such as Purify, valgrind, or -Third Degree greatly slows down the execution: seconds become minutes, -minutes become hours. For example as of Perl 5.8.1, the -ext/Encode/t/Unicode.t takes extraordinarily long to complete under -e.g. Purify, Third Degree, and valgrind. Under valgrind it takes more -than six hours, even on a snappy computer. The said test must be -doing something that is quite unfriendly for memory debuggers. If you -don't feel like waiting, that you can simply kill away the perl -process. - -B<NOTE 2>: To minimize the number of memory leak false alarms (see -L</PERL_DESTRUCT_LEVEL> for more information), you have to set the -environment variable PERL_DESTRUCT_LEVEL to 2. - -For csh-like shells: - - setenv PERL_DESTRUCT_LEVEL 2 - -For Bourne-type shells: - - PERL_DESTRUCT_LEVEL=2 - export PERL_DESTRUCT_LEVEL - -In Unixy environments you can also use the C<env> command: - - env PERL_DESTRUCT_LEVEL=2 valgrind ./perl -Ilib ... - -B<NOTE 3>: There are known memory leaks when there are compile-time -errors within eval or require, seeing C<S_doeval> in the call stack -is a good sign of these. Fixing these leaks is non-trivial, -unfortunately, but they must be fixed eventually. - -B<NOTE 4>: L<DynaLoader> will not clean up after itself completely -unless Perl is built with the Configure option -C<-Accflags=-DDL_UNLOAD_ALL_AT_EXIT>. - -=head2 Rational Software's Purify - -Purify is a commercial tool that is helpful in identifying -memory overruns, wild pointers, memory leaks and other such -badness. Perl must be compiled in a specific way for -optimal testing with Purify. Purify is available under -Windows NT, Solaris, HP-UX, SGI, and Siemens Unix. 
- -=head2 Purify on Unix - -On Unix, Purify creates a new Perl binary. To get the most -benefit out of Purify, you should create the perl to Purify -using: - - sh Configure -Accflags=-DPURIFY -Doptimize='-g' \ - -Uusemymalloc -Dusemultiplicity - -where these arguments mean: - -=over 4 - -=item -Accflags=-DPURIFY - -Disables Perl's arena memory allocation functions, as well as -forcing use of memory allocation functions derived from the -system malloc. - -=item -Doptimize='-g' - -Adds debugging information so that you see the exact source -statements where the problem occurs. Without this flag, all -you will see is the source filename of where the error occurred. - -=item -Uusemymalloc - -Disable Perl's malloc so that Purify can more closely monitor -allocations and leaks. Using Perl's malloc will make Purify -report most leaks in the "potential" leaks category. - -=item -Dusemultiplicity - -Enabling the multiplicity option allows perl to clean up -thoroughly when the interpreter shuts down, which reduces the -number of bogus leak reports from Purify. - -=back - -Once you've compiled a perl suitable for Purify'ing, then you -can just: - - make pureperl - -which creates a binary named 'pureperl' that has been Purify'ed. -This binary is used in place of the standard 'perl' binary -when you want to debug Perl memory problems. - -As an example, to show any memory leaks produced during the -standard Perl testset you would create and run the Purify'ed -perl as: - - make pureperl - cd t - ../pureperl -I../lib harness - -which would run Perl on test.pl and report any memory problems. - -Purify outputs messages in "Viewer" windows by default. 
If -you don't have a windowing environment or if you simply -want the Purify output to unobtrusively go to a log file -instead of to the interactive window, use these following -options to output to the log file "perl.log": - - setenv PURIFYOPTIONS "-chain-length=25 -windows=no \ - -log-file=perl.log -append-logfile=yes" - -If you plan to use the "Viewer" windows, then you only need this option: - - setenv PURIFYOPTIONS "-chain-length=25" - -In Bourne-type shells: - - PURIFYOPTIONS="..." - export PURIFYOPTIONS - -or if you have the "env" utility: - - env PURIFYOPTIONS="..." ../pureperl ... - -=head2 Purify on NT - -Purify on Windows NT instruments the Perl binary 'perl.exe' -on the fly. There are several options in the makefile you -should change to get the most use out of Purify: - -=over 4 - -=item DEFINES - -You should add -DPURIFY to the DEFINES line so the DEFINES -line looks something like: - - DEFINES = -DWIN32 -D_CONSOLE -DNO_STRICT $(CRYPT_FLAG) -DPURIFY=1 - -to disable Perl's arena memory allocation functions, as -well as to force use of memory allocation functions derived -from the system malloc. - -=item USE_MULTI = define - -Enabling the multiplicity option allows perl to clean up -thoroughly when the interpreter shuts down, which reduces the -number of bogus leak reports from Purify. - -=item #PERL_MALLOC = define - -Disable Perl's malloc so that Purify can more closely monitor -allocations and leaks. Using Perl's malloc will make Purify -report most leaks in the "potential" leaks category. - -=item CFG = Debug - -Adds debugging information so that you see the exact source -statements where the problem occurs. Without this flag, all -you will see is the source filename of where the error occurred. 
- -=back - -As an example, to show any memory leaks produced during the -standard Perl testset you would create and run Purify as: - - cd win32 - make - cd ../t - purify ../perl -I../lib harness - -which would instrument Perl in memory, run Perl on test.pl, -then finally report any memory problems. - -=head2 valgrind - -The excellent valgrind tool can be used to find out both memory leaks -and illegal memory accesses. As of version 3.3.0, Valgrind only -supports Linux on x86, x86-64 and PowerPC. The special "test.valgrind" -target can be used to run the tests under valgrind. Found errors -and memory leaks are logged in files named F<testfile.valgrind>. - -Valgrind also provides a cachegrind tool, invoked on perl as: - - VG_OPTS=--tool=cachegrind make test.valgrind - -As system libraries (most notably glibc) are also triggering errors, -valgrind allows to suppress such errors using suppression files. The -default suppression file that comes with valgrind already catches a lot -of them. Some additional suppressions are defined in F<t/perl.supp>. - -To get valgrind and for more information see - - http://developer.kde.org/~sewardj/ - -=head2 Compaq's/Digital's/HP's Third Degree - -Third Degree is a tool for memory leak detection and memory access checks. -It is one of the many tools in the ATOM toolkit. The toolkit is only -available on Tru64 (formerly known as Digital UNIX formerly known as -DEC OSF/1). - -When building Perl, you must first run Configure with -Doptimize=-g -and -Uusemymalloc flags, after that you can use the make targets -"perl.third" and "test.third". (What is required is that Perl must be -compiled using the C<-g> flag, you may need to re-Configure.) - -The short story is that with "atom" you can instrument the Perl -executable to create a new executable called F<perl.third>. When the -instrumented executable is run, it creates a log of dubious memory -traffic in file called F<perl.3log>. 
See the manual pages of atom and -third for more information. The most extensive Third Degree -documentation is available in the Compaq "Tru64 UNIX Programmer's -Guide", chapter "Debugging Programs with Third Degree". - -The "test.third" leaves a lot of files named F<foo_bar.3log> in the t/ -subdirectory. There is a problem with these files: Third Degree is so -effective that it finds problems also in the system libraries. -Therefore you should use the Porting/thirdclean script to clean up -the F<*.3log> files. - -There are also leaks that for given certain definition of a leak, -aren't. See L</PERL_DESTRUCT_LEVEL> for more information. - -=head2 PERL_DESTRUCT_LEVEL - -If you want to run any of the tests yourself manually using e.g. -valgrind, or the pureperl or perl.third executables, please note that -by default perl B<does not> explicitly cleanup all the memory it has -allocated (such as global memory arenas) but instead lets the exit() -of the whole program "take care" of such allocations, also known as -"global destruction of objects". - -There is a way to tell perl to do complete cleanup: set the -environment variable PERL_DESTRUCT_LEVEL to a non-zero value. -The t/TEST wrapper does set this to 2, and this is what you -need to do too, if you don't want to see the "global leaks": -For example, for "third-degreed" Perl: - - env PERL_DESTRUCT_LEVEL=2 ./perl.third -Ilib t/foo/bar.t - -(Note: the mod_perl apache module uses also this environment variable -for its own purposes and extended its semantics. Refer to the mod_perl -documentation for more information. Also, spawned threads do the -equivalent of setting this variable to the value 1.) - -If, at the end of a run you get the message I<N scalars leaked>, you can -recompile with C<-DDEBUG_LEAKING_SCALARS>, which will cause the addresses -of all those leaked SVs to be dumped along with details as to where each -SV was originally allocated. This information is also displayed by -Devel::Peek.
Note that the extra details recorded with each SV increases -memory usage, so it shouldn't be used in production environments. It also -converts C<new_SV()> from a macro into a real function, so you can use -your favourite debugger to discover where those pesky SVs were allocated. - -If you see that you're leaking memory at runtime, but neither valgrind -nor C<-DDEBUG_LEAKING_SCALARS> will find anything, you're probably -leaking SVs that are still reachable and will be properly cleaned up -during destruction of the interpreter. In such cases, using the C<-Dm> -switch can point you to the source of the leak. If the executable was -built with C<-DDEBUG_LEAKING_SCALARS>, C<-Dm> will output SV allocations -in addition to memory allocations. Each SV allocation has a distinct -serial number that will be written on creation and destruction of the SV. -So if you're executing the leaking code in a loop, you need to look for -SVs that are created, but never destroyed between each cycle. If such an -SV is found, set a conditional breakpoint within C<new_SV()> and make it -break only when C<PL_sv_serial> is equal to the serial number of the -leaking SV. Then you will catch the interpreter in exactly the state -where the leaking SV is allocated, which is sufficient in many cases to -find the source of the leak. - -As C<-Dm> is using the PerlIO layer for output, it will by itself -allocate quite a bunch of SVs, which are hidden to avoid recursion. -You can bypass the PerlIO layer if you use the SV logging provided -by C<-DPERL_MEM_LOG> instead. - -=head2 PERL_MEM_LOG - -If compiled with C<-DPERL_MEM_LOG>, both memory and SV allocations go -through logging functions, which is handy for breakpoint setting. 
- -Unless C<-DPERL_MEM_LOG_NOIMPL> is also compiled, the logging -functions read $ENV{PERL_MEM_LOG} to determine whether to log the -event, and if so how: - - $ENV{PERL_MEM_LOG} =~ /m/ Log all memory ops - $ENV{PERL_MEM_LOG} =~ /s/ Log all SV ops - $ENV{PERL_MEM_LOG} =~ /t/ include timestamp in Log - $ENV{PERL_MEM_LOG} =~ /^(\d+)/ write to FD given (default is 2) - -Memory logging is somewhat similar to C<-Dm> but is independent of -C<-DDEBUGGING>, and at a higher level; all uses of Newx(), Renew(), -and Safefree() are logged with the caller's source code file and line -number (and C function name, if supported by the C compiler). In -contrast, C<-Dm> is directly at the point of C<malloc()>. SV logging -is similar. - -Since the logging doesn't use PerlIO, all SV allocations are logged -and no extra SV allocations are introduced by enabling the logging. -If compiled with C<-DDEBUG_LEAKING_SCALARS>, the serial number for -each SV allocation is also logged. - -=head2 Profiling - -Depending on your platform there are various of profiling Perl. - -There are two commonly used techniques of profiling executables: -I<statistical time-sampling> and I<basic-block counting>. - -The first method takes periodically samples of the CPU program -counter, and since the program counter can be correlated with the code -generated for functions, we get a statistical view of in which -functions the program is spending its time. The caveats are that very -small/fast functions have lower probability of showing up in the -profile, and that periodically interrupting the program (this is -usually done rather frequently, in the scale of milliseconds) imposes -an additional overhead that may skew the results. The first problem -can be alleviated by running the code for longer (in general this is a -good idea for profiling), the second problem is usually kept in guard -by the profiling tools themselves. - -The second method divides up the generated code into I<basic blocks>. 
-Basic blocks are sections of code that are entered only in the -beginning and exited only at the end. For example, a conditional jump -starts a basic block. Basic block profiling usually works by -I<instrumenting> the code by adding I<enter basic block #nnnn> -book-keeping code to the generated code. During the execution of the -code the basic block counters are then updated appropriately. The -caveat is that the added extra code can skew the results: again, the -profiling tools usually try to factor their own effects out of the -results. - -=head2 Gprof Profiling - -gprof is a profiling tool available in many Unix platforms, -it uses F<statistical time-sampling>. - -You can build a profiled version of perl called "perl.gprof" by -invoking the make target "perl.gprof" (What is required is that Perl -must be compiled using the C<-pg> flag, you may need to re-Configure). -Running the profiled version of Perl will create an output file called -F<gmon.out> which contains the profiling data collected -during the execution. - -The gprof tool can then display the collected data in various ways. -Usually gprof understands the following options: +To hack on the Perl guts, you'll need to read the following things: =over 4 -=item -a - -Suppress statically defined functions from the profile. - -=item -b - -Suppress the verbose descriptions in the profile. - -=item -e routine - -Exclude the given routine and its descendants from the profile. - -=item -f routine - -Display only the given routine and its descendants in the profile. - -=item -s - -Generate a summary file called F<gmon.sum> which then may be given -to subsequent gprof runs to accumulate data over several runs. +=item * L<perlsource> -=item -z +An overview of the Perl source tree. This will help you find the files +you're looking for. -Display routines that have zero usage.
+=item * L<perlinterp> -=back - -For more detailed explanation of the available commands and output -formats, see your own local documentation of gprof. - -quick hint: - - $ sh Configure -des -Dusedevel -Doptimize='-pg' && make perl.gprof - $ ./perl.gprof someprog # creates gmon.out in current directory - $ gprof ./perl.gprof > out - $ view out - -=head2 GCC gcov Profiling - -Starting from GCC 3.0 I<basic block profiling> is officially available -for the GNU CC. - -You can build a profiled version of perl called F<perl.gcov> by -invoking the make target "perl.gcov" (what is required that Perl must -be compiled using gcc with the flags C<-fprofile-arcs --ftest-coverage>, you may need to re-Configure). - -Running the profiled version of Perl will cause profile output to be -generated. For each source file an accompanying ".da" file will be -created. - -To display the results you use the "gcov" utility (which should -be installed if you have gcc 3.0 or newer installed). F<gcov> is -run on source code files, like this +An overview of the Perl interpreter source code and some details on how +Perl does what it does. - gcov sv.c +=item * L<perlhacktut> -which will cause F<sv.c.gcov> to be created. The F<.gcov> files -contain the source code annotated with relative frequencies of -execution indicated by "#" markers. +This document walks through the creation of a small patch to Perl's C +code. If you're just getting started with Perl core hacking, this will +help you understand how it works. -Useful options of F<gcov> include C<-b> which will summarise the -basic block, branch, and function call coverage, and C<-c> which -instead of relative frequencies will use the actual counts. For -more information on the use of F<gcov> and basic block profiling -with gcc, see the latest GNU CC manual, as of GCC 3.0 see +=item * L<perlhacktips> - http://gcc.gnu.org/onlinedocs/gcc-3.0/gcc.html +More details on hacking the Perl core. 
This document focuses on lower +level details such as how to write tests, compilation issues, +portability, debugging, etc. -and its section titled "8. gcov: a Test Coverage Program" +If you plan on doing serious C hacking, make sure to read this. - http://gcc.gnu.org/onlinedocs/gcc-3.0/gcc_8.html#SEC132 +=item * L<perlguts> -quick hint: - - $ sh Configure -des -Doptimize='-g' -Accflags='-fprofile-arcs -ftest-coverage' \ - -Aldflags='-fprofile-arcs -ftest-coverage' && make perl.gcov - $ rm -f regexec.c.gcov regexec.gcda - $ ./perl.gcov - $ gcov regexec.c - $ view regexec.c.gcov - -=head2 Pixie Profiling +This is of paramount importance, since it's the documentation of what +goes where in the Perl source. Read it over a couple of times and it +might start to make sense - don't worry if it doesn't yet, because the +best way to study it is to read it in conjunction with poking at Perl +source, and we'll do that later on. -Pixie is a profiling tool available on IRIX and Tru64 (aka Digital -UNIX aka DEC OSF/1) platforms. Pixie does its profiling using -I<basic-block counting>. +Gisle Aas's "illustrated perlguts", also known as I<illguts>, has very +helpful pictures: -You can build a profiled version of perl called F<perl.pixie> by -invoking the make target "perl.pixie" (what is required is that Perl -must be compiled using the C<-g> flag, you may need to re-Configure). +L<http://search.cpan.org/dist/illguts/> -In Tru64 a file called F<perl.Addrs> will also be silently created, -this file contains the addresses of the basic blocks. Running the -profiled version of Perl will create a new file called "perl.Counts" -which contains the counts for the basic block for that particular -program execution. +=item * L<perlxstut> and L<perlxs> -To display the results you use the F<prof> utility. The exact -incantation depends on your operating system, "prof perl.Counts" in -IRIX, and "prof -pixie -all -L. perl" in Tru64. 
+A working knowledge of XSUB programming is incredibly useful for core +hacking; XSUBs use techniques drawn from the PP code, the portion of +the guts that actually executes a Perl program. It's a lot gentler to +learn those techniques from simple examples and explanation than from +the core itself. -In IRIX the following prof options are available: +=item * L<perlapi> -=over 4 +The documentation for the Perl API explains what some of the internal +functions do, as well as the many macros used in the source. -=item -h +=item * F<Porting/pumpkin.pod> -Reports the most heavily used lines in descending order of use. -Useful for finding the hotspot lines. +This is a collection of words of wisdom for a Perl porter; some of it +is only useful to the pumpkin holder, but most of it applies to anyone +wanting to go about Perl development. -=item -l +=item * The perl5-porters FAQ -Groups lines by procedure, with procedures sorted in descending order of use. -Within a procedure, lines are listed in source order. -Useful for finding the hotspots of procedures. +This should be available from +http://dev.perl.org/perl5/docs/p5p-faq.html . It contains hints on +reading perl5-porters, information on how perl5-porters works and how +Perl development in general works. =back -In Tru64 the following options are available: - -=over 4 - -=item -p[rocedures] - -Procedures sorted in descending order by the number of cycles executed -in each procedure. Useful for finding the hotspot procedures. -(This is the default option.) - -=item -h[eavy] - -Lines sorted in descending order by the number of cycles executed in -each line. Useful for finding the hotspot lines. - -=item -i[nvocations] - -The called procedures are sorted in descending order by number of calls -made to the procedures. Useful for finding the most used procedures. - -=item -l[ines] - -Grouped by procedure, sorted by cycles executed per procedure. -Useful for finding the hotspots of procedures. 
+=head1 CPAN TESTERS AND PERL SMOKERS -=item -testcoverage +The CPAN testers ( http://testers.cpan.org/ ) are a group of volunteers +who test CPAN modules on a variety of platforms. -The compiler emitted code for these lines, but the code was unexecuted. - -=item -z[ero] +Perl Smokers ( http://www.nntp.perl.org/group/perl.daily-build/ and +http://www.nntp.perl.org/group/perl.daily-build.reports/ ) +automatically test Perl source releases on platforms with various +configurations. + +Both efforts welcome volunteers. In order to get involved in smoke +testing of the perl itself visit +L<http://search.cpan.org/dist/Test-Smoke/>. In order to start smoke +testing CPAN modules visit +L<http://search.cpan.org/dist/CPANPLUS-YACSmoke/> or +L<http://search.cpan.org/dist/minismokebox/> or +L<http://search.cpan.org/dist/CPAN-Reporter/>. -Unexecuted procedures. +=head1 WHAT NEXT? -=back +If you've read all the documentation in the document and the ones +listed above, you're more than ready to hack on Perl. -For further information, see your system's manual pages for pixie and prof. - -=head2 Miscellaneous tricks +Here's some more recommendations =over 4 =item * -Those debugging perl with the DDD frontend over gdb may find the -following useful: - -You can extend the data conversion shortcuts menu, so for example you -can display an SV's IV value with one click, without doing any typing. -To do that simply edit ~/.ddd/init file and add after: - - ! Display shortcuts. - Ddd*gdbDisplayShortcuts: \ - /t () // Convert to Bin\n\ - /d () // Convert to Dec\n\ - /x () // Convert to Hex\n\ - /o () // Convert to Oct(\n\ - -the following two lines: - - ((XPV*) (())->sv_any )->xpv_pv // 2pvx\n\ - ((XPVIV*) (())->sv_any )->xiv_iv // 2ivx - -so now you can do ivx and pvx lookups or you can plug there the -sv_peek "conversion": - - Perl_sv_peek(my_perl, (SV*)()) // sv_peek - -(The my_perl is for threaded builds.) 
-Just remember that every line, but the last one, should end with \n\ - -Alternatively edit the init file interactively via: -3rd mouse button -> New Display -> Edit Menu - -Note: you can define up to 20 conversion shortcuts in the gdb -section. - -=item * - -If you see in a debugger a memory area mysteriously full of 0xABABABAB -or 0xEFEFEFEF, you may be seeing the effect of the Poison() macros, -see L<perlclib>. - -=item * - -Under ithreads the optree is read only. If you want to enforce this, to check -for write accesses from buggy code, compile with C<-DPL_OP_SLAB_ALLOC> to -enable the OP slab allocator and C<-DPERL_DEBUG_READONLY_OPS> to enable code -that allocates op memory via C<mmap>, and sets it read-only at run time. -Any write access to an op results in a C<SIGBUS> and abort. - -This code is intended for development only, and may not be portable even to -all Unix variants. Also, it is an 80% solution, in that it isn't able to make -all ops read only. Specifically it - -=over - -=item 1 - -Only sets read-only on all slabs of ops at C<CHECK> time, hence ops allocated -later via C<require> or C<eval> will be re-write - -=item 2 - -Turns an entire slab of ops read-write if the refcount of any op in the slab -needs to be decreased. - -=item 3 - -Turns an entire slab of ops read-write if any op from the slab is freed. - -=back - -It's not possible to turn the slabs to read-only after an action requiring -read-write access, as either can happen during op tree building time, so -there may still be legitimate write access. - -However, as an 80% solution it is still effective, as currently it catches -a write access during the generation of F<Config.pm>, which means that we -can't yet build F<perl> with this enabled. 
- -=back - - -=head1 CONCLUSION - -We've had a brief look around the Perl source, how to maintain quality -of the source code, an overview of the stages F<perl> goes through -when it's running your code, how to use debuggers to poke at the Perl -guts, and finally how to analyse the execution of Perl. We took a very -simple problem and demonstrated how to solve it fully - with -documentation, regression tests, and finally a patch for submission to -p5p. Finally, we talked about how to use external tools to debug and -test Perl. - -I'd now suggest you read over those references again, and then, as soon -as possible, get your hands dirty. The best way to learn is by doing, -so: - -=over 3 - -=item * - Subscribe to perl5-porters, follow the patches and try and understand them; don't be afraid to ask if there's a portion you're not clear on - who knows, you may unearth a bug in the patch... =item * -Keep up to date with the bleeding edge Perl distributions and get -familiar with the changes. Try and get an idea of what areas people are -working on and the changes they're making. - -=item * - -Do read the README associated with your operating system, e.g. README.aix -on the IBM AIX OS. Don't hesitate to supply patches to that README if -you find anything missing or changed over a new OS release. +Do read the README associated with your operating system, e.g. +README.aix on the IBM AIX OS. Don't hesitate to supply patches to that +README if you find anything missing or changed over a new OS release. =item * Find an area of Perl that seems interesting to you, and see if you can work out how it works. Scan through the source, and step over it in the debugger. Play, poke, investigate, fiddle! You'll probably get to -understand not just your chosen area but a much wider range of F<perl>'s -activity as well, and probably sooner than you'd think. +understand not just your chosen area but a much wider range of +F<perl>'s activity as well, and probably sooner than you'd think. 
=back -=over 3 - -=item I<The Road goes ever on and on, down from the door where it began.> +=head2 "The Road goes ever on and on, down from the door where it began." -=back - -If you can do these things, you've started on the long road to Perl porting. -Thanks for wanting to help make Perl better - and happy hacking! +If you can do these things, you've started on the long road to Perl +porting. Thanks for wanting to help make Perl better - and happy +hacking! =head2 Metaphoric Quotations If you recognized the quote about the Road above, you're in luck. -Most software projects begin each file with a literal description of each -file's purpose. Perl instead begins each with a literary allusion to that -file's purpose. +Most software projects begin each file with a literal description of +each file's purpose. Perl instead begins each with a literary allusion +to that file's purpose. -Like chapters in many books, all top-level Perl source files (along with a -few others here and there) begin with an epigramic inscription that alludes, -indirectly and metaphorically, to the material you're about to read. +Like chapters in many books, all top-level Perl source files (along +with a few others here and there) begin with an epigrammatic +inscription that alludes, indirectly and metaphorically, to the +material you're about to read. -Quotations are taken from writings of J.R.R Tolkien pertaining to his -Legendarium, almost always from I<The Lord of the Rings>. Chapters and +Quotations are taken from writings of J.R.R. Tolkien pertaining to his +Legendarium, almost always from I<The Lord of the Rings>. Chapters and page numbers are given using the following editions: =over 4 -=item * +=item * -I<The Hobbit>, by J.R.R. Tolkien. The hardcover, 70th-anniversary -edition of 2007 was used, published in the UK by Harper Collins Publishers -and in the US by the Houghton Mifflin Company. +I<The Hobbit>, by J.R.R. Tolkien. 
The hardcover, 70th-anniversary +edition of 2007 was used, published in the UK by Harper Collins +Publishers and in the US by the Houghton Mifflin Company. =item * -I<The Lord of the Rings>, by J.R.R. Tolkien. The hardcover, -50th-anniversary edition of 2004 was used, published in the UK by Harper -Collins Publishers and in the US by the Houghton Mifflin Company. +I<The Lord of the Rings>, by J.R.R. Tolkien. The hardcover, +50th-anniversary edition of 2004 was used, published in the UK by +Harper Collins Publishers and in the US by the Houghton Mifflin +Company. =item * -I<The Lays of Beleriand>, by J.R.R. Tolkien and published posthumously by his -son and literary executor, C.J.R. Tolkien, being the 3rd of the 12 volumes -in Christopher's mammoth I<History of Middle Earth>. Page numbers derive -from the hardcover edition, first published in 1983 by George Allen & -Unwin; no page numbers changed for the special 3-volume omnibus edition of -2002 or the various trade-paper editions, all again now by Harper Collins -or Houghton Mifflin. +I<The Lays of Beleriand>, by J.R.R. Tolkien and published posthumously +by his son and literary executor, C.J.R. Tolkien, being the 3rd of the +12 volumes in Christopher's mammoth I<History of Middle Earth>. Page +numbers derive from the hardcover edition, first published in 1983 by +George Allen & Unwin; no page numbers changed for the special 3-volume +omnibus edition of 2002 or the various trade-paper editions, all again +now by Harper Collins or Houghton Mifflin. =back -Other JRRT books fair game for quotes would thus include I<The Adventures of -Tom Bombadil>, I<The Silmarillion>, I<Unfinished Tales>, and I<The Tale of -the Children of Hurin>, all but the first posthumously assembled by CJRT. -But I<The Lord of the Rings> itself is perfectly fine and probably best to -quote from, provided you can find a suitable quote there. 
+Other JRRT books fair game for quotes would thus include I<The +Adventures of Tom Bombadil>, I<The Silmarillion>, I<Unfinished Tales>, +and I<The Tale of the Children of Hurin>, all but the first +posthumously assembled by CJRT. But I<The Lord of the Rings> itself is +perfectly fine and probably best to quote from, provided you can find a +suitable quote there. -So if you were to supply a new, complete, top-level source file to add to -Perl, you should conform to this peculiar practice by yourself selecting an -appropriate quotation from Tolkien, retaining the original spelling and -punctuation and using the same format the rest of the quotes are in. -Indirect and oblique is just fine; remember, it's a metaphor, so being meta -is, after all, what it's for. +So if you were to supply a new, complete, top-level source file to add +to Perl, you should conform to this peculiar practice by yourself +selecting an appropriate quotation from Tolkien, retaining the original +spelling and punctuation and using the same format the rest of the +quotes are in. Indirect and oblique is just fine; remember, it's a +metaphor, so being meta is, after all, what it's for. =head1 AUTHOR -This document was written by Nathan Torkington, and is maintained by -the perl5-porters mailing list. - -=head1 SEE ALSO +This document was originally written by Nathan Torkington, and is +maintained by the perl5-porters mailing list. -L<perlrepository> diff --git a/gnu/usr.bin/perl/pod/perlhacktips.pod b/gnu/usr.bin/perl/pod/perlhacktips.pod new file mode 100644 index 00000000000..bb995f33005 --- /dev/null +++ b/gnu/usr.bin/perl/pod/perlhacktips.pod @@ -0,0 +1,1461 @@ + +=encoding utf8 + +=for comment +Consistent formatting of this file is achieved with: + perl ./Porting/podtidy pod/perlhacktips.pod + +=head1 NAME + +perlhacktips - Tips for Perl core C code hacking + +=head1 DESCRIPTION + +This document will help you learn the best way to go about hacking on +the Perl core C code. 
It covers common problems, debugging, profiling, +and more. + +If you haven't read L<perlhack> and L<perlhacktut> yet, you might want +to do that first. + +=head1 COMMON PROBLEMS + +Perl source plays by ANSI C89 rules: no C99 (or C++) extensions. In +some cases we have to take pre-ANSI requirements into consideration. +You don't care about some particular platform having broken Perl? I +hear there is still a strong demand for J2EE programmers. + +=head2 Perl environment problems + +=over 4 + +=item * + +Not compiling with threading + +Compiling with threading (-Duseithreads) completely rewrites the +function prototypes of Perl. You had better try your changes with that. +Related to this is the difference between "Perl_-less" and "Perl_-ly" +APIs, for example: + + Perl_sv_setiv(aTHX_ ...); + sv_setiv(...); + +The first one explicitly passes in the context, which is needed for +e.g. threaded builds. The second one does that implicitly; do not get +them mixed. If you are not passing in an aTHX_, you will need to do a +dTHX (or a dVAR) as the first thing in the function. + +See L<perlguts/"How multiple interpreters and concurrency are +supported"> for further discussion about context. + +=item * + +Not compiling with -DDEBUGGING + +The DEBUGGING define exposes more code to the compiler, therefore more +ways for things to go wrong. You should try it. + +=item * + +Introducing (non-read-only) globals + +Do not introduce any modifiable globals, truly global or file static. +They are bad form and complicate multithreading and other forms of +concurrency. The right way is to introduce them as new interpreter +variables, see F<intrpvar.h> (at the very end for binary +compatibility). + +Introducing read-only (const) globals is okay, as long as you verify +with e.g. C<nm libperl.a|egrep -v ' [TURtr] '> (if your C<nm> has +BSD-style output) that the data you added really is read-only. (If it +is, it shouldn't show up in the output of that command.)
+ +If you want to have static strings, make them constant: + + static const char etc[] = "..."; + +If you want to have arrays of constant strings, note carefully the +right combination of C<const>s: + + static const char * const yippee[] = + {"hi", "ho", "silver"}; + +There is a way to completely hide any modifiable globals (they are all +moved to heap), the compilation setting +C<-DPERL_GLOBAL_STRUCT_PRIVATE>. It is not normally used, but can be +used for testing, read more about it in L<perlguts/"Background and +PERL_IMPLICIT_CONTEXT">. + +=item * + +Not exporting your new function + +Some platforms (Win32, AIX, VMS, OS/2, to name a few) require any +function that is part of the public API (the shared Perl library) to be +explicitly marked as exported. See the discussion about F<embed.pl> in +L<perlguts>. + +=item * + +Exporting your new function + +The new shiny result of either genuine new functionality or your +arduous refactoring is now ready and correctly exported. So what could +possibly go wrong? + +Maybe simply that your function did not need to be exported in the +first place. Perl has a long and not so glorious history of exporting +functions that it should not have. + +If the function is used only inside one source code file, make it +static. See the discussion about F<embed.pl> in L<perlguts>. + +If the function is used across several files, but intended only for +Perl's internal use (and this should be the common case), do not export +it to the public API. See the discussion about F<embed.pl> in +L<perlguts>. + +=back + +=head2 Portability problems + +The following are common causes of compilation and/or execution +failures, not common to Perl as such. The C FAQ is good bedtime +reading. Please test your changes with as many C compilers and +platforms as possible; we will, anyway, and it's nice to save oneself +from public embarrassment. + +If using gcc, you can add the C<-std=c89> option which will hopefully +catch most of these unportabilities. 
(However it might also catch +incompatibilities in your system's header files.) + +Use the Configure C<-Dgccansipedantic> flag to enable the gcc C<-ansi +-pedantic> flags which enforce stricter ANSI rules. + +If using the C<gcc -Wall> note that not all the possible warnings (like +C<-Wuninitialized>) are given unless you also compile with C<-O>. + +Note that if using gcc, starting from Perl 5.9.5 the Perl core source +code files (the ones at the top level of the source code distribution, +but not e.g. the extensions under ext/) are automatically compiled with +as many as possible of the C<-std=c89>, C<-ansi>, C<-pedantic>, and a +selection of C<-W> flags (see cflags.SH). + +Also study L<perlport> carefully to avoid any bad assumptions about the +operating system, filesystems, and so forth. + +You may once in a while try a "make microperl" to see whether we can +still compile Perl with just the bare minimum of interfaces. (See +README.micro.) + +Do not assume an operating system indicates a certain compiler. + +=over 4 + +=item * + +Casting pointers to integers or casting integers to pointers + + void castaway(U8* p) + { + IV i = p; + +or + + void castaway(U8* p) + { + IV i = (IV)p; + +Both are bad, and broken, and unportable. Use the PTR2IV() macro that +does it right. (Likewise, there are PTR2UV(), PTR2NV(), INT2PTR(), and +NUM2PTR().) + +=item * + +Casting between function pointers and data pointers + +Technically speaking casting between function pointers and data +pointers is unportable and undefined, but practically speaking it seems +to work, but you should use the FPTR2DPTR() and DPTR2FPTR() macros. +Sometimes you can also play games with unions. + +=item * + +Assuming sizeof(int) == sizeof(long) + +There are platforms where longs are 64 bits, and platforms where ints +are 64 bits, and while we are out to shock you, even platforms where +shorts are 64 bits. This is all legal according to the C standard.
(In +other words, "long long" is not a portable way to specify 64 bits, and +"long long" is not even guaranteed to be any wider than "long".) + +Instead, use the definitions IV, UV, IVSIZE, I32SIZE, and so forth. +Avoid things like I32 because they are B<not> guaranteed to be +I<exactly> 32 bits, they are I<at least> 32 bits, nor are they +guaranteed to be B<int> or B<long>. If you really explicitly need +64-bit variables, use I64 and U64, but only if guarded by HAS_QUAD. + +=item * + +Assuming one can dereference any type of pointer for any type of data + + char *p = ...; + long pony = *(long *)p; /* BAD */ + +Many platforms, quite rightly so, will give you a core dump instead of +a pony if the p happens not to be correctly aligned. + +=item * + +Lvalue casts + + (int)*p = ...; /* BAD */ + +Simply not portable. Get your lvalue to be of the right type, or maybe +use temporary variables, or dirty tricks with unions. + +=item * + +Assuming B<anything> about structs (especially the ones you don't +control, like the ones coming from the system headers) + +=over 8 + +=item * + +That a certain field exists in a struct + +=item * + +That no other fields exist besides the ones you know of + +=item * + +That a field is of certain signedness, sizeof, or type + +=item * + +That the fields are in a certain order + +=over 8 + +=item * + +While C guarantees the ordering specified in the struct definition, +between different platforms the definitions might differ + +=back + +=item * + +That the sizeof(struct) or the alignments are the same everywhere + +=over 8 + +=item * + +There might be padding bytes between the fields to align the fields - +the bytes can be anything + +=item * + +Structs are required to be aligned to the maximum alignment required by +the fields - which for native types is usually equivalent to +sizeof() of the field + +=back + +=back + +=item * + +Assuming the character set is ASCIIish + +Perl can compile and run under EBCDIC platforms. See L<perlebcdic>.
+This is transparent for the most part, but because the character sets +differ, you shouldn't use numeric (decimal, octal, nor hex) constants +to refer to characters. You can safely say 'A', but not 0x41. You can +safely say '\n', but not \012. If a character doesn't have a trivial +input form, you can create a #define for it in both C<utfebcdic.h> and +C<utf8.h>, so that it resolves to different values depending on the +character set being used. (There are three different EBCDIC character +sets defined in C<utfebcdic.h>, so it might be best to insert the +#define three times in that file.) + +Also, the range 'A' - 'Z' in ASCII is an unbroken sequence of 26 upper +case alphabetic characters. That is not true in EBCDIC. Nor for 'a' to +'z'. But '0' - '9' is an unbroken range in both systems. Don't assume +anything about other ranges. + +Many of the comments in the existing code ignore the possibility of +EBCDIC, and may be wrong therefore, even if the code works. This is +actually a tribute to the successful transparent insertion of being +able to handle EBCDIC without having to change pre-existing code. + +UTF-8 and UTF-EBCDIC are two different encodings used to represent +Unicode code points as sequences of bytes. Macros with the same names +(but different definitions) in C<utf8.h> and C<utfebcdic.h> are used to +allow the calling code to think that there is only one such encoding. +This is almost always referred to as C<utf8>, but it means the EBCDIC +version as well. Again, comments in the code may well be wrong even if +the code itself is right. For example, the concept of C<invariant +characters> differs between ASCII and EBCDIC. On ASCII platforms, only +characters that do not have the high-order bit set (i.e. whose ordinals +are strict ASCII, 0 - 127) are invariant, and the documentation and +comments in the code may assume that, often referring to something +like, say, C<hibit>. 
The situation differs and is not so simple on +EBCDIC machines, but as long as the code itself uses the +C<NATIVE_IS_INVARIANT()> macro appropriately, it works, even if the +comments are wrong. + +=item * + +Assuming the character set is just ASCII + +ASCII is a 7 bit encoding, but bytes have 8 bits in them. The 128 extra +characters have different meanings depending on the locale. Absent a +locale, currently these extra characters are generally considered to be +unassigned, and this has presented some problems. This is being changed +starting in 5.12 so that these characters will be considered to be +Latin-1 (ISO-8859-1). + +=item * + +Mixing #define and #ifdef + + #define BURGLE(x) ... \ + #ifdef BURGLE_OLD_STYLE /* BAD */ + ... do it the old way ... \ + #else + ... do it the new way ... \ + #endif + +You cannot portably "stack" cpp directives. For example in the above +you need two separate BURGLE() #defines, one for each #ifdef branch. + +=item * + +Adding non-comment stuff after #endif or #else + + #ifdef SNOSH + ... + #else !SNOSH /* BAD */ + ... + #endif SNOSH /* BAD */ + +The #endif and #else cannot portably have anything non-comment after +them. If you want to document what is going on (which is a good idea +especially if the branches are long), use (C) comments: + + #ifdef SNOSH + ... + #else /* !SNOSH */ + ... + #endif /* SNOSH */ + +The gcc option C<-Wendif-labels> warns about the bad variant (by +default on starting from Perl 5.9.4). + +=item * + +Having a comma after the last element of an enum list + + enum color { + CERULEAN, + CHARTREUSE, + CINNABAR, /* BAD */ + }; + +is not portable. Leave out the last comma. + +Also note that whether enums are implicitly morphable to ints varies +between compilers, you might need to (int). + +=item * + +Using //-comments + + // This function bamfoodles the zorklator. /* BAD */ + +That is C99 or C++. Perl is C89.
Using the //-comments is silently +allowed by many C compilers but cranking up the ANSI C89 strictness +(which we like to do) causes the compilation to fail. + +=item * + +Mixing declarations and code + + void zorklator() + { + int n = 3; + set_zorkmids(n); /* BAD */ + int q = 4; + +That is C99 or C++. Some C compilers allow that, but you shouldn't. + +The gcc option C<-Wdeclaration-after-statement> scans for such +problems (by default on starting from Perl 5.9.4). + +=item * + +Introducing variables inside for() + + for(int i = ...; ...; ...) { /* BAD */ + +That is C99 or C++. While it would indeed be awfully nice to have that +also in C89, to limit the scope of the loop variable, alas, we cannot. + +=item * + +Mixing signed char pointers with unsigned char pointers + + int foo(char *s) { ... } + ... + unsigned char *t = ...; /* Or U8* t = ... */ + foo(t); /* BAD */ + +While this is legal practice, it is certainly dubious, and downright +fatal in at least one platform: for example VMS cc considers this a +fatal error. One cause for people often making this mistake is that a +"naked char" and therefore dereferencing a "naked char pointer" have an +undefined signedness: it depends on the compiler and the flags of the +compiler and the underlying platform whether the result is signed or +unsigned. For this very same reason using a 'char' as an array index is +bad. + +=item * + +Macros that have string constants and their arguments as substrings of +the string constants + + #define FOO(n) printf("number = %d\n", n) /* BAD */ + FOO(10); + +Pre-ANSI semantics for that was equivalent to + + printf("10umber = %d\10"); + +which is probably not what you were expecting. Unfortunately at least +one reasonably common and modern C compiler does "real backward +compatibility" here, in AIX that is what still happens even though the +rest of the AIX compiler is very happily C89.
+ +=item * + +Using printf formats for non-basic C types + + IV i = ...; + printf("i = %d\n", i); /* BAD */ + +While this might by accident work in some platform (where IV happens to +be an C<int>), in general it cannot. IV might be something larger. The +situation is even worse with more specific types (defined by Perl's +configuration step in F<config.h>): + + Uid_t who = ...; + printf("who = %d\n", who); /* BAD */ + +The problem here is that Uid_t might be not only not C<int>-wide but it +might also be unsigned, in which case large uids would be printed as +negative values. + +There is no simple solution to this because of printf()'s limited +intelligence, but for many types the right format is available with +either an 'f' or '_f' suffix, for example: + + IVdf /* IV in decimal */ + UVxf /* UV in hexadecimal */ + + printf("i = %"IVdf"\n", i); /* The IVdf is a string constant. */ + + Uid_t_f /* Uid_t in decimal */ + + printf("who = %"Uid_t_f"\n", who); + +Or you can try casting to a "wide enough" type: + + printf("i = %"IVdf"\n", (IV)something_very_small_and_signed); + +Also remember that the C<%p> format really does require a void pointer: + + U8* p = ...; + printf("p = %p\n", (void*)p); + +The gcc option C<-Wformat> scans for such problems. + +=item * + +Blindly using variadic macros + +gcc has had them for a while with its own syntax, and C99 brought them +with a standardized syntax. Don't use the former, and use the latter +only if the HAS_C99_VARIADIC_MACROS is defined. + +=item * + +Blindly passing va_list + +Not all platforms support passing va_list to further varargs (stdarg) +functions. The right thing to do is to copy the va_list using the +Perl_va_copy() if the NEED_VA_COPY is defined. + +=item * + +Using gcc statement expressions + + val = ({...;...;...}); /* BAD */ + +While a nice extension, it's not portable. The Perl code does +admittedly use them if available to gain some extra speed (essentially +as a funky form of inlining), but you shouldn't.
+ +=item * + +Binding together several statements in a macro + +Use the macros STMT_START and STMT_END. + + STMT_START { + ... + } STMT_END + +=item * + +Testing for operating systems or versions when you should be testing for +features + + #ifdef __FOONIX__ /* BAD */ + foo = quux(); + #endif + +Unless you know with 100% certainty that quux() is only ever available +for the "Foonix" operating system B<and> that it is available B<and> +correctly working for B<all> past, present, B<and> future versions of +"Foonix", the above is very wrong. This is more correct (though still +not perfect, because the below is a compile-time check): + + #ifdef HAS_QUUX + foo = quux(); + #endif + +How does the HAS_QUUX become defined where it needs to be? Well, if +Foonix happens to be Unixy enough to be able to run the Configure +script, and Configure has been taught about detecting and testing +quux(), the HAS_QUUX will be correctly defined. In other platforms, the +corresponding configuration step will hopefully do the same. + +In a pinch, if you cannot wait for Configure to be educated, or if you +have a good hunch of where quux() might be available, you can +temporarily try the following: + + #if (defined(__FOONIX__) || defined(__BARNIX__)) + # define HAS_QUUX + #endif + + ... + + #ifdef HAS_QUUX + foo = quux(); + #endif + +But in any case, try to keep the features and operating systems +separate. + +=back + +=head2 Problematic System Interfaces + +=over 4 + +=item * + +malloc(0), realloc(0), calloc(0, 0) are non-portable. To be portable +allocate at least one byte. (In general you should rarely need to work +at this low level, but instead use the various malloc wrappers.) + +=item * + +snprintf() - the return type is unportable. Use my_snprintf() instead. + +=back + +=head2 Security problems + +Last but not least, here are various tips for safer coding. + +=over 4 + +=item * + +Do not use gets() + +Or we will publicly ridicule you. Seriously.
+ +=item * + +Do not use strcpy() or strcat() or strncpy() or strncat() + +Use my_strlcpy() and my_strlcat() instead: they either use the native +implementation, or Perl's own implementation (borrowed from the public +domain implementation of INN). + +=item * + +Do not use sprintf() or vsprintf() + +If you really want just plain byte strings, use my_snprintf() and +my_vsnprintf() instead, which will try to use snprintf() and +vsnprintf() if those safer APIs are available. If you want something +fancier than a plain byte string, use SVs and Perl_sv_catpvf(). + +=back + +=head1 DEBUGGING + +You can compile a special debugging version of Perl, which allows you +to use the C<-D> option of Perl to tell more about what Perl is doing. +But sometimes there is no alternative than to dive in with a debugger, +either to see the stack trace of a core dump (very useful in a bug +report), or trying to figure out what went wrong before the core dump +happened, or how did we end up having wrong or unexpected results. + +=head2 Poking at Perl + +To really poke around with Perl, you'll probably want to build Perl for +debugging, like this: + + ./Configure -d -D optimize=-g + make + +C<-g> is a flag to the C compiler to have it produce debugging +information which will allow us to step through a running program, and +to see in which C function we are at (without the debugging information +we might see only the numerical addresses of the functions, which is +not very helpful). + +F<Configure> will also turn on the C<DEBUGGING> compilation symbol +which enables all the internal debugging code in Perl. There are a +whole bunch of things you can debug with this: L<perlrun> lists them +all, and the best way to find out about them is to play about with +them. 
The most useful options are probably + + l Context (loop) stack processing + t Trace execution + o Method and overloading resolution + c String/numeric conversions + +Some of the functionality of the debugging code can be achieved using +XS modules. + + -Dr => use re 'debug' + -Dx => use O 'Debug' + +=head2 Using a source-level debugger + +If the debugging output of C<-D> doesn't help you, it's time to step +through perl's execution with a source-level debugger. + +=over 3 + +=item * + +We'll use C<gdb> for our examples here; the principles will apply to +any debugger (many vendors call their debugger C<dbx>), but check the +manual of the one you're using. + +=back + +To fire up the debugger, type + + gdb ./perl + +Or if you have a core dump: + + gdb ./perl core + +You'll want to do that in your Perl source tree so the debugger can +read the source code. You should see the copyright message, followed by +the prompt. + + (gdb) + +C<help> will get you into the documentation, but here are the most +useful commands: + +=over 3 + +=item * run [args] + +Run the program with the given arguments. + +=item * break function_name + +=item * break source.c:xxx + +Tells the debugger that we'll want to pause execution when we reach +either the named function (but see L<perlguts/Internal Functions>!) or +the given line in the named source file. + +=item * step + +Steps through the program a line at a time. + +=item * next + +Steps through the program a line at a time, without descending into +functions. + +=item * continue + +Run until the next breakpoint. + +=item * finish + +Run until the end of the current function, then stop again. + +=item * 'enter' + +Just pressing Enter will do the most recent operation again - it's a +blessing when stepping through miles of source code. + +=item * print + +Execute the given C code and print its results. B<WARNING>: Perl makes +heavy use of macros, and F<gdb> does not necessarily support macros +(see later L</"gdb macro support">). 
You'll have to substitute them +yourself, or to invoke cpp on the source code files (see L</"The .i +Targets">). So, for instance, you can't say + + print SvPV_nolen(sv) + +but you have to say + + print Perl_sv_2pv_nolen(sv) + +=back + +You may find it helpful to have a "macro dictionary", which you can +produce by saying C<cpp -dM perl.c | sort>. Even then, F<cpp> won't +recursively apply those macros for you. + +=head2 gdb macro support + +Recent versions of F<gdb> have fairly good macro support, but in order +to use it you'll need to compile perl with macro definitions included +in the debugging information. Using F<gcc> version 3.1, this means +configuring with C<-Doptimize=-g3>. Other compilers might use a +different switch (if they support debugging macros at all). + +=head2 Dumping Perl Data Structures + +One way to get around this macro hell is to use the dumping functions +in F<dump.c>; these work a little like an internal +L<Devel::Peek|Devel::Peek>, but they also cover OPs and other +structures that you can't get at from Perl. Let's take an example. +We'll use the C<$a = $b + $c> we used before, but give it a bit of +context: C<$b = "6XXXX"; $c = 2.3;>. Where's a good place to stop and +poke around? + +What about C<pp_add>, the function we examined earlier to implement the +C<+> operator: + + (gdb) break Perl_pp_add + Breakpoint 1 at 0x46249f: file pp_hot.c, line 309. + +Notice we use C<Perl_pp_add> and not C<pp_add> - see +L<perlguts/Internal Functions>.
With the breakpoint in place, we can +run our program: + + (gdb) run -e '$b = "6XXXX"; $c = 2.3; $a = $b + $c' + +Lots of junk will go past as gdb reads in the relevant source files and +libraries, and then: + + Breakpoint 1, Perl_pp_add () at pp_hot.c:309 + 309 dSP; dATARGET; tryAMAGICbin(add,opASSIGN); + (gdb) step + 311 dPOPTOPnnrl_ul; + (gdb) + +We looked at this bit of code before, and we said that +C<dPOPTOPnnrl_ul> arranges for two C<NV>s to be placed into C<left> and +C<right> - let's slightly expand it: + + #define dPOPTOPnnrl_ul NV right = POPn; \ + SV *leftsv = TOPs; \ + NV left = USE_LEFT(leftsv) ? SvNV(leftsv) : 0.0 + +C<POPn> takes the SV from the top of the stack and obtains its NV +either directly (if C<SvNOK> is set) or by calling the C<sv_2nv> +function. C<TOPs> takes the next SV from the top of the stack - yes, +C<POPn> uses C<TOPs> - but doesn't remove it. We then use C<SvNV> to +get the NV from C<leftsv> in the same way as before - yes, C<POPn> uses +C<SvNV>. + +Since we don't have an NV for C<$b>, we'll have to use C<sv_2nv> to +convert it. If we step again, we'll find ourselves there: + + Perl_sv_2nv (sv=0xa0675d0) at sv.c:1669 + 1669 if (!sv) + (gdb) + +We can now use C<Perl_sv_dump> to investigate the SV: + + SV = PV(0xa057cc0) at 0xa0675d0 + REFCNT = 1 + FLAGS = (POK,pPOK) + PV = 0xa06a510 "6XXXX"\0 + CUR = 5 + LEN = 6 + $1 = void + +We know we're going to get C<6> from this, so let's finish the +subroutine: + + (gdb) finish + Run till exit from #0 Perl_sv_2nv (sv=0xa0675d0) at sv.c:1671 + 0x462669 in Perl_pp_add () at pp_hot.c:311 + 311 dPOPTOPnnrl_ul; + +We can also dump out this op: the current op is always stored in +C<PL_op>, and we can dump it with C<Perl_op_dump>. This'll give us +similar output to L<B::Debug|B::Debug>. 
+ + { + 13 TYPE = add ===> 14 + TARG = 1 + FLAGS = (SCALAR,KIDS) + { + TYPE = null ===> (12) + (was rv2sv) + FLAGS = (SCALAR,KIDS) + { + 11 TYPE = gvsv ===> 12 + FLAGS = (SCALAR) + GV = main::b + } + } + +# finish this later # + +=head1 SOURCE CODE STATIC ANALYSIS + +Various tools exist for analysing C source code B<statically>, as +opposed to B<dynamically>, that is, without executing the code. It is +possible to detect resource leaks, undefined behaviour, type +mismatches, portability problems, code paths that would cause illegal +memory accesses, and other similar problems by just parsing the C code +and looking at the resulting graph to see what it tells about the +execution and data flows. As a matter of fact, this is exactly how C +compilers know to give warnings about dubious code. + +=head2 lint, splint + +The good old C code quality inspector, C<lint>, is available in several +platforms, but please be aware that there are several different +implementations of it by different vendors, which means that the flags +are not identical across different platforms. + +There is a lint variant called C<splint> (Secure Programming Lint) +available from http://www.splint.org/ that should compile on any +Unix-like platform. + +There are C<lint> and C<splint> targets in Makefile, but you may have to +diddle with the flags (see above). + +=head2 Coverity + +Coverity (http://www.coverity.com/) is a product similar to lint and as +a testbed for their product they periodically check several open source +projects, and they give out accounts to open source developers to the +defect databases. + +=head2 cpd (cut-and-paste detector) + +The cpd tool detects cut-and-paste coding. If one instance of the +cut-and-pasted code changes, all the other spots should probably be +changed, too. Therefore such code should probably be turned into a +subroutine or a macro. + +cpd (http://pmd.sourceforge.net/cpd.html) is part of the pmd project +(http://pmd.sourceforge.net/).
pmd was originally written for static +analysis of Java code, but later the cpd part of it was extended to +parse also C and C++. + +Download the pmd-bin-X.Y.zip () from the SourceForge site, extract the +pmd-X.Y.jar from it, and then run that on source code thusly: + + java -cp pmd-X.Y.jar net.sourceforge.pmd.cpd.CPD --minimum-tokens 100 --files /some/where/src --language c > cpd.txt + +You may run into memory limits, in which case you should use the -Xmx +option: + + java -Xmx512M ... + +=head2 gcc warnings + +Though much can be written about the inconsistency and coverage +problems of gcc warnings (like C<-Wall> not meaning "all the warnings", +or some common portability problems not being covered by C<-Wall>, or +C<-ansi> and C<-pedantic> both being a poorly defined collection of +warnings, and so forth), gcc is still a useful tool in keeping our +coding nose clean. + +The C<-Wall> is by default on. + +The C<-ansi> (and its sidekick, C<-pedantic>) would be nice to be on +always, but unfortunately they are not safe on all platforms, they can +for example cause fatal conflicts with the system headers (Solaris +being a prime example). If Configure C<-Dgccansipedantic> is used, the +C<cflags> frontend selects C<-ansi -pedantic> for the platforms where +they are known to be safe. 
+ +Starting from Perl 5.9.4 the following extra flags are added: + +=over 4 + +=item * + +C<-Wendif-labels> + +=item * + +C<-Wextra> + +=item * + +C<-Wdeclaration-after-statement> + +=back + +The following flags would be nice to have but they would first need +their own Augean stablemaster: + +=over 4 + +=item * + +C<-Wpointer-arith> + +=item * + +C<-Wshadow> + +=item * + +C<-Wstrict-prototypes> + +=back + +The C<-Wtraditional> is another example of the annoying tendency of gcc +to bundle a lot of warnings under one switch (it would be impossible to +deploy in practice because it would complain a lot) but it does contain +some warnings that would be beneficial to have available on their own, +such as the warning about string constants inside macros containing the +macro arguments: this behaved differently pre-ANSI than it does in +ANSI, and some C compilers are still in transition, AIX being an +example. + +=head2 Warnings of other C compilers + +Other C compilers (yes, there B<are> other C compilers than gcc) often +have their "strict ANSI" or "strict ANSI with some portability +extensions" modes on, like for example the Sun Workshop has its C<-Xa> +mode on (though implicitly), or the DEC (these days, HP...) has its +C<-std1> mode on. + +=head1 MEMORY DEBUGGERS + +B<NOTE 1>: Running under memory debuggers such as Purify, valgrind, or +Third Degree greatly slows down the execution: seconds become minutes, +minutes become hours. For example as of Perl 5.8.1, the +ext/Encode/t/Unicode.t takes extraordinarily long to complete under +e.g. Purify, Third Degree, and valgrind. Under valgrind it takes more +than six hours, even on a snappy computer. The said test must be doing +something that is quite unfriendly for memory debuggers. If you don't +feel like waiting, you can simply kill the perl process.
+ +B<NOTE 2>: To minimize the number of memory leak false alarms (see +L</PERL_DESTRUCT_LEVEL> for more information), you have to set the +environment variable PERL_DESTRUCT_LEVEL to 2. + +For csh-like shells: + + setenv PERL_DESTRUCT_LEVEL 2 + +For Bourne-type shells: + + PERL_DESTRUCT_LEVEL=2 + export PERL_DESTRUCT_LEVEL + +In Unixy environments you can also use the C<env> command: + + env PERL_DESTRUCT_LEVEL=2 valgrind ./perl -Ilib ... + +B<NOTE 3>: There are known memory leaks when there are compile-time +errors within eval or require, seeing C<S_doeval> in the call stack is +a good sign of these. Fixing these leaks is non-trivial, unfortunately, +but they must be fixed eventually. + +B<NOTE 4>: L<DynaLoader> will not clean up after itself completely +unless Perl is built with the Configure option +C<-Accflags=-DDL_UNLOAD_ALL_AT_EXIT>. + +=head2 Rational Software's Purify + +Purify is a commercial tool that is helpful in identifying memory +overruns, wild pointers, memory leaks and other such badness. Perl must +be compiled in a specific way for optimal testing with Purify. Purify +is available under Windows NT, Solaris, HP-UX, SGI, and Siemens Unix. + +=head3 Purify on Unix + +On Unix, Purify creates a new Perl binary. To get the most benefit out +of Purify, you should create the perl to Purify using: + + sh Configure -Accflags=-DPURIFY -Doptimize='-g' \ + -Uusemymalloc -Dusemultiplicity + +where these arguments mean: + +=over 4 + +=item * -Accflags=-DPURIFY + +Disables Perl's arena memory allocation functions, as well as forcing +use of memory allocation functions derived from the system malloc. + +=item * -Doptimize='-g' + +Adds debugging information so that you see the exact source statements +where the problem occurs. Without this flag, all you will see is the +source filename of where the error occurred. + +=item * -Uusemymalloc + +Disable Perl's malloc so that Purify can more closely monitor +allocations and leaks. 
Using Perl's malloc will make Purify report most +leaks in the "potential" leaks category. + +=item * -Dusemultiplicity + +Enabling the multiplicity option allows perl to clean up thoroughly +when the interpreter shuts down, which reduces the number of bogus leak +reports from Purify. + +=back + +Once you've compiled a perl suitable for Purify'ing, then you can just: + + make pureperl + +which creates a binary named 'pureperl' that has been Purify'ed. This +binary is used in place of the standard 'perl' binary when you want to +debug Perl memory problems. + +As an example, to show any memory leaks produced during the standard +Perl testset you would create and run the Purify'ed perl as: + + make pureperl + cd t + ../pureperl -I../lib harness + +which would run Perl on test.pl and report any memory problems. + +Purify outputs messages in "Viewer" windows by default. If you don't +have a windowing environment or if you simply want the Purify output to +unobtrusively go to a log file instead of to the interactive window, +use these following options to output to the log file "perl.log": + + setenv PURIFYOPTIONS "-chain-length=25 -windows=no \ + -log-file=perl.log -append-logfile=yes" + +If you plan to use the "Viewer" windows, then you only need this +option: + + setenv PURIFYOPTIONS "-chain-length=25" + +In Bourne-type shells: + + PURIFYOPTIONS="..." + export PURIFYOPTIONS + +or if you have the "env" utility: + + env PURIFYOPTIONS="..." ../pureperl ... + +=head3 Purify on NT + +Purify on Windows NT instruments the Perl binary 'perl.exe' on the fly. 
+ There are several options in the makefile you should change to get the +most use out of Purify: + +=over 4 + +=item * DEFINES + +You should add -DPURIFY to the DEFINES line so the DEFINES line looks +something like: + + DEFINES = -DWIN32 -D_CONSOLE -DNO_STRICT $(CRYPT_FLAG) -DPURIFY=1 + +to disable Perl's arena memory allocation functions, as well as to +force use of memory allocation functions derived from the system +malloc. + +=item * USE_MULTI = define + +Enabling the multiplicity option allows perl to clean up thoroughly +when the interpreter shuts down, which reduces the number of bogus leak +reports from Purify. + +=item * #PERL_MALLOC = define + +Disable Perl's malloc so that Purify can more closely monitor +allocations and leaks. Using Perl's malloc will make Purify report most +leaks in the "potential" leaks category. + +=item * CFG = Debug + +Adds debugging information so that you see the exact source statements +where the problem occurs. Without this flag, all you will see is the +source filename of where the error occurred. + +=back + +As an example, to show any memory leaks produced during the standard +Perl testset you would create and run Purify as: + + cd win32 + make + cd ../t + purify ../perl -I../lib harness + +which would instrument Perl in memory, run Perl on test.pl, then +finally report any memory problems. + +=head2 valgrind + +The excellent valgrind tool can be used to find out both memory leaks +and illegal memory accesses. As of version 3.3.0, Valgrind only +supports Linux (on x86, x86-64 and PowerPC) and Darwin (OS X, on x86 and +x86-64). The special "test.valgrind" target can be used to run the +tests under valgrind. Found errors and memory leaks are logged in +files named F<testfile.valgrind>.
+ +Valgrind also provides a cachegrind tool, invoked on perl as: + + VG_OPTS=--tool=cachegrind make test.valgrind + +As system libraries (most notably glibc) are also triggering errors, +valgrind allows to suppress such errors using suppression files. The +default suppression file that comes with valgrind already catches a lot +of them. Some additional suppressions are defined in F<t/perl.supp>. + +To get valgrind and for more information see + + http://valgrind.org/ + +=head1 PROFILING + +Depending on your platform there are various ways of profiling Perl. + +There are two commonly used techniques of profiling executables: +I<statistical time-sampling> and I<basic-block counting>. + +The first method takes periodically samples of the CPU program counter, +and since the program counter can be correlated with the code generated +for functions, we get a statistical view of in which functions the +program is spending its time. The caveats are that very small/fast +functions have lower probability of showing up in the profile, and that +periodically interrupting the program (this is usually done rather +frequently, in the scale of milliseconds) imposes an additional +overhead that may skew the results. The first problem can be alleviated +by running the code for longer (in general this is a good idea for +profiling), the second problem is usually kept in guard by the +profiling tools themselves. + +The second method divides up the generated code into I<basic blocks>. +Basic blocks are sections of code that are entered only in the +beginning and exited only at the end. For example, a conditional jump +starts a basic block. Basic block profiling usually works by +I<instrumenting> the code by adding I<enter basic block #nnnn> +book-keeping code to the generated code. During the execution of the +code the basic block counters are then updated appropriately. 
The +caveat is that the added extra code can skew the results: again, the +profiling tools usually try to factor their own effects out of the +results. + +=head2 Gprof Profiling + +gprof is a profiling tool available in many Unix platforms; it uses +I<statistical time-sampling>. + +You can build a profiled version of perl called "perl.gprof" by +invoking the make target "perl.gprof" (What is required is that Perl +must be compiled using the C<-pg> flag, you may need to re-Configure). +Running the profiled version of Perl will create an output file called +F<gmon.out> which contains the profiling data collected +during the execution. + +The gprof tool can then display the collected data in various ways. +Usually gprof understands the following options: + +=over 4 + +=item * -a + +Suppress statically defined functions from the profile. + +=item * -b + +Suppress the verbose descriptions in the profile. + +=item * -e routine + +Exclude the given routine and its descendants from the profile. + +=item * -f routine + +Display only the given routine and its descendants in the profile. + +=item * -s + +Generate a summary file called F<gmon.sum> which then may be given to +subsequent gprof runs to accumulate data over several runs. + +=item * -z + +Display routines that have zero usage. + +=back + +For more detailed explanation of the available commands and output +formats, see your own local documentation of gprof. + +quick hint: + + $ sh Configure -des -Dusedevel -Doptimize='-pg' && make perl.gprof + $ ./perl.gprof someprog # creates gmon.out in current directory + $ gprof ./perl.gprof > out + $ view out + +=head2 GCC gcov Profiling + +Starting from GCC 3.0 I<basic block profiling> is officially available +for the GNU CC. + +You can build a profiled version of perl called F<perl.gcov> by +invoking the make target "perl.gcov" (what is required is that Perl must +be compiled using gcc with the flags C<-fprofile-arcs -ftest-coverage>, +you may need to re-Configure).
+ +Running the profiled version of Perl will cause profile output to be +generated. For each source file an accompanying ".da" file will be +created. + +To display the results you use the "gcov" utility (which should be +installed if you have gcc 3.0 or newer installed). F<gcov> is run on +source code files, like this + + gcov sv.c + +which will cause F<sv.c.gcov> to be created. The F<.gcov> files contain +the source code annotated with relative frequencies of execution +indicated by "#" markers. + +Useful options of F<gcov> include C<-b> which will summarise the basic +block, branch, and function call coverage, and C<-c> which instead of +relative frequencies will use the actual counts. For more information +on the use of F<gcov> and basic block profiling with gcc, see the +latest GNU CC manual, as of GCC 3.0 see + + http://gcc.gnu.org/onlinedocs/gcc-3.0/gcc.html + +and its section titled "8. gcov: a Test Coverage Program" + + http://gcc.gnu.org/onlinedocs/gcc-3.0/gcc_8.html#SEC132 + +quick hint: + + $ sh Configure -des -Dusedevel -Doptimize='-g' \ + -Accflags='-fprofile-arcs -ftest-coverage' \ + -Aldflags='-fprofile-arcs -ftest-coverage' && make perl.gcov + $ rm -f regexec.c.gcov regexec.gcda + $ ./perl.gcov + $ gcov regexec.c + $ view regexec.c.gcov + +=head1 MISCELLANEOUS TRICKS + +=head2 PERL_DESTRUCT_LEVEL + +If you want to run any of the tests yourself manually using e.g. +valgrind, or the pureperl or perl.third executables, please note that +by default perl B<does not> explicitly cleanup all the memory it has +allocated (such as global memory arenas) but instead lets the exit() of +the whole program "take care" of such allocations, also known as +"global destruction of objects". + +There is a way to tell perl to do complete cleanup: set the environment +variable PERL_DESTRUCT_LEVEL to a non-zero value. 
The t/TEST wrapper +does set this to 2, and this is what you need to do too, if you don't +want to see the "global leaks": For example, for "third-degreed" Perl: + + env PERL_DESTRUCT_LEVEL=2 ./perl.third -Ilib t/foo/bar.t + +(Note: the mod_perl apache module uses also this environment variable +for its own purposes and extended its semantics. Refer to the mod_perl +documentation for more information. Also, spawned threads do the +equivalent of setting this variable to the value 1.) + +If, at the end of a run you get the message I<N scalars leaked>, you +can recompile with C<-DDEBUG_LEAKING_SCALARS>, which will cause the +addresses of all those leaked SVs to be dumped along with details as to +where each SV was originally allocated. This information is also +displayed by Devel::Peek. Note that the extra details recorded with +each SV increases memory usage, so it shouldn't be used in production +environments. It also converts C<new_SV()> from a macro into a real +function, so you can use your favourite debugger to discover where +those pesky SVs were allocated. + +If you see that you're leaking memory at runtime, but neither valgrind +nor C<-DDEBUG_LEAKING_SCALARS> will find anything, you're probably +leaking SVs that are still reachable and will be properly cleaned up +during destruction of the interpreter. In such cases, using the C<-Dm> +switch can point you to the source of the leak. If the executable was +built with C<-DDEBUG_LEAKING_SCALARS>, C<-Dm> will output SV +allocations in addition to memory allocations. Each SV allocation has a +distinct serial number that will be written on creation and destruction +of the SV. So if you're executing the leaking code in a loop, you need +to look for SVs that are created, but never destroyed between each +cycle. If such an SV is found, set a conditional breakpoint within +C<new_SV()> and make it break only when C<PL_sv_serial> is equal to the +serial number of the leaking SV. 
Then you will catch the interpreter in +exactly the state where the leaking SV is allocated, which is +sufficient in many cases to find the source of the leak. + +As C<-Dm> is using the PerlIO layer for output, it will by itself +allocate quite a bunch of SVs, which are hidden to avoid recursion. You +can bypass the PerlIO layer if you use the SV logging provided by +C<-DPERL_MEM_LOG> instead. + +=head2 PERL_MEM_LOG + +If compiled with C<-DPERL_MEM_LOG>, both memory and SV allocations go +through logging functions, which is handy for breakpoint setting. + +Unless C<-DPERL_MEM_LOG_NOIMPL> is also compiled, the logging functions +read $ENV{PERL_MEM_LOG} to determine whether to log the event, and if +so how: + + $ENV{PERL_MEM_LOG} =~ /m/ Log all memory ops + $ENV{PERL_MEM_LOG} =~ /s/ Log all SV ops + $ENV{PERL_MEM_LOG} =~ /t/ include timestamp in Log + $ENV{PERL_MEM_LOG} =~ /^(\d+)/ write to FD given (default is 2) + +Memory logging is somewhat similar to C<-Dm> but is independent of +C<-DDEBUGGING>, and at a higher level; all uses of Newx(), Renew(), and +Safefree() are logged with the caller's source code file and line +number (and C function name, if supported by the C compiler). In +contrast, C<-Dm> is directly at the point of C<malloc()>. SV logging is +similar. + +Since the logging doesn't use PerlIO, all SV allocations are logged and +no extra SV allocations are introduced by enabling the logging. If +compiled with C<-DDEBUG_LEAKING_SCALARS>, the serial number for each SV +allocation is also logged. + +=head2 DDD over gdb + +Those debugging perl with the DDD frontend over gdb may find the +following useful: + +You can extend the data conversion shortcuts menu, so for example you +can display an SV's IV value with one click, without doing any typing. +To do that simply edit ~/.ddd/init file and add after: + + ! Display shortcuts. 
+ Ddd*gdbDisplayShortcuts: \ + /t () // Convert to Bin\n\ + /d () // Convert to Dec\n\ + /x () // Convert to Hex\n\ + /o () // Convert to Oct(\n\ + +the following two lines: + + ((XPV*) (())->sv_any )->xpv_pv // 2pvx\n\ + ((XPVIV*) (())->sv_any )->xiv_iv // 2ivx + +so now you can do ivx and pvx lookups or you can plug there the sv_peek +"conversion": + + Perl_sv_peek(my_perl, (SV*)()) // sv_peek + +(The my_perl is for threaded builds.) Just remember that every line, +but the last one, should end with \n\ + +Alternatively edit the init file interactively via: 3rd mouse button -> +New Display -> Edit Menu + +Note: you can define up to 20 conversion shortcuts in the gdb section. + +=head2 Poison + +If you see in a debugger a memory area mysteriously full of 0xABABABAB +or 0xEFEFEFEF, you may be seeing the effect of the Poison() macros, see +L<perlclib>. + +=head2 Read-only optrees + +Under ithreads the optree is read only. If you want to enforce this, to +check for write accesses from buggy code, compile with +C<-DPL_OP_SLAB_ALLOC> to enable the OP slab allocator and +C<-DPERL_DEBUG_READONLY_OPS> to enable code that allocates op memory +via C<mmap>, and sets it read-only at run time. Any write access to an +op results in a C<SIGBUS> and abort. + +This code is intended for development only, and may not be portable +even to all Unix variants. Also, it is an 80% solution, in that it +isn't able to make all ops read only. Specifically it + +=over + +=item * 1 + +Only sets read-only on all slabs of ops at C<CHECK> time, hence ops +allocated later via C<require> or C<eval> will be read-write. + +=item * 2 + +Turns an entire slab of ops read-write if the refcount of any op in the +slab needs to be decreased. + +=item * 3 + +Turns an entire slab of ops read-write if any op from the slab is +freed.
+ +=back + +It's not possible to turn the slabs to read-only after an action +requiring read-write access, as either can happen during op tree +building time, so there may still be legitimate write access. + +However, as an 80% solution it is still effective, as currently it +catches a write access during the generation of F<Config.pm>, which +means that we can't yet build F<perl> with this enabled. + +=head2 The .i Targets + +You can expand the macros in a F<foo.c> file by saying + + make foo.i + +which will expand the macros using cpp. Don't be scared by the results. + +=head1 AUTHOR + +This document was originally written by Nathan Torkington, and is +maintained by the perl5-porters mailing list. diff --git a/gnu/usr.bin/perl/pod/perlhacktut.pod b/gnu/usr.bin/perl/pod/perlhacktut.pod new file mode 100644 index 00000000000..33a9ef23e8d --- /dev/null +++ b/gnu/usr.bin/perl/pod/perlhacktut.pod @@ -0,0 +1,188 @@ +=encoding utf8 + +=for comment +Consistent formatting of this file is achieved with: + perl ./Porting/podtidy pod/perlhacktut.pod + +=head1 NAME + +perlhacktut - Walk through the creation of a simple C code patch + +=head1 DESCRIPTION + +This document takes you through a simple patch example. + +If you haven't read L<perlhack> yet, go do that first! You might also +want to read through L<perlsource> too. + +Once you're done here, check out L<perlhacktips> next. + +=head1 EXAMPLE OF A SIMPLE PATCH + +Let's take a simple patch from start to finish. + +Here's something Larry suggested: if a C<U> is the first active format +during a C<pack>, (for example, C<pack "U3C8", @stuff>) then the +resulting string should be treated as UTF-8 encoded. + +If you are working with a git clone of the Perl repository, you will +want to create a branch for your changes. This will make creating a +proper patch much simpler. See the L<perlgit> for details on how to do +this. + +=head2 Writing the patch + +How do we prepare to fix this up? 
First we locate the code in question +- the C<pack> happens at runtime, so it's going to be in one of the +F<pp> files. Sure enough, C<pp_pack> is in F<pp.c>. Since we're going +to be altering this file, let's copy it to F<pp.c~>. + +[Well, it was in F<pp.c> when this tutorial was written. It has now +been split off with C<pp_unpack> to its own file, F<pp_pack.c>] + +Now let's look over C<pp_pack>: we take a pattern into C<pat>, and then +loop over the pattern, taking each format character in turn into +C<datumtype>. Then for each possible format character, we swallow up +the other arguments in the pattern (a field width, an asterisk, and so +on) and convert the next chunk of input into the specified format, adding +it onto the output SV C<cat>. + +How do we know if the C<U> is the first format in the C<pat>? Well, if +we have a pointer to the start of C<pat> then, if we see a C<U> we can +test whether we're still at the start of the string. So, here's where +C<pat> is set up: + + STRLEN fromlen; + register char *pat = SvPVx(*++MARK, fromlen); + register char *patend = pat + fromlen; + register I32 len; + I32 datumtype; + SV *fromstr; + +We'll have another string pointer in there: + + STRLEN fromlen; + register char *pat = SvPVx(*++MARK, fromlen); + register char *patend = pat + fromlen; + + char *patcopy; + register I32 len; + I32 datumtype; + SV *fromstr; + +And just before we start the loop, we'll set C<patcopy> to be the start +of C<pat>: + + items = SP - MARK; + MARK++; + sv_setpvn(cat, "", 0); + + patcopy = pat; + while (pat < patend) { + +Now if we see a C<U> which was at the start of the string, we turn on +the C<UTF8> flag for the output SV, C<cat>: + + + if (datumtype == 'U' && pat==patcopy+1) + + SvUTF8_on(cat); + if (datumtype == '#') { + while (pat < patend && *pat != '\n') + pat++; + +Remember that it has to be C<patcopy+1> because the first character of +the string is the C<U> which has been swallowed into C<datumtype>! + +Oops, we forgot one thing:
what if there are spaces at the start of the +pattern? C<pack(" U*", @stuff)> will have C<U> as the first active +character, even though it's not the first thing in the pattern. In this +case, we have to advance C<patcopy> along with C<pat> when we see +spaces: + + if (isSPACE(datumtype)) + continue; + +needs to become + + if (isSPACE(datumtype)) { + patcopy++; + continue; + } + +OK. That's the C part done. Now we must do two additional things before +this patch is ready to go: we've changed the behaviour of Perl, and so +we must document that change. We must also provide some more regression +tests to make sure our patch works and doesn't create a bug somewhere +else along the line. + +=head2 Testing the patch + +The regression tests for each operator live in F<t/op/>, and so we make +a copy of F<t/op/pack.t> to F<t/op/pack.t~>. Now we can add our tests +to the end. First, we'll test that the C<U> does indeed create Unicode +strings. + +t/op/pack.t has a sensible ok() function, but if it didn't we could use +the one from t/test.pl. + + require './test.pl'; + plan( tests => 159 ); + +so instead of this: + + print 'not ' unless "1.20.300.4000" eq sprintf "%vd", + pack("U*",1,20,300,4000); + print "ok $test\n"; $test++; + +we can write the more sensible (see L<Test::More> for a full +explanation of is() and other testing functions). + + is( "1.20.300.4000", sprintf "%vd", pack("U*",1,20,300,4000), + "U* produces Unicode" ); + +Now we'll test that we got that space-at-the-beginning business right: + + is( "1.20.300.4000", sprintf "%vd", pack(" U*",1,20,300,4000), + " with spaces at the beginning" ); + +And finally we'll test that we don't make Unicode strings if C<U> is +B<not> the first active format: + + isnt( v1.20.300.4000, sprintf "%vd", pack("C0U*",1,20,300,4000), + "U* not first isn't Unicode" ); + +Mustn't forget to change the number of tests which appears at the top, +or else the automated tester will get confused. 
This will either look +like this: + + print "1..156\n"; + +or this: + + plan( tests => 156 ); + +We now compile up Perl, and run it through the test suite. Our new +tests pass, hooray! + +=head2 Documenting the patch + +Finally, the documentation. The job is never done until the paperwork +is over, so let's describe the change we've just made. The relevant +place is F<pod/perlfunc.pod>; again, we make a copy, and then we'll +insert this text in the description of C<pack>: + + =item * + + If the pattern begins with a C<U>, the resulting string will be treated + as UTF-8-encoded Unicode. You can force UTF-8 encoding on in a string + with an initial C<U0>, and the bytes that follow will be interpreted as + Unicode characters. If you don't want this to happen, you can begin + your pattern with C<C0> (or anything else) to force Perl not to UTF-8 + encode your string, and then follow this with a C<U*> somewhere in your + pattern. + +=head2 Submit + +See L<perlhack> for details on how to submit this patch. + +=head1 AUTHOR + +This document was originally written by Nathan Torkington, and is +maintained by the perl5-porters mailing list. diff --git a/gnu/usr.bin/perl/pod/perlinterp.pod b/gnu/usr.bin/perl/pod/perlinterp.pod new file mode 100644 index 00000000000..c7f21209de5 --- /dev/null +++ b/gnu/usr.bin/perl/pod/perlinterp.pod @@ -0,0 +1,742 @@ +=encoding utf8 + +=for comment +Consistent formatting of this file is achieved with: + perl ./Porting/podtidy pod/perlinterp.pod + +=head1 NAME + +perlinterp - An overview of the Perl interpreter + +=head1 DESCRIPTION + +This document provides an overview of how the Perl interpreter works at +the level of C code, along with pointers to the relevant C source code +files. + +=head1 ELEMENTS OF THE INTERPRETER + +The work of the interpreter has two main stages: compiling the code +into the internal representation, or bytecode, and then executing it. +L<perlguts/Compiled code> explains exactly how the compilation stage +happens. 
+ +Here is a short breakdown of perl's operation: + +=head2 Startup + +The action begins in F<perlmain.c> (or F<miniperlmain.c> for miniperl). +This is very high-level code, enough to fit on a single screen, and it +resembles the code found in L<perlembed>; most of the real action takes +place in F<perl.c>. + +F<perlmain.c> is generated by C<ExtUtils::Miniperl> from +F<miniperlmain.c> at make time, so you should make perl to follow this +along. + +First, F<perlmain.c> allocates some memory and constructs a Perl +interpreter, along these lines: + + 1 PERL_SYS_INIT3(&argc,&argv,&env); + 2 + 3 if (!PL_do_undump) { + 4 my_perl = perl_alloc(); + 5 if (!my_perl) + 6 exit(1); + 7 perl_construct(my_perl); + 8 PL_perl_destruct_level = 0; + 9 } + +Line 1 is a macro, and its definition is dependent on your operating +system. Line 3 references C<PL_do_undump>, a global variable - all +global variables in Perl start with C<PL_>. This tells you whether the +current running program was created with the C<-u> flag to perl and +then F<undump>, which means it's going to be false in any sane context. + +Line 4 calls a function in F<perl.c> to allocate memory for a Perl +interpreter. It's quite a simple function, and the guts of it looks +like this: + + my_perl = (PerlInterpreter*)PerlMem_malloc(sizeof(PerlInterpreter)); + +Here you see an example of Perl's system abstraction, which we'll see +later: C<PerlMem_malloc> is either your system's C<malloc>, or Perl's +own C<malloc> as defined in F<malloc.c> if you selected that option at +configure time. + +Next, in line 7, we construct the interpreter using perl_construct, +also in F<perl.c>; this sets up all the special variables that Perl +needs, the stacks, and so on.
+ +Now we pass Perl the command line options, and tell it to go: + + exitstatus = perl_parse(my_perl, xs_init, argc, argv, (char **)NULL); + if (!exitstatus) + perl_run(my_perl); + + exitstatus = perl_destruct(my_perl); + + perl_free(my_perl); + +C<perl_parse> is actually a wrapper around C<S_parse_body>, as defined +in F<perl.c>, which processes the command line options, sets up any +statically linked XS modules, opens the program and calls C<yyparse> to +parse it. + +=head2 Parsing + +The aim of this stage is to take the Perl source, and turn it into an +op tree. We'll see what one of those looks like later. Strictly +speaking, there's three things going on here. + +C<yyparse>, the parser, lives in F<perly.c>, although you're better off +reading the original YACC input in F<perly.y>. (Yes, Virginia, there +B<is> a YACC grammar for Perl!) The job of the parser is to take your +code and "understand" it, splitting it into sentences, deciding which +operands go with which operators and so on. + +The parser is nobly assisted by the lexer, which chunks up your input +into tokens, and decides what type of thing each token is: a variable +name, an operator, a bareword, a subroutine, a core function, and so +on. The main point of entry to the lexer is C<yylex>, and that and its +associated routines can be found in F<toke.c>. Perl isn't much like +other computer languages; it's highly context sensitive at times, it +can be tricky to work out what sort of token something is, or where a +token ends. As such, there's a lot of interplay between the tokeniser +and the parser, which can get pretty frightening if you're not used to +it. + +As the parser understands a Perl program, it builds up a tree of +operations for the interpreter to perform during execution. The +routines which construct and link together the various operations are +to be found in F<op.c>, and will be examined later. 
+ +=head2 Optimization + +Now the parsing stage is complete, and the finished tree represents the +operations that the Perl interpreter needs to perform to execute our +program. Next, Perl does a dry run over the tree looking for +optimisations: constant expressions such as C<3 + 4> will be computed +now, and the optimizer will also see if any multiple operations can be +replaced with a single one. For instance, to fetch the variable +C<$foo>, instead of grabbing the glob C<*foo> and looking at the scalar +component, the optimizer fiddles the op tree to use a function which +directly looks up the scalar in question. The main optimizer is C<peep> +in F<op.c>, and many ops have their own optimizing functions. + +=head2 Running + +Now we're finally ready to go: we have compiled Perl byte code, and all +that's left to do is run it. The actual execution is done by the +C<runops_standard> function in F<run.c>; more specifically, it's done +by these three innocent looking lines: + + while ((PL_op = PL_op->op_ppaddr(aTHX))) { + PERL_ASYNC_CHECK(); + } + +You may be more comfortable with the Perl version of that: + + PERL_ASYNC_CHECK() while $Perl::op = &{$Perl::op->{function}}; + +Well, maybe not. Anyway, each op contains a function pointer, which +stipulates the function which will actually carry out the operation. +This function will return the next op in the sequence - this allows for +things like C<if> which choose the next op dynamically at run time. The +C<PERL_ASYNC_CHECK> makes sure that things like signals interrupt +execution if required. + +The actual functions called are known as PP code, and they're spread +between four files: F<pp_hot.c> contains the "hot" code, which is most +often used and highly optimized, F<pp_sys.c> contains all the +system-specific functions, F<pp_ctl.c> contains the functions which +implement control structures (C<if>, C<while> and the like) and F<pp.c> +contains everything else. 
These are, if you like, the C code for Perl's +built-in functions and operators. + +Note that each C<pp_> function is expected to return a pointer to the +next op. Calls to perl subs (and eval blocks) are handled within the +same runops loop, and do not consume extra space on the C stack. For +example, C<pp_entersub> and C<pp_entertry> just push a C<CxSUB> or +C<CxEVAL> block struct onto the context stack which contain the address +of the op following the sub call or eval. They then return the first op +of that sub or eval block, and so execution continues of that sub or +block. Later, a C<pp_leavesub> or C<pp_leavetry> op pops the C<CxSUB> +or C<CxEVAL>, retrieves the return op from it, and returns it. + +=head2 Exception handling + +Perl's exception handling (i.e. C<die> etc.) is built on top of the +low-level C<setjmp()>/C<longjmp()> C-library functions. These basically +provide a way to capture the current PC and SP registers and later +restore them; i.e. a C<longjmp()> continues at the point in code where +a previous C<setjmp()> was done, with anything further up on the C +stack being lost. This is why code should always save values using +C<SAVE_FOO> rather than in auto variables. + +The perl core wraps C<setjmp()> etc in the macros C<JMPENV_PUSH> and +C<JMPENV_JUMP>. The basic rule of perl exceptions is that C<exit>, and +C<die> (in the absence of C<eval>) perform a C<JMPENV_JUMP(2)>, while +C<die> within C<eval> does a C<JMPENV_JUMP(3)>. + +At entry points to perl, such as C<perl_parse()>, C<perl_run()> and +C<call_sv(cv, G_EVAL)> each does a C<JMPENV_PUSH>, then enter a runops +loop or whatever, and handle possible exception returns. For a 2 +return, final cleanup is performed, such as popping stacks and calling +C<CHECK> or C<END> blocks. Amongst other things, this is how scope +cleanup still occurs during an C<exit>.
+ +If a C<die> can find a C<CxEVAL> block on the context stack, then the +stack is popped to that level and the return op in that block is +assigned to C<PL_restartop>; then a C<JMPENV_JUMP(3)> is performed. +This normally passes control back to the guard. In the case of +C<perl_run> and C<call_sv>, a non-null C<PL_restartop> triggers +re-entry to the runops loop. This is the normal way that C<die> or +C<croak> is handled within an C<eval>. + +Sometimes ops are executed within an inner runops loop, such as tie, +sort or overload code. In this case, something like + + sub FETCH { eval { die } } + +would cause a longjmp right back to the guard in C<perl_run>, popping +both runops loops, which is clearly incorrect. One way to avoid this is +for the tie code to do a C<JMPENV_PUSH> before executing C<FETCH> in +the inner runops loop, but for efficiency reasons, perl in fact just +sets a flag, using C<CATCH_SET(TRUE)>. The C<pp_require>, +C<pp_entereval> and C<pp_entertry> ops check this flag, and if true, +they call C<docatch>, which does a C<JMPENV_PUSH> and starts a new +runops level to execute the code, rather than doing it on the current +loop. + +As a further optimisation, on exit from the eval block in the C<FETCH>, +execution of the code following the block is still carried on in the +inner loop. When an exception is raised, C<docatch> compares the +C<JMPENV> level of the C<CxEVAL> with C<PL_top_env> and if they differ, +just re-throws the exception. In this way any inner loops get popped. + +Here's an example. + + 1: eval { tie @a, 'A' }; + 2: sub A::TIEARRAY { + 3: eval { die }; + 4: die; + 5: } + +To run this code, C<perl_run> is called, which does a C<JMPENV_PUSH> +then enters a runops loop. This loop executes the eval and tie ops on +line 1, with the eval pushing a C<CxEVAL> onto the context stack. + +The C<pp_tie> does a C<CATCH_SET(TRUE)>, then starts a second runops +loop to execute the body of C<TIEARRAY>.
When it executes the entertry +op on line 3, C<CATCH_GET> is true, so C<pp_entertry> calls C<docatch> +which does a C<JMPENV_PUSH> and starts a third runops loop, which then +executes the die op. At this point the C call stack looks like this: + + Perl_pp_die + Perl_runops # third loop + S_docatch_body + S_docatch + Perl_pp_entertry + Perl_runops # second loop + S_call_body + Perl_call_sv + Perl_pp_tie + Perl_runops # first loop + S_run_body + perl_run + main + +and the context and data stacks, as shown by C<-Dstv>, look like: + + STACK 0: MAIN + CX 0: BLOCK => + CX 1: EVAL => AV() PV("A"\0) + retop=leave + STACK 1: MAGIC + CX 0: SUB => + retop=(null) + CX 1: EVAL => * + retop=nextstate + +The die pops the first C<CxEVAL> off the context stack, sets +C<PL_restartop> from it, does a C<JMPENV_JUMP(3)>, and control returns +to the top C<docatch>. This then starts another third-level runops +level, which executes the nextstate, pushmark and die ops on line 4. At +the point that the second C<pp_die> is called, the C call stack looks +exactly like that above, even though we are no longer within an inner +eval; this is because of the optimization mentioned earlier. 
However, +the context stack now looks like this, ie with the top CxEVAL popped: + + STACK 0: MAIN + CX 0: BLOCK => + CX 1: EVAL => AV() PV("A"\0) + retop=leave + STACK 1: MAGIC + CX 0: SUB => + retop=(null) + +The die on line 4 pops the context stack back down to the CxEVAL, +leaving it as: + + STACK 0: MAIN + CX 0: BLOCK => + +As usual, C<PL_restartop> is extracted from the C<CxEVAL>, and a +C<JMPENV_JUMP(3)> done, which pops the C stack back to the docatch: + + S_docatch + Perl_pp_entertry + Perl_runops # second loop + S_call_body + Perl_call_sv + Perl_pp_tie + Perl_runops # first loop + S_run_body + perl_run + main + +In this case, because the C<JMPENV> level recorded in the C<CxEVAL> +differs from the current one, C<docatch> just does a C<JMPENV_JUMP(3)> +and the C stack unwinds to: + + perl_run + main + +Because C<PL_restartop> is non-null, C<run_body> starts a new runops +loop and execution continues. + +=head2 INTERNAL VARIABLE TYPES + +You should by now have had a look at L<perlguts>, which tells you about +Perl's internal variable types: SVs, HVs, AVs and the rest. If not, do +that now. + +These variables are used not only to represent Perl-space variables, +but also any constants in the code, as well as some structures +completely internal to Perl. The symbol table, for instance, is an +ordinary Perl hash. Your code is represented by an SV as it's read into +the parser; any program files you call are opened via ordinary Perl +filehandles, and so on. + +The core L<Devel::Peek|Devel::Peek> module lets us examine SVs from a +Perl program. Let's see, for instance, how Perl treats the constant +C<"hello">. + + % perl -MDevel::Peek -e 'Dump("hello")' + 1 SV = PV(0xa041450) at 0xa04ecbc + 2 REFCNT = 1 + 3 FLAGS = (POK,READONLY,pPOK) + 4 PV = 0xa0484e0 "hello"\0 + 5 CUR = 5 + 6 LEN = 6 + +Reading C<Devel::Peek> output takes a bit of practise, so let's go +through it line by line. + +Line 1 tells us we're looking at an SV which lives at C<0xa04ecbc> in +memory. 
SVs themselves are very simple structures, but they contain a +pointer to a more complex structure. In this case, it's a PV, a +structure which holds a string value, at location C<0xa041450>. Line 2 +is the reference count; there are no other references to this data, so +it's 1. + +Line 3 are the flags for this SV - it's OK to use it as a PV, it's a +read-only SV (because it's a constant) and the data is a PV internally. +Next we've got the contents of the string, starting at location +C<0xa0484e0>. + +Line 5 gives us the current length of the string - note that this does +B<not> include the null terminator. Line 6 is not the length of the +string, but the length of the currently allocated buffer; as the string +grows, Perl automatically extends the available storage via a routine +called C<SvGROW>. + +You can get at any of these quantities from C very easily; just add +C<Sv> to the name of the field shown in the snippet, and you've got a +macro which will return the value: C<SvCUR(sv)> returns the current +length of the string, C<SvREFCNT(sv)> returns the reference count, +C<SvPV(sv, len)> returns the string itself with its length, and so on. +More macros to manipulate these properties can be found in L<perlguts>. + +Let's take an example of manipulating a PV, from C<sv_catpvn>, in +F<sv.c> + + 1 void + 2 Perl_sv_catpvn(pTHX_ register SV *sv, register const char *ptr, register STRLEN len) + 3 { + 4 STRLEN tlen; + 5 char *junk; + + 6 junk = SvPV_force(sv, tlen); + 7 SvGROW(sv, tlen + len + 1); + 8 if (ptr == junk) + 9 ptr = SvPVX(sv); + 10 Move(ptr,SvPVX(sv)+tlen,len,char); + 11 SvCUR(sv) += len; + 12 *SvEND(sv) = '\0'; + 13 (void)SvPOK_only_UTF8(sv); /* validate pointer */ + 14 SvTAINT(sv); + 15 } + +This is a function which adds a string, C<ptr>, of length C<len> onto +the end of the PV stored in C<sv>. The first thing we do in line 6 is +make sure that the SV B<has> a valid PV, by calling the C<SvPV_force> +macro to force a PV.
As a side effect, C<tlen> gets set to the current +length of the PV, and the PV itself is returned to C<junk>. + +In line 7, we make sure that the SV will have enough room to +accommodate the old string, the new string and the null terminator. If +C<LEN> isn't big enough, C<SvGROW> will reallocate space for us. + +Now, if C<junk> is the same as the string we're trying to add, we can +grab the string directly from the SV; C<SvPVX> is the address of the PV +in the SV. + +Line 10 does the actual catenation: the C<Move> macro moves a chunk of +memory around: we move the string C<ptr> to the end of the PV - that's +the start of the PV plus its current length. We're moving C<len> bytes +of type C<char>. After doing so, we need to tell Perl we've extended +the string, by altering C<CUR> to reflect the new length. C<SvEND> is a +macro which gives us the end of the string, so that needs to be a +C<"\0">. + +Line 13 manipulates the flags; since we've changed the PV, any IV or NV +values will no longer be valid: if we have C<$a=10; $a.="6";> we don't +want to use the old IV of 10. C<SvPOK_only_UTF8> is a special +UTF-8-aware version of C<SvPOK_only>, a macro which turns off the IOK +and NOK flags and turns on POK. The final C<SvTAINT> is a macro which +launders tainted data if taint mode is turned on. + +AVs and HVs are more complicated, but SVs are by far the most common +variable type being thrown around. Having seen something of how we +manipulate these, let's go on and look at how the op tree is +constructed. + +=head1 OP TREES + +First, what is the op tree, anyway? The op tree is the parsed +representation of your program, as we saw in our section on parsing, +and it's the sequence of operations that Perl goes through to execute +your program, as we saw in L</Running>.
+ +An op is a fundamental operation that Perl can perform: all the +built-in functions and operators are ops, and there are a series of ops +which deal with concepts the interpreter needs internally - entering +and leaving a block, ending a statement, fetching a variable, and so +on. + +The op tree is connected in two ways: you can imagine that there are +two "routes" through it, two orders in which you can traverse the tree. +First, parse order reflects how the parser understood the code, and +secondly, execution order tells perl what order to perform the +operations in. + +The easiest way to examine the op tree is to stop Perl after it has +finished parsing, and get it to dump out the tree. This is exactly what +the compiler backends L<B::Terse|B::Terse>, L<B::Concise|B::Concise> +and L<B::Debug|B::Debug> do. + +Let's have a look at how Perl sees C<$a = $b + $c>: + + % perl -MO=Terse -e '$a=$b+$c' + 1 LISTOP (0x8179888) leave + 2 OP (0x81798b0) enter + 3 COP (0x8179850) nextstate + 4 BINOP (0x8179828) sassign + 5 BINOP (0x8179800) add [1] + 6 UNOP (0x81796e0) null [15] + 7 SVOP (0x80fafe0) gvsv GV (0x80fa4cc) *b + 8 UNOP (0x81797e0) null [15] + 9 SVOP (0x8179700) gvsv GV (0x80efeb0) *c + 10 UNOP (0x816b4f0) null [15] + 11 SVOP (0x816dcf0) gvsv GV (0x80fa460) *a + +Let's start in the middle, at line 4. This is a BINOP, a binary +operator, which is at location C<0x8179828>. The specific operator in +question is C<sassign> - scalar assignment - and you can find the code +which implements it in the function C<pp_sassign> in F<pp_hot.c>. As a +binary operator, it has two children: the add operator, providing the +result of C<$b+$c>, is uppermost on line 5, and the left hand side is +on line 10. + +Line 10 is the null op: this does exactly nothing. What is that doing +there? If you see the null op, it's a sign that something has been +optimized away after parsing. 
As we mentioned in L</Optimization>, the +optimization stage sometimes converts two operations into one, for +example when fetching a scalar variable. When this happens, instead of +rewriting the op tree and cleaning up the dangling pointers, it's +easier just to replace the redundant operation with the null op. +Originally, the tree would have looked like this: + + 10 SVOP (0x816b4f0) rv2sv [15] + 11 SVOP (0x816dcf0) gv GV (0x80fa460) *a + +That is, fetch the C<a> entry from the main symbol table, and then look +at the scalar component of it: C<gvsv> (C<pp_gvsv> into F<pp_hot.c>) +happens to do both these things. + +The right hand side, starting at line 5 is similar to what we've just +seen: we have the C<add> op (C<pp_add> also in F<pp_hot.c>) add +together two C<gvsv>s. + +Now, what's this about? + + 1 LISTOP (0x8179888) leave + 2 OP (0x81798b0) enter + 3 COP (0x8179850) nextstate + +C<enter> and C<leave> are scoping ops, and their job is to perform any +housekeeping every time you enter and leave a block: lexical variables +are tidied up, unreferenced variables are destroyed, and so on. Every +program will have those first three lines: C<leave> is a list, and its +children are all the statements in the block. Statements are delimited +by C<nextstate>, so a block is a collection of C<nextstate> ops, with +the ops to be performed for each statement being the children of +C<nextstate>. C<enter> is a single op which functions as a marker. + +That's how Perl parsed the program, from top to bottom: + + Program + | + Statement + | + = + / \ + / \ + $a + + / \ + $b $c + +However, it's impossible to B<perform> the operations in this order: +you have to find the values of C<$b> and C<$c> before you add them +together, for instance. So, the other thread that runs through the op +tree is the execution order: each op has a field C<op_next> which +points to the next op to be run, so following these pointers tells us +how perl executes the code. 
We can traverse the tree in this order +using the C<exec> option to C<B::Terse>: + + % perl -MO=Terse,exec -e '$a=$b+$c' + 1 OP (0x8179928) enter + 2 COP (0x81798c8) nextstate + 3 SVOP (0x81796c8) gvsv GV (0x80fa4d4) *b + 4 SVOP (0x8179798) gvsv GV (0x80efeb0) *c + 5 BINOP (0x8179878) add [1] + 6 SVOP (0x816dd38) gvsv GV (0x80fa468) *a + 7 BINOP (0x81798a0) sassign + 8 LISTOP (0x8179900) leave + +This probably makes more sense for a human: enter a block, start a +statement. Get the values of C<$b> and C<$c>, and add them together. +Find C<$a>, and assign one to the other. Then leave. + +The way Perl builds up these op trees in the parsing process can be +unravelled by examining F<perly.y>, the YACC grammar. Let's take the +piece we need to construct the tree for C<$a = $b + $c> + + 1 term : term ASSIGNOP term + 2 { $$ = newASSIGNOP(OPf_STACKED, $1, $2, $3); } + 3 | term ADDOP term + 4 { $$ = newBINOP($2, 0, scalar($1), scalar($3)); } + +If you're not used to reading BNF grammars, this is how it works: +You're fed certain things by the tokeniser, which generally end up in +upper case. Here, C<ADDOP>, is provided when the tokeniser sees C<+> in +your code. C<ASSIGNOP> is provided when C<=> is used for assigning. +These are "terminal symbols", because you can't get any simpler than +them. + +The grammar, lines one and three of the snippet above, tells you how to +build up more complex forms. These complex forms, "non-terminal +symbols" are generally placed in lower case. C<term> here is a +non-terminal symbol, representing a single expression. + +The grammar gives you the following rule: you can make the thing on the +left of the colon if you see all the things on the right in sequence. +This is called a "reduction", and the aim of parsing is to completely +reduce the input. 
There are several different ways you can perform a +reduction, separated by vertical bars: so, C<term> followed by C<=> +followed by C<term> makes a C<term>, and C<term> followed by C<+> +followed by C<term> can also make a C<term>. + +So, if you see two terms with an C<=> or C<+>, between them, you can +turn them into a single expression. When you do this, you execute the +code in the block on the next line: if you see C<=>, you'll do the code +in line 2. If you see C<+>, you'll do the code in line 4. It's this +code which contributes to the op tree. + + | term ADDOP term + { $$ = newBINOP($2, 0, scalar($1), scalar($3)); } + +What this does is creates a new binary op, and feeds it a number of +variables. The variables refer to the tokens: C<$1> is the first token +in the input, C<$2> the second, and so on - think regular expression +backreferences. C<$$> is the op returned from this reduction. So, we +call C<newBINOP> to create a new binary operator. The first parameter +to C<newBINOP>, a function in F<op.c>, is the op type. It's an addition +operator, so we want the type to be C<ADDOP>. We could specify this +directly, but it's right there as the second token in the input, so we +use C<$2>. The second parameter is the op's flags: 0 means "nothing +special". Then the things to add: the left and right hand side of our +expression, in scalar context. + +=head1 STACKS + +When perl executes something like C<addop>, how does it pass on its +results to the next op? The answer is, through the use of stacks. Perl +has a number of stacks to store things it's currently working on, and +we'll look at the three most important ones here. + +=head2 Argument stack + +Arguments are passed to PP code and returned from PP code using the +argument stack, C<ST>. The typical way to handle arguments is to pop +them off the stack, deal with them how you wish, and then push the +result back onto the stack. 
This is how, for instance, the cosine +operator works: + + NV value; + value = POPn; + value = Perl_cos(value); + XPUSHn(value); + +We'll see a more tricky example of this when we consider Perl's macros +below. C<POPn> gives you the NV (floating point value) of the top SV on +the stack: the C<$x> in C<cos($x)>. Then we compute the cosine, and +push the result back as an NV. The C<X> in C<XPUSHn> means that the +stack should be extended if necessary - it can't be necessary here, +because we know there's room for one more item on the stack, since +we've just removed one! The C<XPUSH*> macros at least guarantee safety. + +Alternatively, you can fiddle with the stack directly: C<SP> gives you +the first element in your portion of the stack, and C<TOP*> gives you +the top SV/IV/NV/etc. on the stack. So, for instance, to do unary +negation of an integer: + + SETi(-TOPi); + +Just set the integer value of the top stack entry to its negation. + +Argument stack manipulation in the core is exactly the same as it is in +XSUBs - see L<perlxstut>, L<perlxs> and L<perlguts> for a longer +description of the macros used in stack manipulation. + +=head2 Mark stack + +I say "your portion of the stack" above because PP code doesn't +necessarily get the whole stack to itself: if your function calls +another function, you'll only want to expose the arguments aimed for +the called function, and not (necessarily) let it get at your own data. +The way we do this is to have a "virtual" bottom-of-stack, exposed to +each function. The mark stack keeps bookmarks to locations in the +argument stack usable by each function. For instance, when dealing with +a tied variable, (internally, something with "P" magic) Perl has to +call methods for accesses to the tied variables. However, we need to +separate the arguments exposed to the method from the arguments exposed to +the original function - the store or fetch or whatever it may be.
+Here's roughly how the tied C<push> is implemented; see C<av_push> in +F<av.c>: + + 1 PUSHMARK(SP); + 2 EXTEND(SP,2); + 3 PUSHs(SvTIED_obj((SV*)av, mg)); + 4 PUSHs(val); + 5 PUTBACK; + 6 ENTER; + 7 call_method("PUSH", G_SCALAR|G_DISCARD); + 8 LEAVE; + +Let's examine the whole implementation, for practice: + + 1 PUSHMARK(SP); + +Push the current state of the stack pointer onto the mark stack. This +is so that when we've finished adding items to the argument stack, Perl +knows how many things we've added recently. + + 2 EXTEND(SP,2); + 3 PUSHs(SvTIED_obj((SV*)av, mg)); + 4 PUSHs(val); + +We're going to add two more items onto the argument stack: when you +have a tied array, the C<PUSH> subroutine receives the object and the +value to be pushed, and that's exactly what we have here - the tied +object, retrieved with C<SvTIED_obj>, and the value, the SV C<val>. + + 5 PUTBACK; + +Next we tell Perl to update the global stack pointer from our internal +variable: C<dSP> only gave us a local copy, not a reference to the +global. + + 6 ENTER; + 7 call_method("PUSH", G_SCALAR|G_DISCARD); + 8 LEAVE; + +C<ENTER> and C<LEAVE> localise a block of code - they make sure that +all variables are tidied up, everything that has been localised gets +its previous value returned, and so on. Think of them as the C<{> and +C<}> of a Perl block. + +To actually do the magic method call, we have to call a subroutine in +Perl space: C<call_method> takes care of that, and it's described in +L<perlcall>. We call the C<PUSH> method in scalar context, and we're +going to discard its return value. The call_method() function removes +the top element of the mark stack, so there is nothing for the caller +to clean up. + +=head2 Save stack + +C doesn't have a concept of local scope, so perl provides one. We've +seen that C<ENTER> and C<LEAVE> are used as scoping braces; the save +stack implements the C equivalent of, for example: + + { + local $foo = 42; + ... 
+ } + +See L<perlguts/"Localizing changes"> for how to use the save stack. + +=head1 MILLIONS OF MACROS + +One thing you'll notice about the Perl source is that it's full of +macros. Some have called the pervasive use of macros the hardest thing +to understand, others find it adds to clarity. Let's take an example, +the code which implements the addition operator: + + 1 PP(pp_add) + 2 { + 3 dSP; dATARGET; tryAMAGICbin(add,opASSIGN); + 4 { + 5 dPOPTOPnnrl_ul; + 6 SETn( left + right ); + 7 RETURN; + 8 } + 9 } + +Every line here (apart from the braces, of course) contains a macro. +The first line sets up the function declaration as Perl expects for PP +code; line 3 sets up variable declarations for the argument stack and +the target, the return value of the operation. Finally, it tries to see +if the addition operation is overloaded; if so, the appropriate +subroutine is called. + +Line 5 is another variable declaration - all variable declarations +start with C<d> - which pops from the top of the argument stack two NVs +(hence C<nn>) and puts them into the variables C<right> and C<left>, +hence the C<rl>. These are the two operands to the addition operator. +Next, we call C<SETn> to set the NV of the return value to the result +of adding the two values. This done, we return - the C<RETURN> macro +makes sure that our return value is properly handled, and we pass the +next operator to run back to the main run loop. + +Most of these macros are explained in L<perlapi>, and some of the more +important ones are explained in L<perlxs> as well. Pay special +attention to L<perlguts/Background and PERL_IMPLICIT_CONTEXT> for +information on the C<[pad]THX_?> macros. + +=head1 FURTHER READING + +For more information on the Perl internals, please see the documents +listed at L<perl/Internals and C Language Interface>. 
diff --git a/gnu/usr.bin/perl/pod/perlintro.pod b/gnu/usr.bin/perl/pod/perlintro.pod index c47274bc646..afce360a2ac 100644 --- a/gnu/usr.bin/perl/pod/perlintro.pod +++ b/gnu/usr.bin/perl/pod/perlintro.pod @@ -22,6 +22,25 @@ Throughout this document you'll see references to other parts of the Perl documentation. You can read that documentation using the C<perldoc> command or whatever method you're using to read this document. +Throughout Perl's documentation, you'll find numerous examples intended +to help explain the discussed features. Please keep in mind that many +of them are code fragments rather than complete programs. + +These examples often reflect the style and preference of the author of +that piece of the documentation, and may be briefer than a corresponding +line of code in a real program. Except where otherwise noted, you +should assume that C<use strict> and C<use warnings> statements +appear earlier in the "program", and that any variables used have +already been declared, even if those declarations have been omitted +to make the example easier to read. + +Do note that the examples have been written by many different authors over +a period of several decades. Styles and techniques will therefore differ, +although some effort has been made to not vary styles too widely in the +same sections. Do not consider one style to be better than others - "There's +More Than One Way To Do It" is one of Perl's mottos. After all, in your +journey as a programmer, you are likely to encounter different styles. + =head2 What is Perl? Perl is a general-purpose programming language originally developed for @@ -54,7 +73,7 @@ Alternatively, put this as the first line of your script: ... and run the script as C</path/to/script.pl>. Of course, it'll need to be executable first, so C<chmod 755 script.pl> (under Unix). -(This start line assumes you have the B<env> program. You can also put +(This start line assumes you have the B<env> program. 
You can also put directly the path to your perl executable, like in C<#!/usr/bin/perl>). For more information, including instructions for other platforms such as @@ -62,7 +81,7 @@ Windows and Mac OS, read L<perlrun>. =head2 Safety net -Perl by default is very forgiving. In order to make it more robust +Perl by default is very forgiving. In order to make it more robust it is recommended to start every program with the following lines: #!/usr/bin/perl @@ -70,7 +89,7 @@ it is recommended to start every program with the following lines: use warnings; The two additional lines request from perl to catch various common -problems in your code. They check different things so you need both. A +problems in your code. They check different things so you need both. A potential problem caught by C<use strict;> will cause your code to stop immediately when it is encountered, while C<use warnings;> will merely give a warning (like the command-line switch B<-w>) and let your code run. @@ -144,7 +163,7 @@ A scalar represents a single value: Scalar values can be strings, integers or floating point numbers, and Perl will automatically convert between them as required. There is no need to pre-declare your variable types, but you have to declare them using -the C<my> keyword the first time you use them. (This is one of the +the C<my> keyword the first time you use them. (This is one of the requirements of C<use strict;>.) Scalar values can be used in various ways: @@ -248,9 +267,9 @@ More complex data types can be constructed using references, which allow you to build lists and hashes within lists and hashes. A reference is a scalar value and can refer to any other Perl data -type. So by storing a reference as the value of an array or hash +type. So by storing a reference as the value of an array or hash element, you can easily create lists and hashes within lists and -hashes. The following example shows a 2 level hash of hash +hashes. 
The following example shows a 2 level hash of hash structure using anonymous hash references. my $variables = { @@ -309,7 +328,7 @@ running the program. Using C<strict> is highly recommended. Perl has most of the usual conditional and looping constructs. As of Perl 5.10, it even has a case/switch statement (spelled C<given>/C<when>). See -L<perlsyn/"Switch statements"> for more details. +L<perlsyn/"Switch Statements"> for more details. The conditions can be any Perl expression. See the list of operators in the next section for information on comparison and boolean logic operators, @@ -388,6 +407,9 @@ the more friendly list scanning C<foreach> loop. print "The value of $key is $hash{$key}\n"; } +The C<foreach> keyword is actually a synonym for the C<for> +keyword. See C<L<perlsyn/"Foreach Loops">>. + =back For more detail on looping constructs (and some that weren't mentioned in @@ -442,7 +464,7 @@ before 99). ! not (C<and>, C<or> and C<not> aren't just in the above table as descriptions -of the operators. They're also supported as operators in their own +of the operators. They're also supported as operators in their own right. They're more readable than the C-style operators, but have different precedence to C<&&> and friends. Check L<perlop> for more detail.) @@ -480,8 +502,8 @@ the list: my $line = <$in>; my @lines = <$in>; -Reading in the whole file at one time is called slurping. It can -be useful but it may be a memory hog. Most text file processing +Reading in the whole file at one time is called slurping. It can +be useful but it may be a memory hog. Most text file processing can be done a line at a time with Perl's looping constructs. The C<< <> >> operator is most often seen in a C<while> loop: @@ -641,7 +663,7 @@ For more information on writing subroutines, see L<perlsub>. OO Perl is relatively simple and is implemented using references which know what sort of object they are based on Perl's concept of packages. 
However, OO Perl is largely beyond the scope of this document. -Read L<perlboot>, L<perltoot>, L<perltooc> and L<perlobj>. +Read L<perlootut> and L<perlobj>. As a beginning Perl programmer, your most common use of OO Perl will be in using third-party modules, which are documented below. diff --git a/gnu/usr.bin/perl/pod/perliol.pod b/gnu/usr.bin/perl/pod/perliol.pod index e81484772a7..767fabdd7ff 100644 --- a/gnu/usr.bin/perl/pod/perliol.pod +++ b/gnu/usr.bin/perl/pod/perliol.pod @@ -526,11 +526,12 @@ passed to C<open>, otherwise it will be 1 if for example C<PerlIO_open> was called. In simple cases SvPV_nolen(*args) is the pathname to open. -Having said all that translation-only layers do not need to provide -C<Open()> at all, but rather leave the opening to a lower level layer -and wait to be "pushed". If a layer does provide C<Open()> it should -normally call the C<Open()> method of next layer down (if any) and -then push itself on top if that succeeds. +If a layer provides C<Open()> it should normally call the C<Open()> +method of next layer down (if any) and then push itself on top if that +succeeds. C<PerlIOBase_open> is provided to do exactly that, so in +most cases you don't have to write your own C<Open()> method. If this +method is not defined, other layers may have difficulty pushing +themselves on top of it during open. If C<PerlIO_push> was performed and open has failed, it must C<PerlIO_pop> itself, since if it's not, the layer won't be removed @@ -922,7 +923,7 @@ which do not need to do anything special for a particular method. =head2 Extension Layers -Layers can made available by extension modules. When an unknown layer +Layers can be made available by extension modules. 
When an unknown layer is encountered the PerlIO code will perform the equivalent of : use PerlIO 'layer'; diff --git a/gnu/usr.bin/perl/pod/perllexwarn.pod b/gnu/usr.bin/perl/pod/perllexwarn.pod index 835914e4869..e63135915b9 100644 --- a/gnu/usr.bin/perl/pod/perllexwarn.pod +++ b/gnu/usr.bin/perl/pod/perllexwarn.pod @@ -158,7 +158,7 @@ Does the exact opposite to the B<-W> flag, i.e. it disables all warnings. =head2 Backward Compatibility -If you are used with working with a version of Perl prior to the +If you are used to working with a version of Perl prior to the introduction of lexically scoped warnings, or have code that uses both lexical warnings and C<$^W>, this section will describe how they interact. @@ -169,7 +169,7 @@ How Lexical Warnings interact with B<-w>/C<$^W>: =item 1. If none of the three command line flags (B<-w>, B<-W> or B<-X>) that -control warnings is used and neither C<$^W> or the C<warnings> pragma +control warnings is used and neither C<$^W> nor the C<warnings> pragma are used, then default warnings will be enabled and optional warnings disabled. This means that legacy code that doesn't attempt to control the warnings
@@ -304,7 +304,13 @@ The current hierarchy is:
        |
        +- untie
        |
-       +- utf8
+       +- utf8----------+
+       |                |
+       |                +- surrogate
+       |                |
+       |                +- non_unicode
+       |                |
+       |                +- nonchar
        |
        +- void
 
@@ -520,6 +526,16 @@ a warning. Notice also that the warning is reported at the line where the object is first used. +When registering new categories of warning, you can supply more names to +warnings::register like this: + + package MyModule; + use warnings::register qw(format precision); + + ... + + warnings::warnif('MyModule::format', '...'); + =head1 SEE ALSO L<warnings>, L<perldiag>. 
diff --git a/gnu/usr.bin/perl/pod/perlmodlib.PL b/gnu/usr.bin/perl/pod/perlmodlib.PL index dc2faf2ffbd..36990f96c41 100644 --- a/gnu/usr.bin/perl/pod/perlmodlib.PL +++ b/gnu/usr.bin/perl/pod/perlmodlib.PL @@ -80,14 +80,16 @@ for my $filename (@files) { # Much easier to special case it like this than special case the depending on # and parsing lib/Config.pod, or special case opening configpm and finding its # =head1 (which is not found with the $/="" above) -push @mod, <<'CONFIG'; -=item Config +push @mod, "=item Config\n\nAccess Perl configuration information\n\n"; -Access Perl configuration information -CONFIG +# The intent of using =cut as the heredoc terminator is to make the whole file +# parse as (reasonably) sane Pod as-is to anything that attempts to +# brute-force treat it as such. The content is already useful - this just +# makes it tidier, by stopping anything doing this mistaking the rest of the +# Perl code for Pod. eg http://search.cpan.org/dist/perl/pod/perlmodlib.PL -print OUT <<'EOF'; +print OUT <<'=cut'; =for maintainers Generated by perlmodlib.PL -- DO NOT EDIT! @@ -137,11 +139,12 @@ The following pragmas are defined (and have their own documentation). =over 12 -EOF +=cut print OUT $_ for (sort @pragma); -print OUT <<EOF; +print OUT <<'=cut'; + =back =head2 Standard Modules @@ -156,11 +159,12 @@ don't have the gdbm library. =over 12 -EOF +=cut print OUT $_ for (sort @mod); -print OUT <<'EOF'; +print OUT <<'=cut'; + =back To find out I<all> modules installed on your system, including @@ -318,17 +322,20 @@ European and the South American sites. 
Registered CPAN sites +=for maintainers +Generated by Porting/make_modlib_cpan.pl + =head2 Africa =over 4 =item South Africa - http://cpan.mirror.ac.za/ - ftp://cpan.mirror.ac.za/ - http://mirror.is.co.za/pub/cpan/ - ftp://ftp.is.co.za/pub/cpan/ - ftp://ftp.saix.net/pub/CPAN/ + http://cpan.mirror.ac.za/ + ftp://cpan.mirror.ac.za/ + http://mirror.is.co.za/pub/cpan/ + ftp://ftp.is.co.za/pub/cpan/ + ftp://ftp.saix.net/pub/CPAN/ =back @@ -336,97 +343,92 @@ Registered CPAN sites =over 4 +=item China + + http://cpan.wenzk.com/ + =item Hong Kong - http://ftp.cuhk.edu.hk/pub/packages/perl/CPAN/ - ftp://ftp.cuhk.edu.hk/pub/packages/perl/CPAN/ - http://mirrors.geoexpat.com/cpan/ + http://ftp.cuhk.edu.hk/pub/packages/perl/CPAN/ + ftp://ftp.cuhk.edu.hk/pub/packages/perl/CPAN/ + http://mirrors.geoexpat.com/cpan/ =item India - http://perlmirror.indialinks.com/ + http://perlmirror.indialinks.com/ =item Indonesia - http://cpan.biz.net.id/ - http://komo.vlsm.org/CPAN/ - ftp://komo.vlsm.org/CPAN/ - http://cpan.pesat.net.id/ - http://mirror.unej.ac.id/cpan/ - ftp://mirror.unej.ac.id/cpan/ + http://cpan.biz.net.id/ + http://komo.vlsm.org/CPAN/ + ftp://komo.vlsm.org/CPAN/ + http://cpan.cermin.lipi.go.id/ + ftp://cermin.lipi.go.id/pub/CPAN/ + http://cpan.pesat.net.id/ =item Japan - ftp://ftp.u-aizu.ac.jp/pub/CPAN - ftp://ftp.kddilabs.jp/CPAN/ - http://ftp.nara.wide.ad.jp/pub/CPAN/ - ftp://ftp.nara.wide.ad.jp/pub/CPAN/ - http://ftp.jaist.ac.jp/pub/CPAN/ - ftp://ftp.jaist.ac.jp/pub/CPAN/ - ftp://ftp.dti.ad.jp/pub/lang/CPAN/ - ftp://ftp.ring.gr.jp/pub/lang/perl/CPAN/ - http://ftp.riken.jp/lang/CPAN/ - ftp://ftp.riken.jp/lang/CPAN/ - http://ftp.yz.yamagata-u.ac.jp/pub/lang/cpan/ - ftp://ftp.yz.yamagata-u.ac.jp/pub/lang/cpan/ - -=item Kazakhstan - - http://mirror.linuxiso.kz/CPAN/ + ftp://ftp.u-aizu.ac.jp/pub/CPAN + ftp://ftp.kddilabs.jp/CPAN/ + http://ftp.nara.wide.ad.jp/pub/CPAN/ + ftp://ftp.nara.wide.ad.jp/pub/CPAN/ + http://ftp.jaist.ac.jp/pub/CPAN/ + ftp://ftp.jaist.ac.jp/pub/CPAN/ + 
ftp://ftp.dti.ad.jp/pub/lang/CPAN/ + ftp://ftp.ring.gr.jp/pub/lang/perl/CPAN/ + http://ftp.riken.jp/lang/CPAN/ + ftp://ftp.riken.jp/lang/CPAN/ + http://ftp.yz.yamagata-u.ac.jp/pub/lang/cpan/ + ftp://ftp.yz.yamagata-u.ac.jp/pub/lang/cpan/ =item Republic of Korea - http://ftp.kaist.ac.kr/pub/CPAN - ftp://ftp.kaist.ac.kr/pub/CPAN - http://cpan.mirror.cdnetworks.com/ - ftp://cpan.mirror.cdnetworks.com/CPAN/ - http://cpan.sarang.net/ - ftp://cpan.sarang.net/CPAN/ + http://ftp.kaist.ac.kr/pub/CPAN + ftp://ftp.kaist.ac.kr/pub/CPAN + http://cpan.mirror.cdnetworks.com/ + ftp://cpan.mirror.cdnetworks.com/CPAN/ + http://cpan.sarang.net/ + ftp://cpan.sarang.net/CPAN/ =item Russia - http://cpan.tomsk.ru/ - ftp://cpan.tomsk.ru/ + http://cpan.tomsk.ru/ + ftp://cpan.tomsk.ru/ =item Singapore - http://mirror.averse.net/pub/CPAN - ftp://mirror.averse.net/pub/CPAN - http://cpan.mirror.choon.net/ - http://cpan.oss.eznetsols.org - ftp://ftp.oss.eznetsols.org/cpan + http://mirror.averse.net/pub/CPAN + ftp://mirror.averse.net/pub/CPAN + http://cpan.mirror.choon.net/ + http://cpan.oss.eznetsols.org + ftp://ftp.oss.eznetsols.org/cpan =item Taiwan - http://ftp.cse.yzu.edu.tw/pub/CPAN/ - ftp://ftp.cse.yzu.edu.tw/pub/CPAN/ - http://cpan.nctu.edu.tw/ - ftp://cpan.nctu.edu.tw/ - ftp://ftp.ncu.edu.tw/CPAN/ - http://cpan.cdpa.nsysu.edu.tw/ - ftp://cpan.cdpa.nsysu.edu.tw/Unix/Lang/CPAN/ - http://cpan.stu.edu.tw - ftp://ftp.stu.edu.tw/CPAN - http://ftp.stu.edu.tw/CPAN - ftp://ftp.stu.edu.tw/pub/CPAN - http://cpan.cs.pu.edu.tw/ - ftp://cpan.cs.pu.edu.tw/pub/CPAN + http://ftp.cse.yzu.edu.tw/pub/CPAN/ + ftp://ftp.cse.yzu.edu.tw/pub/CPAN/ + http://cpan.nctu.edu.tw/ + ftp://cpan.nctu.edu.tw/ + ftp://ftp.ncu.edu.tw/CPAN/ + http://cpan.cdpa.nsysu.edu.tw/ + ftp://cpan.cdpa.nsysu.edu.tw/Unix/Lang/CPAN/ + http://cpan.stu.edu.tw + ftp://ftp.stu.edu.tw/CPAN + http://ftp.stu.edu.tw/CPAN + ftp://ftp.stu.edu.tw/pub/CPAN + http://cpan.cs.pu.edu.tw/ + ftp://cpan.cs.pu.edu.tw/pub/CPAN =item Thailand - 
http://mirrors.issp.co.th/cpan/ - ftp://mirrors.issp.co.th/cpan/ + http://mirrors.issp.co.th/cpan/ + ftp://mirrors.issp.co.th/cpan/ + http://mirror.yourconnect.com/CPAN/ + ftp://mirror.yourconnect.com/CPAN/ =item Turkey - http://cpan.gazi.edu.tr/ - http://cpan.ulak.net.tr - ftp://ftp.ulak.net.tr/pub/CPAN - -=item Viet Nam - - http://mirror-fpt-telecom.fpt.net/cpan/ - ftp://mirror-fpt-telecom.fpt.net/cpan/ + http://cpan.gazi.edu.tr/ =back @@ -436,8 +438,8 @@ Registered CPAN sites =item Costa Rica - http://mirrors.ucr.ac.cr/CPAN/ - ftp://mirrors.ucr.ac.cr/CPAN/ + http://mirrors.ucr.ac.cr/CPAN/ + ftp://mirrors.ucr.ac.cr/CPAN/ =back @@ -447,284 +449,274 @@ Registered CPAN sites =item Austria - http://cpan.inode.at/ - ftp://cpan.inode.at - http://gd.tuwien.ac.at/languages/perl/CPAN/ - ftp://gd.tuwien.ac.at/pub/CPAN/ + http://cpan.inode.at/ + ftp://cpan.inode.at + http://gd.tuwien.ac.at/languages/perl/CPAN/ + ftp://gd.tuwien.ac.at/pub/CPAN/ =item Belgium - http://ftp.belnet.be/mirror/ftp.cpan.org/ - ftp://ftp.belnet.be/mirror/ftp.cpan.org/ - http://ftp.easynet.be/pub/CPAN/ - http://cpan.weepee.org/ - http://cpan.fluoline.net/ + http://ftp.belnet.be/mirror/ftp.cpan.org/ + ftp://ftp.belnet.be/mirror/ftp.cpan.org/ + http://ftp.easynet.be/pub/CPAN/ + http://cpan.weepee.org/ =item Bosnia and Herzegovina - http://cpan.blic.net/ + http://cpan.blic.net/ =item Bulgaria - http://cpan.cbox.biz/ - ftp://cpan.cbox.biz/cpan/ - http://cpan.digsys.bg/ - ftp://ftp.digsys.bg/pub/CPAN + http://cpan.cbox.biz/ + ftp://cpan.cbox.biz/cpan/ + http://cpan.digsys.bg/ + ftp://ftp.digsys.bg/pub/CPAN =item Croatia - http://ftp.carnet.hr/pub/CPAN/ - ftp://ftp.carnet.hr/pub/CPAN/ + http://ftp.carnet.hr/pub/CPAN/ + ftp://ftp.carnet.hr/pub/CPAN/ =item Czech Republic - ftp://ftp.fi.muni.cz/pub/CPAN/ - http://archive.cpan.cz/ + ftp://ftp.fi.muni.cz/pub/CPAN/ + http://archive.cpan.cz/ =item Denmark - http://mirrors.dotsrc.org/cpan - ftp://mirrors.dotsrc.org/cpan/ - http://www.cpan.dk/ - 
http://mirror.uni-c.dk/pub/CPAN/ + http://mirrors.dotsrc.org/cpan + ftp://mirrors.dotsrc.org/cpan/ + http://www.cpan.dk/ + http://mirror.uni-c.dk/pub/CPAN/ =item Finland - ftp://ftp.funet.fi/pub/languages/perl/CPAN/ - http://mirror.eunet.fi/CPAN + ftp://ftp.funet.fi/pub/languages/perl/CPAN/ + http://mirror.eunet.fi/CPAN =item France - http://cpan.enstimac.fr/ - ftp://ftp.inria.fr/pub/CPAN/ - http://distrib-coffee.ipsl.jussieu.fr/pub/mirrors/cpan/ - ftp://distrib-coffee.ipsl.jussieu.fr/pub/mirrors/cpan/ - ftp://ftp.lip6.fr/pub/perl/CPAN/ - http://mir2.ovh.net/ftp.cpan.org - ftp://mir1.ovh.net/ftp.cpan.org - http://cpan.miroir-francais.fr/ - ftp://miroir-francais.fr/pub/cpan/ - ftp://ftp.oleane.net/pub/CPAN/ - http://ftp.crihan.fr/mirrors/ftp.cpan.org/ - ftp://ftp.crihan.fr/mirrors/ftp.cpan.org/ - http://ftp.u-strasbg.fr/CPAN - ftp://ftp.u-strasbg.fr/CPAN - http://cpan.cict.fr/ - ftp://cpan.cict.fr/pub/CPAN/ + http://cpan.enstimac.fr/ + ftp://ftp.inria.fr/pub/CPAN/ + http://distrib-coffee.ipsl.jussieu.fr/pub/mirrors/cpan/ + ftp://distrib-coffee.ipsl.jussieu.fr/pub/mirrors/cpan/ + ftp://ftp.lip6.fr/pub/perl/CPAN/ + http://mir2.ovh.net/ftp.cpan.org + ftp://mir1.ovh.net/ftp.cpan.org + ftp://ftp.oleane.net/pub/CPAN/ + http://ftp.crihan.fr/mirrors/ftp.cpan.org/ + ftp://ftp.crihan.fr/mirrors/ftp.cpan.org/ + http://ftp.u-strasbg.fr/CPAN + ftp://ftp.u-strasbg.fr/CPAN + http://cpan.cict.fr/ + ftp://cpan.cict.fr/pub/CPAN/ =item Germany - ftp://ftp.fu-berlin.de/unix/languages/perl/ - http://mirrors.softliste.de/cpan/ - ftp://ftp.rub.de/pub/CPAN/ - http://www.planet-elektronik.de/CPAN/ - http://ftp.hosteurope.de/pub/CPAN/ - ftp://ftp.hosteurope.de/pub/CPAN/ - http://www.mirrorspace.org/cpan/ - http://mirror.netcologne.de/cpan/ - ftp://mirror.netcologne.de/cpan/ - ftp://ftp.freenet.de/pub/ftp.cpan.org/pub/CPAN/ - http://ftp-stud.hs-esslingen.de/pub/Mirrors/CPAN/ - ftp://ftp-stud.hs-esslingen.de/pub/Mirrors/CPAN/ - http://mirrors.zerg.biz/cpan/ - 
http://ftp.gwdg.de/pub/languages/perl/CPAN/ - ftp://ftp.gwdg.de/pub/languages/perl/CPAN/ - http://dl.ambiweb.de/mirrors/ftp.cpan.org/ - http://cpan.mirror.clusters.kg/ - http://cpan.mirror.iphh.net/ - ftp://cpan.mirror.iphh.net/pub/CPAN/ - http://cpan.mirroring.de/ - http://mirror.informatik.uni-mannheim.de/pub/mirrors/CPAN/ - ftp://mirror.informatik.uni-mannheim.de/pub/mirrors/CPAN/ - http://ftp.cw.net/pub/CPAN/ - ftp://ftp.cw.net/pub/CPAN/ - http://cpan.cpantesters.org/ - ftp://cpan.cpantesters.org/CPAN/ - http://cpan.mirrored.de/ - ftp://mirror.petamem.com/CPAN/ - http://cpan.noris.de/ - ftp://cpan.noris.de/pub/CPAN/ - ftp://ftp.mpi-sb.mpg.de/pub/perl/CPAN/ - ftp://ftp.gmd.de/mirrors/CPAN/ + ftp://ftp.fu-berlin.de/unix/languages/perl/ + http://mirrors.softliste.de/cpan/ + ftp://ftp.rub.de/pub/CPAN/ + http://www.planet-elektronik.de/CPAN/ + http://ftp.hosteurope.de/pub/CPAN/ + ftp://ftp.hosteurope.de/pub/CPAN/ + http://www.mirrorspace.org/cpan/ + http://mirror.netcologne.de/cpan/ + ftp://mirror.netcologne.de/cpan/ + ftp://ftp.freenet.de/pub/ftp.cpan.org/pub/CPAN/ + http://ftp-stud.hs-esslingen.de/pub/Mirrors/CPAN/ + ftp://ftp-stud.hs-esslingen.de/pub/Mirrors/CPAN/ + http://mirrors.zerg.biz/cpan/ + http://ftp.gwdg.de/pub/languages/perl/CPAN/ + ftp://ftp.gwdg.de/pub/languages/perl/CPAN/ + http://dl.ambiweb.de/mirrors/ftp.cpan.org/ + http://cpan.mirror.clusters.kg/ + http://cpan.mirror.iphh.net/ + ftp://cpan.mirror.iphh.net/pub/CPAN/ + http://cpan.mirroring.de/ + http://mirror.informatik.uni-mannheim.de/pub/mirrors/CPAN/ + ftp://mirror.informatik.uni-mannheim.de/pub/mirrors/CPAN/ + http://www.chemmedia.de/mirrors/CPAN/ + http://ftp.cw.net/pub/CPAN/ + ftp://ftp.cw.net/pub/CPAN/ + http://cpan.cpantesters.org/ + ftp://cpan.cpantesters.org/CPAN/ + http://cpan.mirrored.de/ + ftp://mirror.petamem.com/CPAN/ + http://cpan.noris.de/ + ftp://cpan.noris.de/pub/CPAN/ + ftp://ftp.mpi-sb.mpg.de/pub/perl/CPAN/ + ftp://ftp.gmd.de/mirrors/CPAN/ =item Greece - 
ftp://ftp.forthnet.gr/pub/languages/perl/CPAN - ftp://ftp.ntua.gr/pub/lang/perl/ - http://cpan.cc.uoc.gr/ - ftp://ftp.cc.uoc.gr/mirrors/CPAN/ + ftp://ftp.forthnet.gr/pub/languages/perl/CPAN + ftp://ftp.ntua.gr/pub/lang/perl/ + http://cpan.cc.uoc.gr/ + ftp://ftp.cc.uoc.gr/mirrors/CPAN/ =item Hungary - http://cpan.mirrors.enexis.hu/ - ftp://cpan.mirrors.enexis.hu/mirrors/cpan/ - http://cpan.hu/ + http://cpan.mirrors.enexis.hu/ + ftp://cpan.mirrors.enexis.hu/mirrors/cpan/ + http://cpan.hu/ =item Iceland - http://ftp.rhnet.is/pub/CPAN/ - ftp://ftp.rhnet.is/pub/CPAN/ + http://ftp.rhnet.is/pub/CPAN/ + ftp://ftp.rhnet.is/pub/CPAN/ =item Ireland - http://ftp.esat.net/pub/languages/perl/CPAN/ - ftp://ftp.esat.net/pub/languages/perl/CPAN/ - http://ftp.heanet.ie/mirrors/ftp.perl.org/pub/CPAN - ftp://ftp.heanet.ie/mirrors/ftp.perl.org/pub/CPAN + http://ftp.esat.net/pub/languages/perl/CPAN/ + ftp://ftp.esat.net/pub/languages/perl/CPAN/ + http://ftp.heanet.ie/mirrors/ftp.perl.org/pub/CPAN + ftp://ftp.heanet.ie/mirrors/ftp.perl.org/pub/CPAN =item Italy - http://bo.mirror.garr.it/mirrors/CPAN/ - http://cpan.panu.it/ - ftp://ftp.panu.it/pub/mirrors/perl/CPAN/ - http://cpan.fastbull.org/ + http://bo.mirror.garr.it/mirrors/CPAN/ + http://cpan.panu.it/ + ftp://ftp.panu.it/pub/mirrors/perl/CPAN/ =item Latvia - http://kvin.lv/pub/CPAN/ + http://kvin.lv/pub/CPAN/ =item Lithuania - http://ftp.litnet.lt/pub/CPAN/ - ftp://ftp.litnet.lt/pub/CPAN/ + http://ftp.litnet.lt/pub/CPAN/ + ftp://ftp.litnet.lt/pub/CPAN/ =item Malta - http://cpan.waldonet.net.mt/ + http://cpan.waldonet.net.mt/ =item Netherlands - ftp://ftp.quicknet.nl/pub/CPAN/ - http://mirror.hostfuss.com/CPAN/ - ftp://mirror.hostfuss.com/CPAN/ - http://mirrors3.kernel.org/cpan/ - ftp://mirrors3.kernel.org/pub/CPAN/ - http://cpan.osmirror.nl/ - ftp://ftp.osmirror.nl/pub/cpan/ - http://cpan.mirror.versatel.nl/ - ftp://ftp.mirror.versatel.nl/cpan/ - ftp://download.xs4all.nl/pub/mirror/CPAN/ - http://mirror.leaseweb.com/CPAN/ - 
ftp://mirror.leaseweb.com/CPAN/ - ftp://ftp.cpan.nl/pub/CPAN/ - http://archive.cs.uu.nl/mirror/CPAN/ - ftp://ftp.cs.uu.nl/mirror/CPAN/ - http://https://luxitude.net/cpan/ + ftp://ftp.quicknet.nl/pub/CPAN/ + http://mirror.hostfuss.com/CPAN/ + ftp://mirror.hostfuss.com/CPAN/ + http://mirrors3.kernel.org/cpan/ + ftp://mirrors3.kernel.org/pub/CPAN/ + http://cpan.mirror.versatel.nl/ + ftp://ftp.mirror.versatel.nl/cpan/ + ftp://download.xs4all.nl/pub/mirror/CPAN/ + http://mirror.leaseweb.com/CPAN/ + ftp://mirror.leaseweb.com/CPAN/ + ftp://ftp.cpan.nl/pub/CPAN/ + http://archive.cs.uu.nl/mirror/CPAN/ + ftp://ftp.cs.uu.nl/mirror/CPAN/ + http://luxitude.net/cpan/ =item Norway - ftp://ftp.uninett.no/pub/languages/perl/CPAN - ftp://ftp.uit.no/pub/languages/perl/cpan/ + ftp://ftp.uninett.no/pub/languages/perl/CPAN + ftp://ftp.uit.no/pub/languages/perl/cpan/ =item Poland - http://mirror.icis.pcz.pl/CPAN/ - ftp://mirror.icis.pcz.pl/CPAN/ - http://piotrkosoft.net/pub/mirrors/CPAN/ - ftp://ftp.piotrkosoft.net/pub/mirrors/CPAN/ - http://ftp.man.poznan.pl/pub/CPAN - ftp://ftp.man.poznan.pl/pub/CPAN - ftp://sunsite.icm.edu.pl/pub/CPAN/ - ftp://ftp.tpnet.pl/d4/CPAN/ + http://piotrkosoft.net/pub/mirrors/CPAN/ + ftp://ftp.piotrkosoft.net/pub/mirrors/CPAN/ + http://ftp.man.poznan.pl/pub/CPAN + ftp://ftp.man.poznan.pl/pub/CPAN + ftp://ftp.ps.pl/pub/CPAN/ + ftp://sunsite.icm.edu.pl/pub/CPAN/ + ftp://ftp.tpnet.pl/d4/CPAN/ =item Portugal - http://cpan.dei.uc.pt/ - ftp://ftp.dei.uc.pt/pub/CPAN - ftp://ftp.ist.utl.pt/pub/CPAN/ - http://cpan.perl.pt/ - http://cpan.ip.pt/ - ftp://cpan.ip.pt/pub/cpan/ - http://mirrors.nfsi.pt/CPAN/ - ftp://mirrors.nfsi.pt/pub/CPAN/ - http://cpan.dcc.fc.up.pt/ + http://cpan.dei.uc.pt/ + ftp://ftp.dei.uc.pt/pub/CPAN + ftp://ftp.ist.utl.pt/pub/CPAN/ + http://cpan.perl.pt/ + http://cpan.ip.pt/ + ftp://cpan.ip.pt/pub/cpan/ + http://mirrors.nfsi.pt/CPAN/ + ftp://mirrors.nfsi.pt/pub/CPAN/ + http://cpan.dcc.fc.up.pt/ =item Romania - http://ftp.astral.ro/pub/CPAN/ - 
ftp://ftp.astral.ro/pub/CPAN/ - ftp://ftp.lug.ro/CPAN - http://mirrors.xservers.ro/CPAN/ - http://mirrors.hostingromania.ro/ftp.cpan.org/ - ftp://ftp.hostingromania.ro/mirrors/ftp.cpan.org/ - ftp://ftp.iasi.roedu.net/pub/mirrors/ftp.cpan.org/ - ftp://ftp.ambra.ro/pub/CPAN + http://ftp.astral.ro/pub/CPAN/ + ftp://ftp.astral.ro/pub/CPAN/ + ftp://ftp.lug.ro/CPAN + http://mirrors.xservers.ro/CPAN/ + http://mirrors.hostingromania.ro/ftp.cpan.org/ + ftp://ftp.hostingromania.ro/mirrors/ftp.cpan.org/ + ftp://ftp.iasi.roedu.net/pub/mirrors/ftp.cpan.org/ =item Russia - ftp://ftp.aha.ru/CPAN/ - http://cpan.rinet.ru/ - ftp://cpan.rinet.ru/pub/mirror/CPAN/ - ftp://ftp.SpringDaemons.com/pub/CPAN/ - http://cpan.nx1.ru/ - ftp://cpan.nx1.ru/ - http://mirror.rol.ru/CPAN/ - http://ftp.silvernet.ru/CPAN/ - http://ftp.spbu.ru/CPAN/ - ftp://ftp.spbu.ru/CPAN/ + ftp://ftp.aha.ru/CPAN/ + http://cpan.rinet.ru/ + ftp://cpan.rinet.ru/pub/mirror/CPAN/ + ftp://ftp.SpringDaemons.com/pub/CPAN/ + http://mirror.rol.ru/CPAN/ + http://ftp.silvernet.ru/CPAN/ + http://ftp.spbu.ru/CPAN/ + ftp://ftp.spbu.ru/CPAN/ =item Slovakia - http://cpan.fyxm.net/ + http://cpan.fyxm.net/ =item Slovenia - http://www.klevze.si/cpan + http://www.klevze.si/cpan =item Spain - http://osl.ugr.es/CPAN/ - ftp://ftp.rediris.es/mirror/CPAN/ - http://ftp.gui.uva.es/sites/cpan.org/ - ftp://ftp.gui.uva.es/sites/cpan.org/ + http://osl.ugr.es/CPAN/ + ftp://ftp.rediris.es/mirror/CPAN/ + http://ftp.gui.uva.es/sites/cpan.org/ + ftp://ftp.gui.uva.es/sites/cpan.org/ =item Sweden - http://mirrors4.kernel.org/cpan/ - ftp://mirrors4.kernel.org/pub/CPAN/ + http://mirrors4.kernel.org/cpan/ + ftp://mirrors4.kernel.org/pub/CPAN/ =item Switzerland - http://cpan.mirror.solnet.ch/ - ftp://ftp.solnet.ch/mirror/CPAN/ - http://mirror.switch.ch/ftp/mirror/CPAN/ - ftp://mirror.switch.ch/mirror/CPAN/ + http://cpan.mirror.solnet.ch/ + ftp://ftp.solnet.ch/mirror/CPAN/ + ftp://ftp.adwired.ch/CPAN/ + http://mirror.switch.ch/ftp/mirror/CPAN/ + 
ftp://mirror.switch.ch/mirror/CPAN/ =item Ukraine - http://cpan.makeperl.org/ - ftp://cpan.makeperl.org/pub/CPAN - http://cpan.org.ua/ - http://no-more.kiev.ua/CPAN/ - ftp://no-more.kiev.ua/pub/CPAN/ - http://cpan.gafol.net/ - ftp://ftp.gafol.net/pub/cpan/ + http://cpan.makeperl.org/ + ftp://cpan.makeperl.org/pub/CPAN + http://cpan.org.ua/ + http://cpan.gafol.net/ + ftp://ftp.gafol.net/pub/cpan/ =item United Kingdom - http://www.mirrorservice.org/sites/ftp.funet.fi/pub/languages/perl/CPAN/ - ftp://ftp.mirrorservice.org/sites/ftp.funet.fi/pub/languages/perl/CPAN/ - http://mirror.tje.me.uk/pub/mirrors/ftp.cpan.org/ - ftp://mirror.tje.me.uk/pub/mirrors/ftp.cpan.org/ - http://www.mirror.8086.net/sites/CPAN/ - ftp://ftp.mirror.8086.net/sites/CPAN/ - http://cpan.mirror.anlx.net/ - ftp://ftp.mirror.anlx.net/CPAN/ - http://mirror.bytemark.co.uk/CPAN/ - ftp://mirror.bytemark.co.uk/CPAN/ - http://cpan.etla.org/ - ftp://cpan.etla.org/pub/CPAN - ftp://ftp.demon.co.uk/pub/CPAN/ - http://mirror.sov.uk.goscomb.net/CPAN/ - ftp://mirror.sov.uk.goscomb.net/pub/CPAN/ - http://ftp.plig.net/pub/CPAN/ - ftp://ftp.plig.net/pub/CPAN/ - http://ftp.ticklers.org/pub/CPAN/ - ftp://ftp.ticklers.org/pub/CPAN/ - http://cpan.mirrors.uk2.net/ - ftp://mirrors.uk2.net/pub/CPAN/ - http://mirror.ox.ac.uk/sites/www.cpan.org/ - ftp://mirror.ox.ac.uk/sites/www.cpan.org/ + http://www.mirrorservice.org/sites/ftp.funet.fi/pub/languages/perl/CPAN/ + ftp://ftp.mirrorservice.org/sites/ftp.funet.fi/pub/languages/perl/CPAN/ + http://mirror.tje.me.uk/pub/mirrors/ftp.cpan.org/ + ftp://mirror.tje.me.uk/pub/mirrors/ftp.cpan.org/ + http://www.mirror.8086.net/sites/CPAN/ + ftp://ftp.mirror.8086.net/sites/CPAN/ + http://cpan.mirror.anlx.net/ + ftp://ftp.mirror.anlx.net/CPAN/ + http://mirror.bytemark.co.uk/CPAN/ + ftp://mirror.bytemark.co.uk/CPAN/ + http://cpan.etla.org/ + ftp://cpan.etla.org/pub/CPAN + ftp://ftp.demon.co.uk/pub/CPAN/ + http://mirror.sov.uk.goscomb.net/CPAN/ + ftp://mirror.sov.uk.goscomb.net/pub/CPAN/ + 
http://ftp.plig.net/pub/CPAN/ + ftp://ftp.plig.net/pub/CPAN/ + http://ftp.ticklers.org/pub/CPAN/ + ftp://ftp.ticklers.org/pub/CPAN/ + http://cpan.mirrors.uk2.net/ + ftp://mirrors.uk2.net/pub/CPAN/ + http://mirror.ox.ac.uk/sites/www.cpan.org/ + ftp://mirror.ox.ac.uk/sites/www.cpan.org/ =back @@ -734,30 +726,28 @@ Registered CPAN sites =item Bahamas - http://www.securehost.com/mirror/CPAN/ + http://www.securehost.com/mirror/CPAN/ =item Canada - http://cpan.justanotherperlhacker.com/pub/CPAN/ - ftp://cpan.justanotherperlhacker.com/pub/CPAN/ - http://cpan.arcticnetwork.ca - ftp://mirror.arcticnetwork.ca/pub/CPAN - http://cpan.sunsite.ualberta.ca/ - ftp://cpan.sunsite.ualberta.ca/pub/CPAN/ - http://theoryx5.uwinnipeg.ca/pub/CPAN/ - ftp://theoryx5.uwinnipeg.ca/pub/CPAN/ - http://arwen.cs.dal.ca/mirror/CPAN/ - ftp://arwen.cs.dal.ca/pub/mirror/CPAN/ - http://CPAN.mirror.rafal.ca/ - ftp://CPAN.mirror.rafal.ca/pub/CPAN/ - ftp://ftp.nrc.ca/pub/CPAN/ - http://mirror.csclub.uwaterloo.ca/pub/CPAN/ - ftp://mirror.csclub.uwaterloo.ca/pub/CPAN/ + http://cpan.arcticnetwork.ca + ftp://mirror.arcticnetwork.ca/pub/CPAN + http://cpan.sunsite.ualberta.ca/ + ftp://cpan.sunsite.ualberta.ca/pub/CPAN/ + http://theoryx5.uwinnipeg.ca/pub/CPAN/ + ftp://theoryx5.uwinnipeg.ca/pub/CPAN/ + http://arwen.cs.dal.ca/mirror/CPAN/ + ftp://arwen.cs.dal.ca/pub/mirror/CPAN/ + http://CPAN.mirror.rafal.ca/ + ftp://CPAN.mirror.rafal.ca/pub/CPAN/ + ftp://ftp.nrc.ca/pub/CPAN/ + http://mirror.csclub.uwaterloo.ca/pub/CPAN/ + ftp://mirror.csclub.uwaterloo.ca/pub/CPAN/ =item Mexico - http://www.msg.com.mx/CPAN/ - ftp://ftp.msg.com.mx/pub/CPAN/ + http://www.msg.com.mx/CPAN/ + ftp://ftp.msg.com.mx/pub/CPAN/ =item United States @@ -765,150 +755,145 @@ Registered CPAN sites =item Alabama - http://mirror.hiwaay.net/CPAN/ - ftp://mirror.hiwaay.net/CPAN/ + http://mirror.hiwaay.net/CPAN/ + ftp://mirror.hiwaay.net/CPAN/ + +=item Arizona + + http://cpan.ezarticleinformation.com/ =item California - 
http://cpan.knowledgematters.net/ - http://cpan.binkerton.com/ - http://cpan.develooper.com/ - http://mirrors.gossamer-threads.com/CPAN - http://cpan.schatt.com/ - http://mirrors.kernel.org/cpan/ - ftp://mirrors.kernel.org/pub/CPAN - http://mirrors2.kernel.org/cpan/ - ftp://mirrors2.kernel.org/pub/CPAN/ - http://cpan.mirrors.redwire.net/ - http://cpan.mirror.facebook.net/ - http://mirrors1.kernel.org/cpan/ - ftp://mirrors1.kernel.org/pub/CPAN/ - http://cpan-sj.viaverio.com/ - ftp://cpan-sj.viaverio.com/pub/CPAN/ - http://www.perl.com/CPAN/ - http://cpan.yahoo.com/ + http://cpan.knowledgematters.net/ + http://cpan.binkerton.com/ + http://cpan.develooper.com/ + http://mirrors.gossamer-threads.com/CPAN + http://cpan.schatt.com/ + http://mirrors.kernel.org/cpan/ + ftp://mirrors.kernel.org/pub/CPAN + http://mirrors2.kernel.org/cpan/ + ftp://mirrors2.kernel.org/pub/CPAN/ + http://cpan.mirror.facebook.net/ + http://mirrors1.kernel.org/cpan/ + ftp://mirrors1.kernel.org/pub/CPAN/ + http://cpan-sj.viaverio.com/ + ftp://cpan-sj.viaverio.com/pub/CPAN/ + http://www.perl.com/CPAN/ =item Florida - ftp://ftp.cise.ufl.edu/pub/mirrors/CPAN/ - http://mirror.atlantic.net/pub/CPAN/ - ftp://mirror.atlantic.net/pub/CPAN/ - http://mirror.candidhosting.com/pub/CPAN - ftp://mirror.candidhosting.com/pub/CPAN + ftp://ftp.cise.ufl.edu/pub/mirrors/CPAN/ + http://mirror.atlantic.net/pub/CPAN/ + ftp://mirror.atlantic.net/pub/CPAN/ =item Idaho - http://mirror.its.uidaho.edu/pub/cpan/ - ftp://mirror.its.uidaho.edu/cpan/ + http://mirror.its.uidaho.edu/pub/cpan/ + ftp://mirror.its.uidaho.edu/cpan/ =item Illinois - http://cpan.mirrors.hoobly.com/ - http://cpan.uchicago.edu/pub/CPAN/ - ftp://cpan.uchicago.edu/pub/CPAN/ - http://mirrors.servercentral.net/CPAN/ - http://www.stathy.com/CPAN/ - ftp://www.stathy.com/CPAN/ + http://cpan.mirrors.hoobly.com/ + http://cpan.uchicago.edu/pub/CPAN/ + ftp://cpan.uchicago.edu/pub/CPAN/ + http://mirrors.servercentral.net/CPAN/ + http://www.stathy.com/CPAN/ + 
ftp://www.stathy.com/CPAN/ =item Indiana - ftp://ftp.uwsg.iu.edu/pub/perl/CPAN/ - http://cpan.netnitco.net/ - ftp://cpan.netnitco.net/pub/mirrors/CPAN/ - http://ftp.ndlug.nd.edu/pub/perl/ - ftp://ftp.ndlug.nd.edu/pub/perl/ - http://fx.saintjoe.edu/pub/CPAN + ftp://ftp.uwsg.iu.edu/pub/perl/CPAN/ + http://cpan.netnitco.net/ + ftp://cpan.netnitco.net/pub/mirrors/CPAN/ + http://ftp.ndlug.nd.edu/pub/perl/ + ftp://ftp.ndlug.nd.edu/pub/perl/ =item Massachusetts - ftp://ftp.ccs.neu.edu/net/mirrors/ftp.funet.fi/pub/languages/perl/CPAN/ + http://mirrors.ccs.neu.edu/CPAN/ =item Michigan - http://ftp.wayne.edu/cpan/ - ftp://ftp.wayne.edu/cpan/ + http://ftp.wayne.edu/cpan/ + ftp://ftp.wayne.edu/cpan/ =item Minnesota - http://cpan.msi.umn.edu/ + http://cpan.msi.umn.edu/ =item New Jersey - http://mirror.datapipe.net/CPAN/ - ftp://mirror.datapipe.net/pub/CPAN/ + http://mirror.datapipe.net/CPAN/ + ftp://mirror.datapipe.net/pub/CPAN/ =item New York - http://mirrors.24-7-solutions.net/pub/CPAN/ - ftp://mirrors.24-7-solutions.net/pub/CPAN/ - http://mirror.cc.columbia.edu/pub/software/cpan/ - ftp://mirror.cc.columbia.edu/pub/software/cpan/ - http://cpan.belfry.net/ - http://cpan.erlbaum.net/ - ftp://cpan.erlbaum.net/CPAN/ - http://cpan.hexten.net/ - ftp://cpan.hexten.net/ - http://ftp.fxcorporate.com/CPAN/ - ftp://ftp.fxcorporate.com/pub/CPAN/ - ftp://mirror.nyi.net/CPAN/ - http://mirror.rit.edu/CPAN/ - ftp://mirror.rit.edu/CPAN/ + http://mirrors.24-7-solutions.net/pub/CPAN/ + ftp://mirrors.24-7-solutions.net/pub/CPAN/ + http://mirror.cc.columbia.edu/pub/software/cpan/ + ftp://mirror.cc.columbia.edu/pub/software/cpan/ + http://cpan.belfry.net/ + http://cpan.erlbaum.net/ + ftp://cpan.erlbaum.net/CPAN/ + http://cpan.hexten.net/ + ftp://cpan.hexten.net/ + ftp://mirror.nyi.net/CPAN/ + http://mirror.rit.edu/CPAN/ + ftp://mirror.rit.edu/CPAN/ =item North Carolina - http://www.ibiblio.org/pub/mirrors/CPAN - ftp://ftp.ncsu.edu/pub/mirror/CPAN/ + http://www.ibiblio.org/pub/mirrors/CPAN + 
ftp://ftp.ncsu.edu/pub/mirror/CPAN/ =item Oregon - http://ftp.osuosl.org/pub/CPAN/ - ftp://ftp.osuosl.org/pub/CPAN/ + http://ftp.osuosl.org/pub/CPAN/ + ftp://ftp.osuosl.org/pub/CPAN/ =item Pennsylvania - http://ftp.epix.net/CPAN/ - ftp://ftp.epix.net/pub/languages/perl/ - http://cpan.pair.com/ - ftp://cpan.pair.com/pub/CPAN/ + http://ftp.epix.net/CPAN/ + ftp://ftp.epix.net/pub/languages/perl/ + http://cpan.pair.com/ + ftp://cpan.pair.com/pub/CPAN/ =item South Carolina - http://cpan.mirror.clemson.edu/ + http://cpan.mirror.clemson.edu/ =item Tennessee - http://mira.sunsite.utk.edu/CPAN/ + http://mira.sunsite.utk.edu/CPAN/ =item Texas - http://mirror.uta.edu/CPAN + http://mirror.uta.edu/CPAN =item Utah - http://cpan.cs.utah.edu - ftp://cpan.cs.utah.edu/pub/CPAN/ - ftp://mirror.xmission.com/CPAN/ + ftp://mirror.xmission.com/CPAN/ =item Virginia - http://cpan-du.viaverio.com/ - ftp://cpan-du.viaverio.com/pub/CPAN/ - http://perl.secsup.org/ - ftp://perl.secsup.org/pub/perl/ - ftp://mirror.cogentco.com/pub/CPAN/ + http://cpan-du.viaverio.com/ + ftp://cpan-du.viaverio.com/pub/CPAN/ + http://perl.secsup.org/ + ftp://perl.secsup.org/pub/perl/ + ftp://mirror.cogentco.com/pub/CPAN/ =item Washington - http://cpan.llarian.net/ - ftp://cpan.llarian.net/pub/CPAN/ - ftp://ftp-mirror.internap.com/pub/CPAN/ + http://cpan.llarian.net/ + ftp://cpan.llarian.net/pub/CPAN/ + ftp://ftp-mirror.internap.com/pub/CPAN/ =item Wisconsin - http://cpan.mirrors.tds.net - ftp://cpan.mirrors.tds.net/pub/CPAN - http://mirror.sit.wisc.edu/pub/CPAN/ - ftp://mirror.sit.wisc.edu/pub/CPAN/ + http://cpan.mirrors.tds.net + ftp://cpan.mirrors.tds.net/pub/CPAN + http://mirror.sit.wisc.edu/pub/CPAN/ + ftp://mirror.sit.wisc.edu/pub/CPAN/ =back @@ -920,19 +905,19 @@ Registered CPAN sites =item Australia - http://mirror.internode.on.net/pub/cpan/ - ftp://mirror.internode.on.net/pub/cpan/ - http://cpan.mirror.aussiehq.net.au/ - http://mirror.as24220.net/cpan/ - ftp://mirror.as24220.net/cpan/ + 
http://mirror.internode.on.net/pub/cpan/ + ftp://mirror.internode.on.net/pub/cpan/ + http://cpan.mirror.aussiehq.net.au/ + http://mirror.as24220.net/cpan/ + ftp://mirror.as24220.net/cpan/ =item New Zealand - ftp://ftp.auckland.ac.nz/pub/perl/CPAN/ - http://cpan.inspire.net.nz - ftp://cpan.inspire.net.nz/cpan - http://cpan.catalyst.net.nz/CPAN/ - ftp://cpan.catalyst.net.nz/pub/CPAN/ + ftp://ftp.auckland.ac.nz/pub/perl/CPAN/ + http://cpan.inspire.net.nz + ftp://cpan.inspire.net.nz/cpan + http://cpan.catalyst.net.nz/CPAN/ + ftp://cpan.catalyst.net.nz/pub/CPAN/ =back @@ -942,26 +927,25 @@ Registered CPAN sites =item Argentina - http://cpan.patan.com.ar/ - http://cpan.localhost.net.ar - ftp://mirrors.localhost.net.ar/pub/mirrors/CPAN + http://cpan.patan.com.ar/ + http://cpan.localhost.net.ar + ftp://mirrors.localhost.net.ar/pub/mirrors/CPAN =item Brazil - ftp://cpan.pop-mg.com.br/pub/CPAN/ - http://ftp.pucpr.br/CPAN - ftp://ftp.pucpr.br/CPAN - http://cpan.kinghost.net/ - ftp://ftp.linorg.usp.br/CPAN + ftp://cpan.pop-mg.com.br/pub/CPAN/ + http://ftp.pucpr.br/CPAN + ftp://ftp.pucpr.br/CPAN + http://cpan.kinghost.net/ =item Chile - http://cpan.dcc.uchile.cl/ - ftp://cpan.dcc.uchile.cl/pub/lang/cpan/ + http://cpan.dcc.uchile.cl/ + ftp://cpan.dcc.uchile.cl/pub/lang/cpan/ =item Colombia - http://www.laqee.unal.edu.co/CPAN/ + http://www.laqee.unal.edu.co/CPAN/ =back @@ -1064,7 +1048,7 @@ its methods by loading dynamic C or C++ objects, but that should be totally transparent to the user of the module. Likewise, the module might set up an AUTOLOAD function to slurp in subroutine definitions on demand, but this is also transparent. Only the F<.pm> file is required to -exist. See L<perlsub>, L<perltoot>, and L<AutoLoader> for details about +exist. See L<perlsub>, L<perlobj>, and L<AutoLoader> for details about the AUTOLOAD mechanism. =head2 Guidelines for Module Creation @@ -1124,7 +1108,7 @@ Let the objects look after themselves! 
Generally, avoid hard-wired class names as far as possible. Avoid C<< $r->Class::func() >> where using C<@ISA=qw(... Class ...)> and -C<< $r->func() >> would work (see L<perlbot> for more details). +C<< $r->func() >> would work. Use autosplit so little used or newly added functions won't be a burden to programs that don't use them. Add test functions to @@ -1348,7 +1332,7 @@ Give the module a version/issue/release number. To be fully compatible with the Exporter and MakeMaker modules you should store your module's version number in a non-my package -variable called $VERSION. This should be a floating point +variable called $VERSION. This should be a positive floating point number with at least two digits after the decimal (i.e., hundredths, e.g, C<$VERSION = "0.01">). Don't use a "1.3.2" style version. See L<Exporter> for details. @@ -1409,6 +1393,8 @@ old behavior if people rely on it. Document incompatible changes. =back +=back + =head2 Guidelines for Converting Perl 4 Library Scripts into Modules =over 4 @@ -1507,8 +1493,6 @@ or =back -=back - =head1 NOTE Perl does not enforce private and public parts of its modules as you may @@ -1523,7 +1507,8 @@ that a module doesn't pollute any namespace it wasn't asked to. The written contract for the module (A.K.A. documentation) may make other provisions. But then you know when you C<use RedefineTheWorld> that you're redefining the world and willing to take the consequences. -EOF + +=cut close MANIFEST or warn "$0: failed to close MANIFEST (../MANIFEST): $!"; close OUT or warn "$0: failed to close OUT (perlmodlib.pod): $!"; diff --git a/gnu/usr.bin/perl/pod/perlmodstyle.pod b/gnu/usr.bin/perl/pod/perlmodstyle.pod index dfe5662f942..df813a0f3dd 100644 --- a/gnu/usr.bin/perl/pod/perlmodstyle.pod +++ b/gnu/usr.bin/perl/pod/perlmodstyle.pod @@ -254,55 +254,58 @@ Your module may be object oriented (OO) or not, or it may have both kinds of interfaces available. 
There are pros and cons of each technique, which should be considered when you design your API. -According to Damian Conway, you should consider using OO: +In I<Perl Best Practices> (copyright 2004, Published by O'Reilly Media, Inc.), +Damian Conway provides a list of criteria to use when deciding if OO is the +right fit for your problem: =over 4 -=item * +=item * -When the system is large or likely to become so +The system being designed is large, or is likely to become large. -=item * +=item * -When the data is aggregated in obvious structures that will become objects +The data can be aggregated into obvious structures, especially if +there's a large amount of data in each aggregate. -=item * +=item * -When the types of data form a natural hierarchy that can make use of inheritance +The various types of data aggregate form a natural hierarchy that +facilitates the use of inheritance and polymorphism. =item * -When operations on data vary according to data type (making -polymorphic invocation of methods feasible) +You have a piece of data on which many different operations are +applied. =item * -When it is likely that new data types may be later introduced -into the system, and will need to be handled by existing code +You need to perform the same general operations on related types of +data, but with slight variations depending on the specific type of data +the operations are applied to. =item * -When interactions between data are best represented by -overloaded operators +It's likely you'll have to add new data types later. =item * -When the implementation of system components is likely to -change over time (and hence should be encapsulated) +The typical interactions between pieces of data are best represented by +operators. =item * -When the system design is itself object-oriented +The implementation of individual components of the system is likely to +change over time. 
=item * -When large amounts of client code will use the software (and -should be insulated from changes in its implementation) +The system design is already object-oriented. =item * -When many separate operations will need to be applied to the -same set of data +Large numbers of other programmers will be using your code modules. =back diff --git a/gnu/usr.bin/perl/pod/perlmroapi.pod b/gnu/usr.bin/perl/pod/perlmroapi.pod index 2200becded2..54da7dded25 100644 --- a/gnu/usr.bin/perl/pod/perlmroapi.pod +++ b/gnu/usr.bin/perl/pod/perlmroapi.pod @@ -9,7 +9,7 @@ resolution orders other than the default (linear depth first search). The C3 method resolution order added in 5.10.0 has been re-implemented as a plugin, without changing its Perl-space interface. -Each plugin should register itself with C<Perl_mro_register> by providing +Each plugin should register itself by providing the following structure struct mro_alg { @@ -20,6 +20,10 @@ the following structure U32 hash; }; +and calling C<Perl_mro_register>: + + Perl_mro_register(aTHX_ &my_mro_alg); + =over 4 =item resolve @@ -54,8 +58,12 @@ function - the parameter is provided to allow your implementation to track depth if it needs to recurse. The function should return a reference to an array containing the parent -classes in order. The caller is responsible for incrementing the reference -count if it wants to keep the structure. Hence if you have created a +classes in order. The names of the classes should be the result of calling +C<HvENAME()> on the stash. In those cases where C<HvENAME()> returns null, +C<HvNAME()> should be used instead. + +The caller is responsible for incrementing the reference count of the array +returned if it wants to keep the structure. Hence, if you have created a temporary value that you keep no pointer to, C<sv_2mortal()> to ensure that it is disposed of correctly. If you have cached your return value, then return a pointer to it without changing the reference count. 
diff --git a/gnu/usr.bin/perl/pod/perlootut.pod b/gnu/usr.bin/perl/pod/perlootut.pod new file mode 100644 index 00000000000..b2e3500b358 --- /dev/null +++ b/gnu/usr.bin/perl/pod/perlootut.pod @@ -0,0 +1,741 @@ +=encoding utf8 + +=for comment +Consistent formatting of this file is achieved with: + perl ./Porting/podtidy pod/perlootut.pod + +=head1 NAME + +perlootut - Object-Oriented Programming in Perl Tutorial + +=head1 DATE + +This document was created in February, 2011. + +=head1 DESCRIPTION + +This document provides an introduction to object-oriented programming +in Perl. It begins with a brief overview of the concepts behind object +oriented design. Then it introduces several different OO systems from +L<CPAN|http://search.cpan.org> which build on top of what Perl +provides. + +By default, Perl's built-in OO system is very minimal, leaving you to +do most of the work. This minimalism made a lot of sense in 1994, but +in the years since Perl 5.0 we've seen a number of common patterns +emerge in Perl OO. Fortunately, Perl's flexibility has allowed a rich +ecosystem of Perl OO systems to flourish. + +If you want to know how Perl OO works under the hood, the L<perlobj> +document explains the nitty gritty details. + +This document assumes that you already understand the basics of Perl +syntax, variable types, operators, and subroutine calls. If you don't +understand these concepts yet, please read L<perlintro> first. You +should also read the L<perlsyn>, L<perlop>, and L<perlsub> documents. + +=head1 OBJECT-ORIENTED FUNDAMENTALS + +Most object systems share a number of common concepts. You've probably +heard terms like "class", "object", "method", and "attribute" before. +Understanding the concepts will make it much easier to read and write +object-oriented code. If you're already familiar with these terms, you +should still skim this section, since it explains each concept in terms +of Perl's OO implementation. + +Perl's OO system is class-based.
Class-based OO is fairly common. It's +used by Java, C++, C#, Python, Ruby, and many other languages. There +are other object orientation paradigms as well. JavaScript is the most +popular language to use another paradigm. JavaScript's OO system is +prototype-based. + +=head2 Object + +An B<object> is a data structure that bundles together data and +subroutines which operate on that data. An object's data is called +B<attributes>, and its subroutines are called B<methods>. An object can +be thought of as a noun (a person, a web service, a computer). + +An object represents a single discrete thing. For example, an object +might represent a file. The attributes for a file object might include +its path, content, and last modification time. If we created an object +to represent F</etc/hostname> on a machine named "foo.example.com", +that object's path would be "/etc/hostname", its content would be +"foo\n", and its last modification time would be 1304974868 seconds +since the beginning of the epoch. + +The methods associated with a file might include C<rename()> and +C<write()>. + +In Perl most objects are hashes, but the OO systems we recommend keep +you from having to worry about this. In practice, it's best to consider +an object's internal data structure opaque. + +=head2 Class + +A B<class> defines the behavior of a category of objects. A class is a +name for a category (like "File"), and a class also defines the +behavior of objects in that category. + +All objects belong to a specific class. For example, our +F</etc/hostname> object belongs to the C<File> class. When we want to +create a specific object, we start with its class, and B<construct> or +B<instantiate> an object. A specific object is often referred to as an +B<instance> of a class. + +In Perl, any package can be a class. The difference between a package +which is a class and one which isn't is based on how the package is +used.
Here's our "class declaration" for the C<File> class: + + package File; + +In Perl, there is no special keyword for constructing an object. +However, most OO modules on CPAN use a method named C<new()> to +construct a new object: + + my $hostname = File->new( + path => '/etc/hostname', + content => "foo\n", + last_mod_time => 1304974868, + ); + +(Don't worry about that C<< -> >> operator, it will be explained +later.) + +=head3 Blessing + +As we said earlier, most Perl objects are hashes, but an object can be +an instance of any Perl data type (scalar, array, etc.). Turning a +plain data structure into an object is done by B<blessing> that data +structure using Perl's C<bless> function. + +While we strongly suggest you don't build your objects from scratch, +you should know the term B<bless>. A B<blessed> data structure (aka "a +referent") is an object. We sometimes say that an object has been +"blessed into a class". + +Once a referent has been blessed, the C<blessed> function from the +L<Scalar::Util> core module can tell us its class name. This subroutine +returns an object's class when passed an object, and false otherwise. + + use Scalar::Util 'blessed'; + + print blessed($hash); # undef + print blessed($hostname); # File + +=head3 Constructor + +A B<constructor> creates a new object. In Perl, a class's constructor +is just another method, unlike some other languages, which provide +syntax for constructors. Most Perl classes use C<new> as the name for +their constructor: + + my $file = File->new(...); + +=head2 Methods + +You already learned that a B<method> is a subroutine that operates on +an object. You can think of a method as the things that an object can +I<do>. If an object is a noun, then methods are its verbs (save, print, +open). + +In Perl, methods are simply subroutines that live in a class's package. 
+Methods are always written to receive the object as their first +argument: + + sub print_info { + my $self = shift; + + print "This file is at ", $self->path, "\n"; + } + + $file->print_info; + # This file is at /etc/hostname + +What makes a method special is I<how it's called>. The arrow operator +(C<< -> >>) tells Perl that we are calling a method. + +When we make a method call, Perl arranges for the method's B<invocant> +to be passed as the first argument. B<Invocant> is a fancy name for the +thing on the left side of the arrow. The invocant can either be a class +name or an object. We can also pass additional arguments to the method: + + sub print_info { + my $self = shift; + my $prefix = shift // "This file is at "; + + print $prefix, $self->path, "\n"; + } + + $file->print_info("The file is located at "); + # The file is located at /etc/hostname + +=head2 Attributes + +Each class can define its B<attributes>. When we instantiate an object, +we assign values to those attributes. For example, every C<File> object +has a path. Attributes are sometimes called B<properties>. + +Perl has no special syntax for attributes. Under the hood, attributes +are often stored as keys in the object's underlying hash, but don't +worry about this. + +We recommend that you only access attributes via B<accessor> methods. +These are methods that can get or set the value of each attribute. We +saw this earlier in the C<print_info()> example, which calls C<< +$self->path >>. + +You might also see the terms B<getter> and B<setter>. These are two +types of accessors. A getter gets the attribute's value, while a setter +sets it. Another term for a setter is B<mutator>. + +Attributes are typically defined as read-only or read-write. Read-only +attributes can only be set when the object is first created, while +read-write attributes can be altered at any time. + +The value of an attribute may itself be another object.
For example, +instead of returning its last mod time as a number, the C<File> class +could return a L<DateTime> object representing that value. + +It's possible to have a class that does not expose any publicly +settable attributes. Not every class has attributes and methods. + +=head2 Polymorphism + +B<Polymorphism> is a fancy way of saying that objects from two +different classes share an API. For example, we could have C<File> and +C<WebPage> classes which both have a C<print_content()> method. This +method might produce different output for each class, but they share a +common interface. + +While the two classes may differ in many ways, when it comes to the +C<print_content()> method, they are the same. This means that we can +try to call the C<print_content()> method on an object of either class, +and B<we don't have to know what class the object belongs to!> + +Polymorphism is one of the key concepts of object-oriented design. + +=head2 Inheritance + +B<Inheritance> lets you create a specialized version of an existing +class. Inheritance lets the new class reuse the methods and +attributes of another class. + +For example, we could create a C<File::MP3> class which B<inherits> +from C<File>. A C<File::MP3> B<is-a> I<more specific> type of C<File>. +All mp3 files are files, but not all files are mp3 files. + +We often refer to inheritance relationships as B<parent-child> or +B<superclass/subclass> relationships. Sometimes we say that the child +has an B<is-a> relationship with its parent class. + +C<File> is a B<superclass> of C<File::MP3>, and C<File::MP3> is a +B<subclass> of C<File>. + + package File::MP3; + + use parent 'File'; + +The L<parent> module is one of several ways that Perl lets you define +inheritance relationships. + +Perl allows multiple inheritance, which means that a class can inherit +from multiple parents. While this is possible, we strongly recommend +against it.
Generally, you can use B<roles> to do everything you can do +with multiple inheritance, but in a cleaner way. + +Note that there's nothing wrong with defining multiple subclasses of a +given class. This is both common and safe. For example, we might define +C<File::MP3::FixedBitrate> and C<File::MP3::VariableBitrate> classes to +distinguish between different types of mp3 file. + +=head3 Overriding methods and method resolution + +Inheritance allows two classes to share code. By default, every method +in the parent class is also available in the child. The child can +explicitly B<override> a parent's method to provide its own +implementation. For example, if we have a C<File::MP3> object, it has +the C<print_info()> method from C<File>: + + my $cage = File::MP3->new( + path => 'mp3s/My-Body-Is-a-Cage.mp3', + content => $mp3_data, + last_mod_time => 1304974868, + title => 'My Body Is a Cage', + ); + + $cage->print_info; + # This file is at mp3s/My-Body-Is-a-Cage.mp3 + +If we wanted to include the mp3's title in the greeting, we could +override the method: + + package File::MP3; + + use parent 'File'; + + sub print_info { + my $self = shift; + + print "This file is at ", $self->path, "\n"; + print "Its title is ", $self->title, "\n"; + } + + $cage->print_info; + # This file is at mp3s/My-Body-Is-a-Cage.mp3 + # Its title is My Body Is a Cage + +The process of determining what method should be used is called +B<method resolution>. What Perl does is look at the object's class +first (C<File::MP3> in this case). If that class defines the method, +then that class's version of the method is called. If not, Perl looks +at each parent class in turn. For C<File::MP3>, its only parent is +C<File>. If C<File::MP3> does not define the method, but C<File> does, +then Perl calls the method in C<File>. + +If C<File> inherited from C<DataSource>, which inherited from C<Thing>, +then Perl would keep looking "up the chain" if necessary.
+ +It is possible to explicitly call a parent method from a child: + + package File::MP3; + + use parent 'File'; + + sub print_info { + my $self = shift; + + $self->SUPER::print_info(); + print "Its title is ", $self->title, "\n"; + } + +The C<SUPER::> bit tells Perl to look for the C<print_info()> in the +C<File::MP3> class's inheritance chain. When it finds the parent class +that implements this method, the method is called. + +We mentioned multiple inheritance earlier. The main problem with +multiple inheritance is that it greatly complicates method resolution. +See L<perlobj> for more details. + +=head2 Encapsulation + +B<Encapsulation> is the idea that an object is opaque. When another +developer uses your class, they don't need to know I<how> it is +implemented, they just need to know I<what> it does. + +Encapsulation is important for several reasons. First, it allows you to +separate the public API from the private implementation. This means you +can change that implementation without breaking the API. + +Second, when classes are well encapsulated, they become easier to +subclass. Ideally, a subclass uses the same APIs to access object data +that its parent class uses. In reality, subclassing sometimes involves +violating encapsulation, but a good API can minimize the need to do +this. + +We mentioned earlier that most Perl objects are implemented as hashes +under the hood. The principle of encapsulation tells us that we should +not rely on this. Instead, we should use accessor methods to access the +data in that hash. The object systems that we recommend below all +automate the generation of accessor methods. If you use one of them, +you should never have to access the object as a hash directly. + +=head2 Composition + +In object-oriented code, we often find that one object references +another object. This is called B<composition>, or a B<has-a> +relationship. 
+ +Earlier, we mentioned that the C<File> class's C<last_mod_time> +accessor could return a L<DateTime> object. This is a perfect example +of composition. We could go even further, and make the C<path> and +C<content> accessors return objects as well. The C<File> class would +then be B<composed> of several other objects. + +=head2 Roles + +B<Roles> are something that a class I<does>, rather than something that +it I<is>. Roles are relatively new to Perl, but have become rather +popular. Roles are B<applied> to classes. Sometimes we say that classes +B<consume> roles. + +Roles are an alternative to inheritance for providing polymorphism. +Let's assume we have two classes, C<Radio> and C<Computer>. Both of +these things have on/off switches. We want to model that in our class +definitions. + +We could have both classes inherit from a common parent, like +C<Machine>, but not all machines have on/off switches. We could create +a parent class called C<HasOnOffSwitch>, but that is very artificial. +Radios and computers are not specializations of this parent. This +parent is really a rather ridiculous creation. + +This is where roles come in. It makes a lot of sense to create a +C<HasOnOffSwitch> role and apply it to both classes. This role would +define a known API like providing C<turn_on()> and C<turn_off()> +methods. + +Perl does not have any built-in way to express roles. In the past, +people just bit the bullet and used multiple inheritance. Nowadays, +there are several good choices on CPAN for using roles. + +=head2 When to Use OO + +Object Orientation is not the best solution to every problem. In I<Perl +Best Practices> (copyright 2004, Published by O'Reilly Media, Inc.), +Damian Conway provides a list of criteria to use when deciding if OO is +the right fit for your problem: + +=over 4 + +=item * + +The system being designed is large, or is likely to become large. 
+ +=item * + +The data can be aggregated into obvious structures, especially if +there's a large amount of data in each aggregate. + +=item * + +The various types of data aggregate form a natural hierarchy that +facilitates the use of inheritance and polymorphism. + +=item * + +You have a piece of data on which many different operations are +applied. + +=item * + +You need to perform the same general operations on related types of +data, but with slight variations depending on the specific type of data +the operations are applied to. + +=item * + +It's likely you'll have to add new data types later. + +=item * + +The typical interactions between pieces of data are best represented by +operators. + +=item * + +The implementation of individual components of the system is likely to +change over time. + +=item * + +The system design is already object-oriented. + +=item * + +Large numbers of other programmers will be using your code modules. + +=back + +=head1 PERL OO SYSTEMS + +As we mentioned before, Perl's built-in OO system is very minimal, but +also quite flexible. Over the years, many people have developed systems +which build on top of Perl's built-in system to provide more features +and convenience. + +We strongly recommend that you use one of these systems. Even the most +minimal of them eliminates a lot of repetitive boilerplate. There's +really no good reason to write your classes from scratch in Perl. + +If you are interested in the guts underlying these systems, check out +L<perlobj>. + +=head2 Moose + +L<Moose> bills itself as a "postmodern object system for Perl 5". Don't +be scared, the "postmodern" label is a callback to Larry's description +of Perl as "the first postmodern computer language". + +C<Moose> provides a complete, modern OO system. Its biggest influence +is the Common Lisp Object System, but it also borrows ideas from +Smalltalk and several other languages. 
C<Moose> was created by Stevan +Little, and draws heavily from his work on the Perl 6 OO design. + +Here is our C<File> class using C<Moose>: + + package File; + use Moose; + + has path => ( is => 'ro' ); + has content => ( is => 'ro' ); + has last_mod_time => ( is => 'ro' ); + + sub print_info { + my $self = shift; + + print "This file is at ", $self->path, "\n"; + } + +C<Moose> provides a number of features: + +=over 4 + +=item * Declarative sugar + +C<Moose> provides a layer of declarative "sugar" for defining classes. +That sugar is just a set of exported functions that make declaring how +your class works simpler and more palatable. This lets you describe +I<what> your class is, rather than having to tell Perl I<how> to +implement your class. + +The C<has()> subroutine declares an attribute, and C<Moose> +automatically creates accessors for these attributes. It also takes +care of creating a C<new()> method for you. This constructor knows +about the attributes you declared, so you can set them when creating a +new C<File>. + +=item * Roles built-in + +C<Moose> lets you define roles the same way you define classes: + + package HasOnOffSwitch; + use Moose::Role; + + has is_on => ( + is => 'rw', + isa => 'Bool', + ); + + sub turn_on { + my $self = shift; + $self->is_on(1); + } + + sub turn_off { + my $self = shift; + $self->is_on(0); + } + +=item * A miniature type system + +In the example above, you can see that we passed C<< isa => 'Bool' >> +to C<has()> when creating our C<is_on> attribute. This tells C<Moose> +that this attribute must be a boolean value. If we try to set it to an +invalid value, our code will throw an error. + +=item * Full introspection and manipulation + +Perl's built-in introspection features are fairly minimal. C<Moose> +builds on top of them and creates a full introspection layer for your +classes. This lets you ask questions like "what methods does the File +class implement?" It also lets you modify your classes +programmatically.
+ +=item * Self-hosted and extensible + +C<Moose> describes itself using its own introspection API. Besides +being a cool trick, this means that you can extend C<Moose> using +C<Moose> itself. + +=item * Rich ecosystem + +There is a rich ecosystem of C<Moose> extensions on CPAN under the +L<MooseX|http://search.cpan.org/search?query=MooseX&mode=dist> +namespace. In addition, many modules on CPAN already use C<Moose>, +providing you with lots of examples to learn from. + +=item * Many more features + +C<Moose> is a very powerful tool, and we can't cover all of its +features here. We encourage you to learn more by reading the C<Moose> +documentation, starting with +L<Moose::Manual|http://search.cpan.org/perldoc?Moose::Manual>. + +=back + +Of course, C<Moose> isn't perfect. + +C<Moose> can make your code slower to load. C<Moose> itself is not +small, and it does a I<lot> of code generation when you define your +class. This code generation means that your runtime code is as fast as +it can be, but you pay for this when your modules are first loaded. + +This load time hit can be a problem when startup speed is important, +such as with a command-line script or a "plain vanilla" CGI script that +must be loaded each time it is executed. + +Before you panic, know that many people do use C<Moose> for +command-line tools and other startup-sensitive code. We encourage you +to try C<Moose> out first before worrying about startup speed. + +C<Moose> also has several dependencies on other modules. Most of these +are small stand-alone modules, a number of which have been spun off +from C<Moose>. C<Moose> itself, and some of its dependencies, require a +compiler. If you need to install your software on a system without a +compiler, or if having I<any> dependencies is a problem, then C<Moose> +may not be right for you. + +=head3 Mouse + +If you try C<Moose> and find that one of these issues is preventing you +from using C<Moose>, we encourage you to consider L<Mouse> next. 
+C<Mouse> implements a subset of C<Moose>'s functionality in a simpler +package. For all features that it does implement, the end-user API is +I<identical> to C<Moose>, meaning you can switch from C<Mouse> to +C<Moose> quite easily. + +C<Mouse> does not implement most of C<Moose>'s introspection API, so +it's often faster when loading your modules. Additionally, all of its +I<required> dependencies ship with the Perl core, and it can run +without a compiler. If you do have a compiler, C<Mouse> will use it to +compile some of its code for a speed boost. + +Finally, it ships with a C<Mouse::Tiny> module that takes most of +C<Mouse>'s features and bundles them up in a single module file. You +can copy this module file into your application's library directory for +easy bundling. + +The C<Moose> authors hope that one day C<Mouse> can be made obsolete by +improving C<Moose> enough, but for now it provides a worthwhile +alternative to C<Moose>. + +=head2 Class::Accessor + +L<Class::Accessor> is the polar opposite of C<Moose>. It provides very +few features, nor is it self-hosting. + +It is, however, very simple, pure Perl, and it has no non-core +dependencies. It also provides a "Moose-like" API on demand for the +features it supports. + +Even though it doesn't do much, it is still preferable to writing your +own classes from scratch. + +Here's our C<File> class with C<Class::Accessor>: + + package File; + use Class::Accessor 'antlers'; + + has path => ( is => 'ro' ); + has content => ( is => 'ro' ); + has last_mod_time => ( is => 'ro' ); + + sub print_info { + my $self = shift; + + print "This file is at ", $self->path, "\n"; + } + +The C<antlers> import flag tells C<Class::Accessor> that you want to +define your attributes using C<Moose>-like syntax. The only parameter +that you can pass to C<has> is C<is>. 
We recommend that you use this +Moose-like syntax if you choose C<Class::Accessor> since it means you +will have a smoother upgrade path if you later decide to move to +C<Moose>. + +Like C<Moose>, C<Class::Accessor> generates accessor methods and a +constructor for your class. + +=head2 Object::Tiny + +Finally, we have L<Object::Tiny>. This module truly lives up to its +name. It has an incredibly minimal API and absolutely no dependencies +(core or not). Still, we think it's a lot easier to use than writing +your own OO code from scratch. + +Here's our C<File> class once more: + + package File; + use Object::Tiny qw( path content last_mod_time ); + + sub print_info { + my $self = shift; + + print "This file is at ", $self->path, "\n"; + } + +That's it! + +With C<Object::Tiny>, all accessors are read-only. It generates a +constructor for you, as well as the accessors you define. + +=head2 Role::Tiny + +As we mentioned before, roles provide an alternative to inheritance, +but Perl does not have any built-in role support. If you choose to use +Moose, it comes with a full-fledged role implementation. However, if +you use one of our other recommended OO modules, you can still use +roles with L<Role::Tiny>. + +C<Role::Tiny> provides some of the same features as Moose's role +system, but in a much smaller package. Most notably, it doesn't support +any sort of attribute declaration, so you have to do that by hand. +Still, it's useful, and works well with C<Class::Accessor> and +C<Object::Tiny>. + +=head2 OO System Summary + +Here's a brief recap of the options we covered: + +=over 4 + +=item * L<Moose> + +C<Moose> is the maximal option. It has a lot of features, a big +ecosystem, and a thriving user base. We also covered L<Mouse> briefly. +C<Mouse> is C<Moose> lite, and a reasonable alternative when Moose +doesn't work for your application.
+ +=item * L<Class::Accessor> + +C<Class::Accessor> does a lot less than C<Moose>, and is a nice +alternative if you find C<Moose> overwhelming. It's been around a long +time and is well battle-tested. It also has a minimal C<Moose> +compatibility mode which makes moving from C<Class::Accessor> to +C<Moose> easy. + +=item * L<Object::Tiny> + +C<Object::Tiny> is the absolute minimal option. It has no dependencies, +and almost no syntax to learn. It's a good option for a super minimal +environment and for throwing something together quickly without having +to worry about details. + +=item * L<Role::Tiny> + +Use C<Role::Tiny> with C<Class::Accessor> or C<Object::Tiny> if you +find yourself considering multiple inheritance. If you go with +C<Moose>, it comes with its own role implementation. + +=back + +=head2 Other OO Systems + +There are literally dozens of other OO-related modules on CPAN besides +those covered here, and you're likely to run across one or more of them +if you work with other people's code. + +In addition, plenty of code in the wild does all of its OO "by hand", +using just the Perl built-in OO features. If you need to maintain such +code, you should read L<perlobj> to understand exactly how Perl's +built-in OO works. + +=head1 CONCLUSION + +As we said before, Perl's minimal OO system has led to a profusion of +OO systems on CPAN. While you can still drop down to the bare metal and +write your classes by hand, there's really no reason to do that with +modern Perl. + +For small systems, L<Object::Tiny> and L<Class::Accessor> both provide +minimal object systems that take care of basic boilerplate for you. + +For bigger projects, L<Moose> provides a rich set of features that will +let you focus on implementing your business logic. + +We encourage you to play with and evaluate L<Moose>, +L<Class::Accessor>, and L<Object::Tiny> to see which OO system is right +for you. 
+ +=cut diff --git a/gnu/usr.bin/perl/pod/perlopentut.pod b/gnu/usr.bin/perl/pod/perlopentut.pod index ea4b307b459..4bb43bffd76 100644 --- a/gnu/usr.bin/perl/pod/perlopentut.pod +++ b/gnu/usr.bin/perl/pod/perlopentut.pod @@ -55,7 +55,7 @@ If you prefer the low-punctuation version, you could write that this way: open RESULTS,"> runstats" or die "can't open runstats: $!"; open LOG, ">> logfile " or die "can't open logfile: $!"; -A few things to notice. First, the leading less-than is optional. +A few things to notice. First, the leading C<< < >> is optional. If omitted, Perl assumes that you want to open the file for reading. Note also that the first example uses the C<||> logical operator, and the @@ -117,13 +117,30 @@ like C<my $infile>, there's no clash and no need to worry about future conflicts. Another convenient behavior is that an indirect filehandle automatically -closes when it goes out of scope or when you undefine it: +closes when there are no more references to it: sub firstline { open( my $in, shift ) && return scalar <$in>; # no close() required } +Indirect filehandles also make it easy to pass filehandles to and return +filehandles from subroutines: + + for my $file ( qw(this.conf that.conf) ) { + my $fin = open_or_throw('<', $file); + process_conf( $fin ); + # no close() needed + } + + use Carp; + sub open_or_throw { + my ($mode, $filename) = @_; + open my $h, $mode, $filename + or croak "Could not open '$filename': $!"; + return $h; + } + =head2 Pipe Opens In C, when you want to open a file using the standard I/O library, diff --git a/gnu/usr.bin/perl/pod/perlpacktut.pod b/gnu/usr.bin/perl/pod/perlpacktut.pod index 7d2126a0eaa..2ce56622b75 100644 --- a/gnu/usr.bin/perl/pod/perlpacktut.pod +++ b/gnu/usr.bin/perl/pod/perlpacktut.pod @@ -73,14 +73,13 @@ remains. The inverse operation - packing byte contents from a string of hexadecimal digits - is just as easily written. 
For instance: - my $s = pack( 'H2' x 10, map { "3$_" } ( 0..9 ) ); + my $s = pack( 'H2' x 10, 30..39 ); print "$s\n"; Since we feed a list of ten 2-digit hexadecimal strings to C<pack>, the pack template should contain ten pack codes. If this is run on a computer with ASCII character coding, it will print C<0123456789>. - =head1 Packing Text Let's suppose you've got to read in a data file like this: diff --git a/gnu/usr.bin/perl/pod/perlperf.pod b/gnu/usr.bin/perl/pod/perlperf.pod index a934271088b..007a02bc876 100644 --- a/gnu/usr.bin/perl/pod/perlperf.pod +++ b/gnu/usr.bin/perl/pod/perlperf.pod @@ -30,7 +30,7 @@ optimization process. Firstly, you need to establish a baseline time for the existing code, which timing needs to be reliable and repeatable. You'll probably want to use the -C<Benchmark> or C<Devel::DProf> modules, or something similar, for this step, +C<Benchmark> or C<Devel::NYTProf> modules, or something similar, for this step, or perhaps the Unix system C<time> utility, whichever is appropriate. See the base of this document for a longer list of benchmarking and profiling modules, and recommended further reading. @@ -567,7 +567,7 @@ to execute, C<if ( $debug ) { > and C<my $message = shift;>, for example. The differences in the actual times recorded might be in the algorithm used internally, or it could be due to system resource limitations or contention. -See also the L<DBIx::Profiler> which will profile database queries running +See also the L<DBIx::Profile> which will profile database queries running under the C<DBIx::*> namespace. =head2 Devel::NYTProf @@ -597,7 +597,7 @@ the code. C<NYTProf> will generate a report database into the file F<nytprof.out> by default. Human readable reports can be generated from here by using the supplied C<nytprofhtml> (HTML output) and C<nytprofcsv> (CSV output) programs. 
-We've used the Unix sytem C<html2text> utility to convert the +We've used the Unix system C<html2text> utility to convert the F<nytprof/index.html> file for convenience here. $> html2text nytprof/index.html @@ -1137,7 +1137,7 @@ deserve further attention. Apache::DProf Apache::SmallProf Benchmark - DBIx::Profiler + DBIx::Profile Devel::AutoProfiler Devel::DProf Devel::DProfLB diff --git a/gnu/usr.bin/perl/pod/perlpodspec.pod b/gnu/usr.bin/perl/pod/perlpodspec.pod index 0bf84e09104..89fd9ba6f81 100644 --- a/gnu/usr.bin/perl/pod/perlpodspec.pod +++ b/gnu/usr.bin/perl/pod/perlpodspec.pod @@ -1,3 +1,4 @@ +=encoding utf8 =head1 NAME @@ -304,7 +305,7 @@ or data paragraphs. This is discussed in detail in the section L</About Data Paragraphs and "=beginE<sol>=end" Regions>. It is advised that formatnames match the regexp -C<m/\A:?[−a−zA−Z0−9_]+\z/>. Everything following whitespace after the +C<m/\A:?[-a-zA-Z0-9_]+\z/>. Everything following whitespace after the formatname is a parameter that may be used by the formatter when dealing with this region. This parameter must not be repeated in the "=end" paragraph. Implementors should anticipate future expansion in the @@ -1300,14 +1301,6 @@ browsers to decide. =item * -Authors wanting to link to a particular (absolute) URL, must do so -only with "LE<lt>scheme:...>" codes (like -LE<lt>http://www.perl.org>), and must not attempt "LE<lt>Some Site -Name|scheme:...>" codes. This restriction avoids many problems -in parsing and rendering LE<lt>...> codes. 
- -=item * - In a C<LE<lt>text|...E<gt>> code, text may contain formatting codes for formatting or for EE<lt>...> escapes, as in: diff --git a/gnu/usr.bin/perl/pod/perlpodstyle.pod b/gnu/usr.bin/perl/pod/perlpodstyle.pod new file mode 100644 index 00000000000..850f38dc8d9 --- /dev/null +++ b/gnu/usr.bin/perl/pod/perlpodstyle.pod @@ -0,0 +1,338 @@ +=head1 NAME + +perlpodstyle - Perl POD style guide + +=head1 DESCRIPTION + +These are general guidelines for how to write POD documentation for Perl +scripts and modules, based on general guidelines for writing good Unix man +pages. All of these guidelines are, of course, optional, but following +them will make your documentation more consistent with other documentation +on the system. + +Here are some simple guidelines for markup; see L<perlpod> for details. + +=over + +=item bold (BE<lt>E<gt>) + +B<NOTE: Use extremely rarely.> Do I<not> use bold for emphasis; that's +what italics are for. Restrict bold for notices like B<NOTE:> and +B<WARNING:>. However, program arguments and options--but I<not> their +names!--are written in bold (using BE<lt>E<gt>) to distinguish the B<-f> +command-line option from the C<-f> filetest operator. + +=item italic (IE<lt>E<gt>) + +Use italic to emphasize text, like I<this>. Function names are +traditionally written in italics; if you write a function as function(), +Pod::Man will take care of this for you. Names of programs, including the +name of the program being documented, are conventionally written in italics +(using IE<lt>E<gt>) wherever they occur in normal roman text. + +=item code (CE<lt>E<gt>) + +Literal code should be in CE<lt>E<gt>. However metasyntactic placeholders +should furthermore be nested in "italics" (actually, oblique) like +CE<lt>IE<lt>E<gt>E<gt>. That way +CE<lt>accept(IE<lt>NEWSOCKETE<gt>, E<lt>GENERICSOCKETE<gt>)E<gt> +renders as C<accept(I<NEWSOCKET>, I<GENERICSOCKET>)>. 
+ +=item files (FE<lt>E<gt>) + +Filenames, whether absolute or relative, are specified with the FE<lt>E<gt> +markup. This will render as italics, but has other semantic connotations. + +=back + +References to other man pages should be in the form "manpage(section)" or +"C<LE<lt>manpage(section)E<gt>>", and Pod::Man will automatically format +those appropriately. Both will render as I<manpage>(section). The second +form, with LE<lt>E<gt>, is used to request that a POD formatter make a link +to the man page if possible. As an exception, one normally omits the +section when referring to module documentation because not all systems +place it in section 3, although that is the default. You may use +C<LE<lt>Module::NameE<gt>> for module references instead, but this is +optional because the translators are supposed to recognize module +references in pod, just as they do variable references like $foo and such. + +References to other programs or functions are normally in the form of man +page references so that cross-referencing tools can provide the user with +links and the like. It's possible to overdo this, though, so be careful not +to clutter your documentation with too much markup. References to other +programs that are not given as man page references should be enclosed in +italics via IE<lt>E<gt>. + +Major headers should be set out using a C<=head1> directive, and are +historically written in the rather startling ALL UPPER CASE format; this is +not mandatory, but it's strongly recommended so that sections have +consistent naming across different software packages. The translators are +supposed to translate all caps into small caps. Minor headers may be +included using C<=head2>, and are typically in mixed case. 
+ +The standard sections of a manual page are: + +=over 4 + +=item NAME + +Mandatory section; should be a comma-separated list of programs or +functions documented by this POD page, such as: + + foo, bar - programs to do something + +Manual page indexers are often extremely picky about the format of this +section, so don't put anything in it except this line. Every program or +function documented by this POD page should be listed, separated by a +comma and a space. For a Perl module, just give the module name. A +single dash, and only a single dash, should separate the list of programs +or functions from the description. Do not use any markup such as +CE<lt>E<gt> or IE<lt>E<gt> anywhere in this line. Functions should not be +qualified with C<()> or the like. The description should ideally fit on a +single line, even if a man program replaces the dash with a few tabs. + +=item SYNOPSIS + +A short usage summary for programs and functions. This section is +mandatory for section 3 pages. For Perl module documentation, it's +usually convenient to have the contents of this section be a verbatim +block showing some (brief) examples of typical ways the module is used. + +=item DESCRIPTION + +Extended description and discussion of the program or functions, or the +body of the documentation for man pages that document something else. If +particularly long, it's a good idea to break this up into subsections +C<=head2> directives like: + + =head2 Normal Usage + + =head2 Advanced Features + + =head2 Writing Configuration Files + +or whatever is appropriate for your documentation. + +For a module, this is generally where the documentation of the interfaces +provided by the module goes, usually in the form of a list with an +C<=item> for each interface. Depending on how many interfaces there are, +you may want to put that documentation in separate METHODS, FUNCTIONS, +CLASS METHODS, or INSTANCE METHODS sections instead and save the +DESCRIPTION section for an overview. 
+ +=item OPTIONS + +Detailed description of each of the command-line options taken by the +program. This should be separate from the description for the use of +parsers like L<Pod::Usage>. This is normally presented as a list, with +each option as a separate C<=item>. The specific option string should be +enclosed in BE<lt>E<gt>. Any values that the option takes should be +enclosed in IE<lt>E<gt>. For example, the section for the option +B<--section>=I<manext> would be introduced with: + + =item B<--section>=I<manext> + +Synonymous options (like both the short and long forms) are separated by a +comma and a space on the same C<=item> line, or optionally listed as their +own item with a reference to the canonical name. For example, since +B<--section> can also be written as B<-s>, the above would be: + + =item B<-s> I<manext>, B<--section>=I<manext> + +Writing the short option first is recommended because it's easier to read. +The long option is long enough to draw the eye to it anyway and the short +option can otherwise get lost in visual noise. + +=item RETURN VALUE + +What the program or function returns, if successful. This section can be +omitted for programs whose precise exit codes aren't important, provided +they return 0 on success and non-zero on failure as is standard. It +should always be present for functions. For modules, it may be useful to +summarize return values from the module interface here, or it may be more +useful to discuss return values separately in the documentation of each +function or method the module provides. + +=item ERRORS + +Exceptions, error return codes, exit statuses, and errno settings. +Typically used for function or module documentation; program documentation +uses DIAGNOSTICS instead. 
The general rule of thumb is that errors +printed to C<STDOUT> or C<STDERR> and intended for the end user are +documented in DIAGNOSTICS while errors passed internal to the calling +program and intended for other programmers are documented in ERRORS. When +documenting a function that sets errno, a full list of the possible errno +values should be given here. + +=item DIAGNOSTICS + +All possible messages the program can print out and what they mean. You +may wish to follow the same documentation style as the Perl documentation; +see perldiag(1) for more details (and look at the POD source as well). + +If applicable, please include details on what the user should do to +correct the error; documenting an error as indicating "the input buffer is +too small" without telling the user how to increase the size of the input +buffer (or at least telling them that it isn't possible) isn't very +useful. + +=item EXAMPLES + +Give some example uses of the program or function. Don't skimp; users +often find this the most useful part of the documentation. The examples +are generally given as verbatim paragraphs. + +Don't just present an example without explaining what it does. Adding a +short paragraph saying what the example will do can increase the value of +the example immensely. + +=item ENVIRONMENT + +Environment variables that the program cares about, normally presented as +a list using C<=over>, C<=item>, and C<=back>. For example: + + =over 6 + + =item HOME + + Used to determine the user's home directory. F<.foorc> in this + directory is read for configuration details, if it exists. + + =back + +Since environment variables are normally in all uppercase, no additional +special formatting is generally needed; they're glaring enough as it is. + +=item FILES + +All files used by the program or function, normally presented as a list, +and what it uses them for. File names should be enclosed in FE<lt>E<gt>.
+It's particularly important to document files that will be potentially +modified. + +=item CAVEATS + +Things to take special care with, sometimes called WARNINGS. + +=item BUGS + +Things that are broken or just don't work quite right. + +=item RESTRICTIONS + +Bugs you don't plan to fix. :-) + +=item NOTES + +Miscellaneous commentary. + +=item AUTHOR + +Who wrote it (use AUTHORS for multiple people). It's a good idea to +include your current email address (or some email address to which bug +reports should be sent) or some other contact information so that users +have a way of contacting you. Remember that program documentation tends +to roam the wild for far longer than you expect and pick a contact method +that's likely to last. + +=item HISTORY + +Programs derived from other sources sometimes have this. Some people keep +a modification log here, but that usually gets long and is normally better +maintained in a separate file. + +=item COPYRIGHT AND LICENSE + +For copyright + + Copyright YEAR(s) YOUR NAME(s) + +(No, (C) is not needed. No, "all rights reserved" is not needed.) + +For licensing the easiest way is to use the same licensing as Perl itself: + + This library is free software; you may redistribute it and/or modify + it under the same terms as Perl itself. + +This makes it easy for people to use your module with Perl. Note that +this licensing example is neither an endorsement nor a requirement; you are +of course free to choose any licensing. + +=item SEE ALSO + +Other man pages to check out, like man(1), man(7), makewhatis(8), or +catman(8). Normally a simple list of man pages separated by commas, or a +paragraph giving the name of a reference work. Man page references, if +they use the standard C<name(section)> form, don't have to be enclosed in +LE<lt>E<gt> (although it's recommended), but other things in this section +probably should be when appropriate. + +If the package has a mailing list, include a URL or subscription +instructions here.
+ +If the package has a web site, include a URL here. + +=back + +Documentation of object-oriented libraries or modules may want to use +CONSTRUCTORS and METHODS sections, or CLASS METHODS and INSTANCE METHODS +sections, for detailed documentation of the parts of the library and save +the DESCRIPTION section for an overview. Large modules with a function +interface may want to use FUNCTIONS for similar reasons. Some people use +OVERVIEW to summarize the description if it's quite long. + +Section ordering varies, although NAME must always be the first section +(you'll break some man page systems otherwise), and NAME, SYNOPSIS, +DESCRIPTION, and OPTIONS generally always occur first and in that order if +present. In general, SEE ALSO, AUTHOR, and similar material should be +left for last. Some systems also move WARNINGS and NOTES to last. The +order given above should be reasonable for most purposes. + +Some systems use CONFORMING TO to note conformance to relevant standards +and MT-LEVEL to note safeness for use in threaded programs or signal +handlers. These headings are primarily useful when documenting parts of a +C library. + +Finally, as a general note, try not to use an excessive amount of markup. +As documented here and in L<Pod::Man>, you can safely leave Perl variables, +module names, function names, man page references, and the like unadorned +by markup, and the POD translators will figure it all out for you. This +makes it much easier to later edit the documentation. Note that many +existing translators will do the wrong thing with email addresses when +wrapped in LE<lt>E<gt>, so don't do that. + +You can check whether your documentation looks right by running + + % pod2text -o something.pod | less + +If you have I<groff> installed, you can get an even better look this way: + + % pod2man something.pod | groff -Tps -mandoc > something.ps + +Now view the resulting Postscript file to see whether everything checks out. 
+ +=head1 SEE ALSO + +For additional information that may be more accurate for your specific +system, see either L<man(5)> or L<man(7)> depending on your system manual +section numbering conventions. + +This documentation is maintained as part of the podlators distribution. +The current version is always available from its web site at +<http://www.eyrie.org/~eagle/software/podlators/>. + +=head1 AUTHOR + +Russ Allbery <rra@stanford.edu>, with large portions of this documentation +taken from the documentation of the original B<pod2man> implementation by +Larry Wall and Tom Christiansen. + +=head1 COPYRIGHT AND LICENSE + +Copyright 1999, 2000, 2001, 2004, 2006, 2008, 2010 Russ Allbery +<rra@stanford.edu>. + +This documentation is free software; you may redistribute it and/or modify +it under the same terms as Perl itself. + +=cut diff --git a/gnu/usr.bin/perl/pod/perlpolicy.pod b/gnu/usr.bin/perl/pod/perlpolicy.pod index d9a064271fb..7e713b4920a 100644 --- a/gnu/usr.bin/perl/pod/perlpolicy.pod +++ b/gnu/usr.bin/perl/pod/perlpolicy.pod @@ -1,6 +1,6 @@ =head1 NAME -perlpolicy - Various and sundry policies and commitments related to the perl core +perlpolicy - Various and sundry policies and commitments related to the Perl core =head1 DESCRIPTION @@ -8,6 +8,121 @@ This document is the master document which records all written policies about how the Perl 5 Porters collectively develop and maintain the Perl core. +=head1 GOVERNANCE + +=head2 Perl 5 Porters + +Subscribers to perl5-porters (the porters themselves) come in several flavours. +Some are quiet curious lurkers, who rarely pitch in and instead watch +the ongoing development to ensure they're forewarned of new changes or +features in Perl. Some are representatives of vendors, who are there +to make sure that Perl continues to compile and work on their +platforms. 
Some patch any reported bug that they know how to fix, +some are actively patching their pet area (threads, Win32, the regexp +engine), while others seem to do nothing but complain. In other +words, it's your usual mix of technical people. + +Over this group of porters presides Larry Wall. He has the final word +in what does and does not change in any of the Perl programming languages. +These days, Larry spends most of his time on Perl 6, while Perl 5 is +shepherded by a "pumpking", a porter responsible for deciding what +goes into each release and ensuring that releases happen on a regular +basis. + +Larry sees Perl development along the lines of the US government: +there's the Legislature (the porters), the Executive branch (the +pumpking), and the Supreme Court (Larry). The legislature can +discuss and submit patches to the executive branch all they like, but +the executive branch is free to veto them. Rarely, the Supreme Court +will side with the executive branch over the legislature, or the +legislature over the executive branch. Mostly, however, the +legislature and the executive branch are supposed to get along and +work out their differences without impeachment or court cases. + +You might sometimes see reference to Rule 1 and Rule 2. Larry's power +as Supreme Court is expressed in The Rules: + +=over 4 + +=item 1 + +Larry is always by definition right about how Perl should behave. +This means he has final veto power on the core functionality. + +=item 2 + +Larry is allowed to change his mind about any matter at a later date, +regardless of whether he previously invoked Rule 1. + +=back + +Got that? Larry is always right, even when he was wrong. It's rare +to see either Rule exercised, but they are often alluded to. + +=head1 MAINTENANCE AND SUPPORT + +Perl 5 is developed by a community, not a corporate entity. Every change +contributed to the Perl core is the result of a donation.
Typically, these +donations are contributions of code or time by individual members of our +community. On occasion, these donations come in the form of corporate +or organizational sponsorship of a particular individual or project. + +As a volunteer organization, the commitments we make are heavily dependent +on the goodwill and hard work of individuals who have no obligation to +contribute to Perl. + +That being said, we value Perl's stability and security and have long +had an unwritten covenant with the broader Perl community to support +and maintain releases of Perl. + +This document codifies the support and maintenance commitments that +the Perl community should expect from Perl's developers: + +=over + +=item * + +We "officially" support the two most recent stable release series. 5.12.x +and earlier are now out of support. As of the release of 5.18.0, we will +"officially" end support for Perl 5.14.x, other than providing security +updates as described below. + +=item * + +To the best of our ability, we will attempt to fix critical issues +in the two most recent stable 5.x release series. Fixes for the +current release series take precedence over fixes for the previous +release series. + +=item * + +To the best of our ability, we will provide "critical" security patches +/ releases for any major version of Perl whose 5.x.0 release was within +the past three years. We can only commit to providing these for the +most recent .y release in any 5.x.y series. + +=item * + +We will not provide security updates or bug fixes for development +releases of Perl. + +=item * + +We encourage vendors to ship the most recent supported release of +Perl at the time of their code freeze. + +=item * + +As a vendor, you may have a requirement to backport security fixes +beyond our 3 year support commitment. 
We can provide limited support and +advice to you as you do so and, where possible will try to apply +those patches to the relevant -maint branches in git, though we may or +may not choose to make numbered releases or "official" patches +available. Contact us at E<lt>perl5-security-report@perl.orgE<gt> +to begin that process. + +=back + =head1 BACKWARD COMPATIBILITY AND DEPRECATION Our community has a long-held belief that backward-compatibility is a @@ -69,7 +184,7 @@ bug as a feature, we need to treat it as such. New syntax and semantics which don't break existing language constructs and syntax have a much lower bar. They merely need to prove themselves -to be useful, elegant, well designed and well tested. +to be useful, elegant, well designed, and well tested. =head2 Terminology @@ -90,7 +205,7 @@ an experimental feature useful and want to help shape its future. =item deprecated If something in the Perl core is marked as B<deprecated>, we may remove it -from thecore in the next stable release series, though we may not. As of +from the core in the next stable release series, though we may not. As of Perl 5.12, deprecated features and modules warn the user as they're used. If you use a deprecated feature and believe that its removal from the Perl core would be a mistake, please contact the perl5-porters mailinglist and @@ -104,12 +219,12 @@ From time to time, we may mark language constructs and features which we consider to have been mistakes as B<discouraged>. Discouraged features aren't candidates for removal in the next major release series, but we may later deprecate them if they're found to stand in the way of a -significant improvement to the core. +significant improvement to the Perl core. =item removed Once a feature, construct or module has been marked as deprecated for a -stable release cycle, we may remove it from the core. Unsurprisingly, +stable release cycle, we may remove it from the Perl core. Unsurprisingly, we say we've B<removed> these things. 
=back @@ -134,7 +249,9 @@ acceptable. =item * -Documentation updates are acceptable. +Acceptable documentation updates are those that correct factual errors, +explain significant bugs or deficiencies in the current implementation, +or fix broken markup. =item * @@ -154,11 +271,23 @@ rather than applied directly. =item * +Patches that fix regressions in perl's behavior relative to previous +releases are acceptable. + +=item * + Updates to dual-life modules should consist of minimal patches to fix crashing or security issues (as above). =item * +Minimal patches that fix platform-specific test failures or +installation issues are acceptable. When these changes are made +to dual-life modules for which CPAN is canonical, any changes +should be coordinated with the upstream author. + +=item * + New versions of dual-life modules should NOT be imported into maint. Those belong in the next stable series. @@ -213,7 +342,7 @@ the heart of Perl itself, is a joint project on the part of all of us. From time to time, a script, module, or set of modules (hereafter referred to simply as a "module") will prove so widely useful and/or so integral to the correct functioning of Perl itself that it should be distributed with -Perl core. This should never be done without the author's explicit +the Perl core. This should never be done without the author's explicit consent, and a clear recognition on all parts that this means the module is being distributed under the same terms as Perl itself. A module author should realize that inclusion of a module into the Perl core will @@ -230,7 +359,7 @@ gives up their ownership of it. In particular: =item * -The version of the module in the core should still be considered the +The version of the module in the Perl core should still be considered the work of the original author. All patches, bug reports, and so forth should be fed back to them. Their development directions should be respected whenever possible. 
@@ -269,11 +398,11 @@ As a last resort, however: If the author's vision of the future of their module is sufficiently different from the vision of the pumpkin holder and perl5-porters as a whole so as to cause serious problems for Perl, the pumpkin holder may -choose to formally fork the version of the module in the core from the +choose to formally fork the version of the module in the Perl core from the one maintained by the author. This should not be done lightly and should B<always> if at all possible be done only after direct input from Larry. If this is done, it must then be made explicit in the -module as distributed with Perl core that it is a forked version and +module as distributed with the Perl core that it is a forked version and that while it is based on the original author's work, it is no longer maintained by them. This must be noted in both the documentation and in the comments in the source of the module. @@ -306,7 +435,45 @@ necessary, and certainly no more drastic measure should be used until every avenue of communication and discussion has failed. +=head1 DOCUMENTATION + +Perl's documentation is an important resource for our users. It's +incredibly important for Perl's documentation to be reasonably coherent +and to accurately reflect the current implementation. + +Just as P5P collectively maintains the codebase, we collectively +maintain the documentation. Writing a particular bit of documentation +doesn't give an author control of the future of that documentation. +At the same time, just as source code changes should match the style +of their surrounding blocks, so should documentation changes. + +Examples in documentation should be illustrative of the concept +they're explaining. Sometimes, the best way to show how a +language feature works is with a small program the reader can +run without modification. More often, examples will consist +of a snippet of code containing only the "important" bits. 
+The definition of "important" varies from snippet to snippet. +Sometimes it's important to declare C<use strict> and C<use warnings>, +initialize all variables and fully catch every error condition. +More often than not, though, those things obscure the lesson +the example was intended to teach. + +As Perl is developed by a global team of volunteers, our +documentation often contains spellings which look funny +to I<somebody>. Choice of American/British/Other spellings +is left as an exercise for the author of each bit of +documentation. When patching documentation, try to emulate +the documentation around you, rather than changing the existing +prose. + +In general, documentation should describe what Perl does "now" rather +than what it used to do. It's perfectly reasonable to include notes +in documentation about how behaviour has changed from previous releases, +but, with very few exceptions, documentation isn't "dual-life" -- +it doesn't need to fully describe how all old versions used to work. + + =head1 CREDITS -Social Contract about Contributed Modules originally by Russ Allbery E<lt>rra@stanford.eduE<gt> and the perl5-porters. +"Social Contract about Contributed Modules" originally by Russ Allbery E<lt>rra@stanford.eduE<gt> and the perl5-porters. diff --git a/gnu/usr.bin/perl/pod/perlport.pod b/gnu/usr.bin/perl/pod/perlport.pod index 046afee7fe3..867b66e2915 100644 --- a/gnu/usr.bin/perl/pod/perlport.pod +++ b/gnu/usr.bin/perl/pod/perlport.pod @@ -480,17 +480,17 @@ file name. 
To convert $^X to a file pathname, taking account of the requirements of the various operating system possibilities, say: - use Config; - my $thisperl = $^X; - if ($^O ne 'VMS') - {$thisperl .= $Config{_exe} unless $thisperl =~ m/$Config{_exe}$/i;} + use Config; + my $thisperl = $^X; + if ($^O ne 'VMS') + {$thisperl .= $Config{_exe} unless $thisperl =~ m/$Config{_exe}$/i;} To convert $Config{perlpath} to a file pathname, say: - use Config; - my $thisperl = $Config{perlpath}; - if ($^O ne 'VMS') - {$thisperl .= $Config{_exe} unless $thisperl =~ m/$Config{_exe}$/i;} + use Config; + my $thisperl = $Config{perlpath}; + if ($^O ne 'VMS') + {$thisperl .= $Config{_exe} unless $thisperl =~ m/$Config{_exe}$/i;} =head2 Networking @@ -622,9 +622,9 @@ format. Don't assume that the epoch starts at 00:00:00, January 1, 1970, because that is OS- and implementation-specific. It is better to store a date in an unambiguous representation. The ISO 8601 standard -defines YYYY-MM-DD as the date format, or YYYY-MM-DDTHH-MM-SS +defines YYYY-MM-DD as the date format, or YYYY-MM-DDTHH:MM:SS (that's a literal "T" separating the date from the time). -Please do use the ISO 8601 instead of making us to guess what +Please do use the ISO 8601 instead of making us guess what date 02/03/04 might be. ISO 8601 even sorts nicely as-is. A text representation (like "1987-12-18") can be easily converted into an OS-specific value using a module like Date::Parse. @@ -689,10 +689,6 @@ If your code is destined for systems with severely constrained (or missing!) 
virtual memory systems then you want to be I<especially> mindful of avoiding wasteful constructs such as: - # NOTE: this is no longer "bad" in perl5.005 - for (0..10000000) {} # bad - for (my $x = 0; $x <= 10000000; ++$x) {} # good - my @lines = <$very_large_file>; # bad while (<$fh>) {$file .= $_} # sometimes bad @@ -771,11 +767,11 @@ Also see: =item * -Mailing list: cpan-testers@perl.org +Mailing list: cpan-testers-discuss@perl.org =item * -Testing results: http://testers.cpan.org/ +Testing results: L<http://www.cpantesters.org/> =back @@ -904,6 +900,8 @@ DOSish perls are as follows: Windows Vista MSWin32 MSWin32-x86 2 6 00 Windows 7 MSWin32 MSWin32-x86 2 6 01 Windows 7 MSWin32 MSWin32-x64 2 6 01 + Windows 2008 MSWin32 MSWin32-x86 2 6 01 + Windows 2008 MSWin32 MSWin32-x64 2 6 01 Windows CE MSWin32 ? 3 Cygwin cygwin cygwin @@ -929,13 +927,13 @@ Also see: =item * -The djgpp environment for DOS, http://www.delorie.com/djgpp/ +The djgpp environment for DOS, L<http://www.delorie.com/djgpp/> and L<perldos>. =item * The EMX environment for DOS, OS/2, etc. emx@iaehv.nl, -ftp://hobbes.nmsu.edu/pub/os2/dev/emx/ Also L<perlos2>. +L<ftp://hobbes.nmsu.edu/pub/os2/dev/emx/> Also L<perlos2>. =item * @@ -948,17 +946,17 @@ The C<Win32::*> modules in L<Win32>. 
=item * -The ActiveState Pages, http://www.activestate.com/ +The ActiveState Pages, L<http://www.activestate.com/> =item * The Cygwin environment for Win32; F<README.cygwin> (installed -as L<perlcygwin>), http://www.cygwin.com/ +as L<perlcygwin>), L<http://www.cygwin.com/> =item * The U/WIN environment for Win32, -http://www.research.att.com/sw/tools/uwin/ +L<http://www.research.att.com/sw/tools/uwin/> =item * @@ -1092,7 +1090,7 @@ Pumpkings and module integrators can easily see whether files with too many directory levels have snuck into the core by running the following in the top-level source directory: - $ perl -ne "$_=~s/\s+.*//; print if scalar(split /\//) > 8;" < MANIFEST + $ perl -ne "$_=~s/\s+.*//; print if scalar(split /\//) > 8;" < MANIFEST The VMS::Filespec module, which gets installed as part of the build @@ -1145,7 +1143,7 @@ Also see: =item * -F<README.vms> (installed as L<README_vms>), L<perlvms> +F<README.vms> (installed as F<README_vms>), L<perlvms> =item * @@ -1153,15 +1151,16 @@ vmsperl list, vmsperl-subscribe@perl.org =item * -vmsperl on the web, http://www.sidhe.org/vmsperl/index.html +vmsperl on the web, L<http://www.sidhe.org/vmsperl/index.html> =back =head2 VOS -Perl on VOS is discussed in F<README.vos> in the perl distribution -(installed as L<perlvos>). Perl on VOS can accept either VOS- or -Unix-style file specifications as in either of the following: +Perl on VOS (also known as OpenVOS) is discussed in F<README.vos> +in the perl distribution (installed as L<perlvos>). 
Perl on VOS +can accept either VOS- or Unix-style file specifications as in +either of the following: $ perl -ne "print if /perl_setup/i" >system>notices $ perl -ne "print if /perl_setup/i" /system/notices @@ -1172,15 +1171,35 @@ or even a mixture of both as in: Even though VOS allows the slash character to appear in object names, because the VOS port of Perl interprets it as a pathname -delimiting character, VOS files, directories, or links whose names -contain a slash character cannot be processed. Such files must be -renamed before they can be processed by Perl. Note that VOS limits -file names to 32 or fewer characters, file names cannot start with a -C<-> character, or contain any character matching C<< tr/ !%&'()*+;<>?// >> - -The value of C<$^O> on VOS is "VOS". To determine the architecture that -you are running on without resorting to loading all of C<%Config> you -can examine the content of the @INC array like so: +delimiting character, VOS files, directories, or links whose +names contain a slash character cannot be processed. Such files +must be renamed before they can be processed by Perl. + +Older releases of VOS (prior to OpenVOS Release 17.0) limit file +names to 32 or fewer characters, prohibit file names from +starting with a C<-> character, and prohibit file names from +containing any character matching C<< tr/ !#%&'()*;<=>?// >>. + +Newer releases of VOS (OpenVOS Release 17.0 or later) support a +feature known as extended names. On these releases, file names +can contain up to 255 characters, are prohibited from starting +with a C<-> character, and the set of prohibited characters is +reduced to any character matching C<< tr/#%*<>?// >>. There are +restrictions involving spaces and apostrophes: these characters +must not begin or end a name, nor can they immediately precede or +follow a period. Additionally, a space must not immediately +precede another space or hyphen. 
Specifically, the following +character combinations are prohibited: space-space, +space-hyphen, period-space, space-period, period-apostrophe, +apostrophe-period, leading or trailing space, and leading or +trailing apostrophe. Although an extended file name is limited +to 255 characters, a path name is still limited to 256 +characters. + +The value of C<$^O> on VOS is "VOS". To determine the +architecture that you are running on without resorting to loading +all of C<%Config> you can examine the content of the @INC array +like so: if ($^O =~ /VOS/) { print "I'm on a Stratus box!\n"; @@ -1202,13 +1221,13 @@ F<README.vos> (installed as L<perlvos>) The VOS mailing list. There is no specific mailing list for Perl on VOS. You can post -comments to the comp.sys.stratus newsgroup, or subscribe to the general -Stratus mailing list. Send a letter with "subscribe Info-Stratus" in -the message body to majordomo@list.stratagy.com. +comments to the comp.sys.stratus newsgroup, or use the contact +information located in the distribution files on the Stratus +Anonymous FTP site. =item * -VOS Perl on the web at http://ftp.stratus.com/pub/vos/posix/posix.html +VOS Perl on the web at L<http://ftp.stratus.com/pub/vos/posix/posix.html> =back @@ -1277,7 +1296,7 @@ The values of C<$^O> on some of these platforms includes: Some simple tricks for determining if you are running on an EBCDIC platform could include any of the following (perhaps all): - if ("\t" eq "\05") { print "EBCDIC may be spoken here!\n"; } + if ("\t" eq "\005") { print "EBCDIC may be spoken here!\n"; } if (ord('A') == 193) { print "EBCDIC may be spoken here!\n"; } @@ -1306,7 +1325,7 @@ general usage issues for all EBCDIC Perls. Send a message body of =item * AS/400 Perl information at -http://as400.rochester.ibm.com/ +L<http://as400.rochester.ibm.com/> as well as on CPAN in the F<ports/> directory. =back @@ -1373,8 +1392,8 @@ subdirectories named after the suffix. 
Hence files are translated: The Unix emulation library's translation of filenames to native assumes that this sort of translation is required, and it allows a user-defined list of known suffixes that it will transpose in this fashion. This may -seem transparent, but consider that with these rules C<foo/bar/baz.h> -and C<foo/bar/h/baz> both map to C<foo.bar.h.baz>, and that C<readdir> and +seem transparent, but consider that with these rules F<foo/bar/baz.h> +and F<foo/bar/h/baz> both map to F<foo.bar.h.baz>, and that C<readdir> and C<glob> cannot and do not attempt to emulate the reverse mapping. Other C<.>'s in filenames are translated to C</>. @@ -1449,12 +1468,12 @@ Be OS, F<README.beos> =item * HP 300 MPE/iX, F<README.mpeix> and Mark Bixby's web page -http://www.bixby.org/mark/porting.html +L<http://www.bixby.org/mark/porting.html> =item * A free perl5-based PERL.NLM for Novell Netware is available in -precompiled binary and source code form from http://www.novell.com/ +precompiled binary and source code form from L<http://www.novell.com/> as well as from CPAN. =item * @@ -1597,6 +1616,8 @@ Implemented via Spawn. (VM/ESA) Does not automatically flush output handles on some platforms. (SunOS, Solaris, HP-UX) +Not supported. (Symbian OS) + =item exit Emulates Unix exit() (which considers C<exit 1> to indicate an error) by @@ -1611,17 +1632,23 @@ enabled, a generic number will be encoded in a method compatible with the C library _POSIX_EXIT macro so that it can be decoded by other programs, particularly ones written in C, like the GNV package. (VMS) +C<exit()> resets file pointers, which is a problem when called +from a child process (created by C<fork()>) in C<BEGIN>. +A workaround is to use C<POSIX::_exit>. (Solaris) + + exit unless $Config{archname} =~ /\bsolaris\b/; + require POSIX and POSIX::_exit(0); + =item fcntl Not implemented. (Win32) + Some functions available based on the version of VMS. (VMS) =item flock Not implemented (VMS, S<RISC OS>, VOS). 
-Available only on Windows NT (not on Windows 95). (Win32) - =item fork Not implemented. (AmigaOS, S<RISC OS>, VM/ESA, VMS) @@ -1764,6 +1791,8 @@ because work arounds in the implementation use floating point numbers, it will become inaccurate as the time gets larger. This is a bug and will be fixed in the future. +On VOS, time values are 32-bit quantities. + =item ioctl FILEHANDLE,FUNCTION,SCALAR Not implemented. (VMS) @@ -1795,7 +1824,7 @@ numbers. (VMS) =item link -Not implemented. (MPE/iX, S<RISC OS>) +Not implemented. (MPE/iX, S<RISC OS>, VOS) Link count not updated because hard links are not quite that hard (They are sort of half-way between hard and soft links). (AmigaOS) @@ -1810,7 +1839,7 @@ Available on 64 bit OpenVMS 8.2 and later. (VMS) =item localtime -localtime() has the same range as L<gmtime>, but because time zone +localtime() has the same range as L</gmtime>, but because time zone rules change its accuracy for historical and future times may degrade but usually by no more than an hour. @@ -1845,6 +1874,12 @@ Not implemented. (Win32, VMS, S<RISC OS>) Can't move directories between directories on different logical volumes. (Win32) +=item rewinddir + +Will not cause readdir() to re-read the directory stream. The entries +already read before the rewinddir() call will just be returned again +from a cache buffer. (Win32) + =item select Only implemented on sockets. (Win32, VMS) @@ -1859,11 +1894,11 @@ Note that the C<select FILEHANDLE> form is generally portable. =item semop -Not implemented. ( Win32, VMS, S<RISC OS>, VOS) +Not implemented. (Win32, VMS, S<RISC OS>) =item setgrent -Not implemented. (MPE/iX, VMS, Win32, S<RISC OS>, VOS) +Not implemented. (MPE/iX, VMS, Win32, S<RISC OS>) =item setpgrp @@ -1875,7 +1910,7 @@ Not implemented. (Win32, VMS, S<RISC OS>, VOS) =item setpwent -Not implemented. (MPE/iX, Win32, S<RISC OS>, VOS) +Not implemented. (MPE/iX, Win32, S<RISC OS>) =item setsockopt @@ -1898,7 +1933,9 @@ be implemented even in Unix platforms. 
=item socketpair -Not implemented. (S<RISC OS>, VOS, VM/ESA) +Not implemented. (S<RISC OS>, VM/ESA) + +Available on OpenVOS Release 17.0 or later. (VOS) Available on 64 bit OpenVMS 8.2 and later. (VMS) @@ -2032,7 +2069,7 @@ Not useful. (S<RISC OS>) The following platforms are known to build Perl 5.12 (as of April 2010, its release date) from the standard source code distribution available -at http://www.cpan.org/src +at L<http://www.cpan.org/src> =over @@ -2080,6 +2117,8 @@ at http://www.cpan.org/src =item FreeBSD +=item Debian GNU/kFreeBSD + =item Haiku =item Irix (6.5. What else?) @@ -2088,6 +2127,8 @@ at http://www.cpan.org/src =item Dragonfly BSD +=item QNX Neutrino RTOS (6.5.0) + =item MirOS BSD Caveats: @@ -2098,16 +2139,15 @@ Caveats: =back - =item Symbian (Series 60 v3, 3.2 and 5 - what else?) -=item Stratus VOS +=item Stratus VOS / OpenVOS =item AIX =back -=head1 EOL Platforms (Perl 5.12) +=head1 EOL Platforms (Perl 5.14) The following platforms were supported by a previous version of Perl but have been officially removed from Perl's source code @@ -2125,9 +2165,8 @@ as of 5.12: =back -The following platforms may still work as of Perl 5.12, but Perl's -developers have made an explicit decision to discontinue support for -them: +The following platforms were supported up to 5.10. They may still +have worked in 5.12, but supporting code has been removed for 5.14: =over @@ -2145,7 +2184,7 @@ them: As of July 2002 (the Perl release 5.8.0), the following platforms were able to build Perl from the standard source code distribution -available at http://www.cpan.org/src/ +available at L<http://www.cpan.org/src/> AIX BeOS @@ -2250,7 +2289,7 @@ of any trouble. 
Unisys Dynix The following platforms have their own source code distributions and -binaries available via http://www.cpan.org/ports/ +binaries available via L<http://www.cpan.org/ports/> Perl release @@ -2258,7 +2297,7 @@ binaries available via http://www.cpan.org/ports/ Tandem Guardian 5.004 The following platforms have only binaries available via -http://www.cpan.org/ports/index.html : +L<http://www.cpan.org/ports/index.html> : Perl release @@ -2269,11 +2308,11 @@ http://www.cpan.org/ports/index.html : Although we do suggest that you always build your own Perl from the source code, both for maximal configurability and for security, in case you are in a hurry you can check -http://www.cpan.org/ports/index.html for binary distributions. +L<http://www.cpan.org/ports/index.html> for binary distributions. =head1 SEE ALSO -L<perlaix>, L<perlamiga>, L<perlapollo>, L<perlbeos>, L<perlbs2000>, +L<perlaix>, L<perlamiga>, L<perlbeos>, L<perlbs2000>, L<perlce>, L<perlcygwin>, L<perldgux>, L<perldos>, L<perlepoc>, L<perlebcdic>, L<perlfreebsd>, L<perlhurd>, L<perlhpux>, L<perlirix>, L<perlmacos>, L<perlmacosx>, L<perlmpeix>, @@ -2316,5 +2355,5 @@ Gurusamy Sarathy <gsar@activestate.com>, Paul J. Schinder <schinder@pobox.com>, Michael G Schwern <schwern@pobox.com>, Dan Sugalski <dan@sidhe.org>, -Nathan Torkington <gnat@frii.com>. 
+Nathan Torkington <gnat@frii.com>, John Malmberg <wb8tyw@qsl.net> diff --git a/gnu/usr.bin/perl/pod/perlpragma.pod b/gnu/usr.bin/perl/pod/perlpragma.pod index 856014438e5..604387d9f97 100644 --- a/gnu/usr.bin/perl/pod/perlpragma.pod +++ b/gnu/usr.bin/perl/pod/perlpragma.pod @@ -82,17 +82,17 @@ The interaction with the Perl compilation happens inside package C<myint>: use warnings; sub import { - $^H{myint} = 1; + $^H{"myint/in_effect"} = 1; } sub unimport { - $^H{myint} = 0; + $^H{"myint/in_effect"} = 0; } sub in_effect { my $level = shift // 0; my $hinthash = (caller($level))[10]; - return $hinthash->{myint}; + return $hinthash->{"myint/in_effect"}; } 1; @@ -122,10 +122,26 @@ at index 10 of the list of returned results. In the example pragma, retrieval is encapsulated into the routine C<in_effect()>, which takes as parameter the number of call frames to go up to find the value of the pragma in the user's script. This uses C<caller()> to determine the value of -C<$^H{myint}> when each line of the user's script was called, and +C<$^H{"myint/in_effect"}> when each line of the user's script was called, and therefore provide the correct semantics in the subroutine implementing the overloaded addition. +=head1 Key naming + +There is only a single C<%^H>, but arbitrarily many modules that want +to use its scoping semantics. To avoid stepping on each other's toes, +they need to be sure to use different keys in the hash. It is therefore +conventional for a module to use only keys that begin with the module's +name (the name of its main package) and a "/" character. After this +module-identifying prefix, the rest of the key is entirely up to the +module: it may include any characters whatsoever. For example, a module +C<Foo::Bar> should use keys such as C<Foo::Bar/baz> and C<Foo::Bar/$%/_!>. +Modules following this convention all play nicely with each other. + +The Perl core uses a handful of keys in C<%^H> which do not follow this +convention, because they predate it. 
Keys that follow the convention +won't conflict with the core's historical keys. + =head1 Implementation details The optree is shared between threads. This means there is a possibility that diff --git a/gnu/usr.bin/perl/pod/perlreapi.pod b/gnu/usr.bin/perl/pod/perlreapi.pod index d1d947b8a74..5e456208684 100644 --- a/gnu/usr.bin/perl/pod/perlreapi.pod +++ b/gnu/usr.bin/perl/pod/perlreapi.pod @@ -120,21 +120,28 @@ TODO: Document those cases. =item C</p> - RXf_PMf_KEEPCOPY +TODO: Document this + +=item Character set + +The character set semantics are determined by an enum that is contained +in this field. This is still experimental and subject to change, but +the current interface returns the rules by use of the in-line function +C<get_regex_charset(const U32 flags)>. The only currently documented +value returned from it is REGEX_LOCALE_CHARSET, which is set if +C<use locale> is in effect. If present in C<< rx->extflags >>, +C<split> will use the locale dependent definition of whitespace +when RXf_SKIPWHITE or RXf_WHITE is in effect. ASCII whitespace +is defined as per L<isSPACE|perlapi/isSPACE>, and by the internal +macros C<is_utf8_space> under UTF-8, and C<isSPACE_LC> under C<use +locale>. + =back Additional flags: =over 4 -=item RXf_PMf_LOCALE - -Set if C<use locale> is in effect. If present in C<< rx->extflags >> -C<split> will use the locale dependent definition of whitespace under -when RXf_SKIPWHITE or RXf_WHITE are in effect. Under ASCII whitespace -is defined as per L<isSPACE|perlapi/ISSPACE>, and by the internal -macros C<is_utf8_space> under UTF-8 and C<isSPACE_LC> under C<use -locale>. - =item RXf_UTF8 Set if the pattern is L<SvUTF8()|perlapi/SvUTF8>, set by Perl_pmruntime. @@ -243,7 +250,7 @@ perl will handle releasing anything else contained in the regexp structure. Called to get/set the value of C<$`>, C<$'>, C<$&> and their named equivalents, ${^PREMATCH}, ${^POSTMATCH} and $^{MATCH}, as well as the -numbered capture buffers (C<$1>, C<$2>, ...). 
+numbered capture groups (C<$1>, C<$2>, ...). The C<paren> parameter will be C<-2> for C<$`>, C<-1> for C<$'>, C<0> for C<$&>, C<1> for C<$1> and so forth. @@ -282,7 +289,7 @@ sure this is used as the new value (or reject it). Example: if ("ook" =~ /(o*)/) { - # `paren' will be `1' and `value' will be `ee' + # 'paren' will be '1' and 'value' will be 'ee' $1 =~ tr/o/e/; } @@ -317,7 +324,7 @@ behave in the same situation: package main; - tie my $sv => "CatptureVar"; + tie my $sv => "CaptureVar"; $sv =~ y/a/b/; Because C<$sv> is C<undef> when the C<y///> operator is applied to it @@ -492,7 +499,7 @@ values. in the final match, used for optimisations */ struct reg_substr_data *substrs; - U32 nparens; /* number of capture buffers */ + U32 nparens; /* number of capture groups */ /* private engine specific data */ U32 intflags; /* Engine Specific Internal flags */ @@ -579,7 +586,7 @@ Substring data about strings that must appear in the final match. This is currently only used internally by perl's engine but might be used in the future for all engines for optimisations. -=head2 C<nparens>, C<lasparen>, and C<lastcloseparen> +=head2 C<nparens>, C<lastparen>, and C<lastcloseparen> These fields are used to keep track of how many paren groups could be matched in the pattern, which was the last open paren to be entered, and which was @@ -612,7 +619,7 @@ C<regexp_paren_pair> struct is defined as follows: } regexp_paren_pair; If C<< ->offs[num].start >> or C<< ->offs[num].end >> is C<-1> then that -capture buffer did not match. C<< ->offs[0].start/end >> represents C<$&> (or +capture group did not match. C<< ->offs[0].start/end >> represents C<$&> (or C<${^MATCH}> under C<//p>) and C<< ->offs[paren].end >> matches C<$$paren> where C<$paren >= 1>. 
@@ -633,7 +640,7 @@ The relevant snippet from C<Perl_pp_regcomp>: =head2 C<paren_names> -This is a hash used internally to track named capture buffers and their +This is a hash used internally to track named capture groups and their offsets. The keys are the names of the buffers; the values are dualvars, with the IV slot holding the number of buffers with the given name and the pv being an embedded array of I32. The values may also be contained @@ -655,7 +662,7 @@ Used during execution phase for managing search and replace patterns. =head2 C<wrapped> C<wraplen> Stores the string C<qr//> stringifies to. The perl engine for example -stores C<(?-xism:eek)> in the case of C<qr/eek/>. +stores C<(?^:eek)> in the case of C<qr/eek/>. When using a custom engine that doesn't support the C<(?:)> construct for inline modifiers, it's probably best to have C<qr//> stringify to diff --git a/gnu/usr.bin/perl/pod/perlrebackslash.pod b/gnu/usr.bin/perl/pod/perlrebackslash.pod index 5ff26019265..f81af0c6dd7 100644 --- a/gnu/usr.bin/perl/pod/perlrebackslash.pod +++ b/gnu/usr.bin/perl/pod/perlrebackslash.pod @@ -16,7 +16,6 @@ Most sequences are described in detail in different documents; the primary purpose of this document is to have a quick reference guide describing all backslash and escape sequences. - =head2 The backslash In a regular expression, the backslash can perform one of two tasks: @@ -26,14 +25,14 @@ or it is the start of a backslash or escape sequence. The rules determining what it is are quite simple: if the character following the backslash is an ASCII punctuation (non-word) character (that is, -anything that is not a letter, digit or underscore), then the backslash just -takes away the special meaning (if any) of the character following it. +anything that is not a letter, digit, or underscore), then the backslash just +takes away any special meaning of the character following it. 
If the character following the backslash is an ASCII letter or an ASCII digit, then the sequence may be special; if so, it's listed below. A few letters have not been used yet, so escaping them with a backslash doesn't change them to be special. A future version of Perl may assign a special meaning to them, so if -you have warnings turned on, Perl will issue a warning if you use such a +you have warnings turned on, Perl issues a warning if you use such a sequence. [1]. It is however guaranteed that backslash or escape sequences never have a @@ -49,9 +48,9 @@ backslash. =item [1] -There is one exception. If you use an alphanumerical character as the +There is one exception. If you use an alphanumeric character as the delimiter of your pattern (which you probably shouldn't do for readability -reasons), you will have to escape the delimiter if you want to match +reasons), you have to escape the delimiter if you want to match it. Perl won't warn then. See also L<perlop/Gory details of parsing quoted constructs>. @@ -63,20 +62,21 @@ quoted constructs>. Those not usable within a bracketed character class (like C<[\da-z]>) are marked as C<Not in [].> - \000 Octal escape sequence. + \000 Octal escape sequence. See also \o{}. \1 Absolute backreference. Not in []. \a Alarm or bell. \A Beginning of string. Not in []. \b Word/non-word boundary. (Backspace in []). \B Not a word/non-word boundary. Not in []. - \cX Control-X (X can be any ASCII character). + \cX Control-X \C Single octet, even under UTF-8. Not in []. \d Character class for digits. \D Character class for non-digits. \e Escape character. \E Turn off \Q, \L and \U processing. Not in []. \f Form feed. - \g{}, \g1 Named, absolute or relative backreference. Not in []. + \F Foldcase till \E. Not in []. + \g{}, \g1 Named, absolute or relative backreference. Not in [] \G Pos assertion. Not in []. \h Character class for horizontal whitespace. \H Character class for non horizontal whitespace. 
@@ -86,10 +86,12 @@ as C<Not in [].> \L Lowercase till \E. Not in []. \n (Logical) newline character. \N Any character but newline. Experimental. Not in []. - \N{} Named or numbered (Unicode) character. + \N{} Named or numbered (Unicode) character or sequence. + \o{} Octal escape sequence. \p{}, \pP Character with the given Unicode property. \P{}, \PP Character without the given Unicode property. - \Q Quotemeta till \E. Not in []. + \Q Quote (disable) pattern metacharacters till \E. Not + in []. \r Return character. \R Generic new line. Not in []. \s Character class for whitespace. @@ -112,9 +114,10 @@ as C<Not in [].> A handful of characters have a dedicated I<character escape>. The following table shows them, along with their ASCII code points (in decimal and hex), -their ASCII name, the control escape (see below) and a short description. +their ASCII name, the control escape on ASCII platforms and a short +description. (For EBCDIC platforms, see L<perlebcdic/OPERATOR DIFFERENCES>.) - Seq. Code Point ASCII Cntr Description. + Seq. Code Point ASCII Cntrl Description. Dec Hex \a 7 07 BEL \cG alarm or bell \b 8 08 BS \cH backspace [1] @@ -128,13 +131,13 @@ their ASCII name, the control escape (see below) and a short description. =item [1] -C<\b> is only the backspace character inside a character class. Outside a +C<\b> is the backspace character only inside a character class. Outside a character class, C<\b> is a word/non-word boundary. =item [2] -C<\n> matches a logical newline. Perl will convert between C<\n> and your -OSses native newline character when reading from or writing to text files. +C<\n> matches a logical newline. Perl converts between C<\n> and your +OS's native newline character when reading from or writing to text files. =back @@ -145,10 +148,18 @@ OSses native newline character when reading from or writing to text files. 
=head3 Control characters C<\c> is used to denote a control character; the character following C<\c> -is the name of the control character. For instance, C</\cM/> matches the -character I<control-M> (a carriage return, code point 13). The case of the -character following C<\c> doesn't matter: C<\cM> and C<\cm> match the same -character. +determines the value of the construct. For example the value of C<\cA> is +C<chr(1)>, and the value of C<\cb> is C<chr(2)>, etc. +The gory details are in L<perlop/"Regexp Quote-Like Operators">. A complete +list of what C<chr(1)>, etc. means for ASCII and EBCDIC platforms is in +L<perlebcdic/OPERATOR DIFFERENCES>. + +Note that C<\c\> alone at the end of a regular expression (or double-quoted +string) is not valid. The backslash must be followed by another character. +That is, C<\c\I<X>> means C<chr(28) . 'I<X>'> for all characters I<X>. + +To write platform-independent code, you must use C<\N{I<NAME>}> instead, like +C<\N{ESCAPE}> or C<\N{U+001B}>, see L<charnames>. Mnemonic: I<c>ontrol character. @@ -156,42 +167,41 @@ Mnemonic: I<c>ontrol character. $str =~ /\cK/; # Matches if $str contains a vertical tab (control-K). -=head3 Named or numbered characters +=head3 Named or numbered characters and character sequences -All Unicode characters have a Unicode name and numeric ordinal value. Use the +Unicode characters have a Unicode name and numeric code point (ordinal) +value. Use the C<\N{}> construct to specify a character by either of these values. +Certain sequences of characters also have names. -To specify by name, the name of the character goes between the curly braces. -In this case, you have to C<use charnames> to load the Unicode names of the -characters, otherwise Perl will complain. +To specify by name, the name of the character or character sequence goes +between the curly braces.
-To specify by Unicode ordinal number, use the form -C<\N{U+I<wide hex character>}>, where I<wide hex character> is a number in -hexadecimal that gives the ordinal number that Unicode has assigned to the -desired character. It is customary (but not required) to use leading zeros to -pad the number to 4 digits. Thus C<\N{U+0041}> means -C<Latin Capital Letter A>, and you will rarely see it written without the two -leading zeros. C<\N{U+0041}> means C<A> even on EBCDIC machines (where the -ordinal value of C<A> is not 0x41). +To specify a character by Unicode code point, use the form C<\N{U+I<code +point>}>, where I<code point> is a number in hexadecimal that gives the +code point that Unicode has assigned to the desired character. It is +customary but not required to use leading zeros to pad the number to 4 +digits. Thus C<\N{U+0041}> means C<LATIN CAPITAL LETTER A>, and you will +rarely see it written without the two leading zeros. C<\N{U+0041}> means +"A" even on EBCDIC machines (where the ordinal value of "A" is not 0x41). -It is even possible to give your own names to characters, and even to short -sequences of characters. For details, see L<charnames>. +It is even possible to give your own names to characters and character +sequences. For details, see L<charnames>. (There is an expanded internal form that you may see in debug output: -C<\N{U+I<wide hex character>.I<wide hex character>...}>. -The C<...> means any number of these I<wide hex character>s separated by dots. +C<\N{U+I<code point>.I<code point>...}>. +The C<...> means any number of these I<code point>s separated by dots. This represents the sequence formed by the characters. This is an internal form only, subject to change, and you should not try to use it yourself.) Mnemonic: I<N>amed character. -Note that a character that is expressed as a named or numbered character is -considered as a character without special meaning by the regex engine, and will -match "as is". 
+Note that a character or character sequence expressed as a named +or numbered character is considered a character without special +meaning by the regex engine, and will match "as is". =head4 Example - use charnames ':full'; # Loads the Unicode names. $str =~ /\N{THAI CHARACTER SO SO}/; # Matches the Thai SO SO character use charnames 'Cyrillic'; # Loads Cyrillic names. @@ -199,30 +209,56 @@ match "as is". =head3 Octal escapes -Octal escapes consist of a backslash followed by two or three octal digits -matching the code point of the character you want to use. This allows for -512 characters (C<\00> up to C<\777>) that can be expressed this way (but -anything above C<\377> is deprecated). -Enough in pre-Unicode days, but most Unicode characters cannot be escaped -this way. - -Note that a character that is expressed as an octal escape is considered -as a character without special meaning by the regex engine, and will match +There are two forms of octal escapes. Each is used to specify a character by +its code point specified in octal notation. + +One form, available starting in Perl 5.14 looks like C<\o{...}>, where the dots +represent one or more octal digits. It can be used for any Unicode character. + +It was introduced to avoid the potential problems with the other form, +available in all Perls. That form consists of a backslash followed by three +octal digits. One problem with this form is that it can look exactly like an +old-style backreference (see +L</Disambiguation rules between old-style octal escapes and backreferences> +below.) You can avoid this by making the first of the three digits always a +zero, but that makes \077 the largest code point specifiable. + +In some contexts, a backslash followed by two or even one octal digits may be +interpreted as an octal escape, sometimes with a warning, and because of some +bugs, sometimes with surprising results. 
Also, if you are creating a regex +out of smaller snippets concatenated together, and you use fewer than three +digits, the beginning of one snippet may be interpreted as adding digits to the +ending of the snippet before it. See L</Absolute referencing> for more +discussion and examples of the snippet problem. + +Note that a character expressed as an octal escape is considered +a character without special meaning by the regex engine, and will match "as is". -=head4 Examples (assuming an ASCII platform) +To summarize, the C<\o{}> form is always safe to use, and the other form is +safe to use for code points through \077 when you use exactly three digits to +specify them. - $str = "Perl"; - $str =~ /\120/; # Match, "\120" is "P". - $str =~ /\120+/; # Match, "\120" is "P", it is repeated at least once. - $str =~ /P\053/; # No match, "\053" is "+" and taken literally. +Mnemonic: I<0>ctal or I<o>ctal. -=head4 Caveat +=head4 Examples (assuming an ASCII platform) -Octal escapes potentially clash with backreferences. They both consist -of a backslash followed by numbers. So Perl has to use heuristics to -determine whether it is a backreference or an octal escape. Perl uses -the following rules: + $str = "Perl"; + $str =~ /\o{120}/; # Match, "\120" is "P". + $str =~ /\120/; # Same. + $str =~ /\o{120}+/; # Match, "\120" is "P", it's repeated at least once + $str =~ /\120+/; # Same. + $str =~ /P\053/; # No match, "\053" is "+" and taken literally. + /\o{23073}/ # Black foreground, white background smiling face. + /\o{4801234567}/ # Raises a warning, and yields chr(4) + +=head4 Disambiguation rules between old-style octal escapes and backreferences + +Octal escapes of the C<\000> form outside of bracketed character classes +potentially clash with old-style backreferences. (see L</Absolute referencing> +below). They both consist of a backslash followed by numbers. So Perl has to +use heuristics to determine whether it is a backreference or an octal escape. 
+Perl uses the following rules to disambiguate: =over 4 @@ -236,30 +272,35 @@ If the first digit following the backslash is a 0, it's an octal escape. =item 3 -If the number following the backslash is N (decimal), and Perl already has -seen N capture groups, Perl will consider this to be a backreference. -Otherwise, it will consider it to be an octal escape. Note that if N > 999, -Perl only takes the first three digits for the octal escape; the rest is -matched as is. +If the number following the backslash is N (in decimal), and Perl already +has seen N capture groups, Perl considers this a backreference. Otherwise, +it considers it an octal escape. If N has more than three digits, Perl +takes only the first three for the octal escape; the rest are matched as is. my $pat = "(" x 999; $pat .= "a"; $pat .= ")" x 999; /^($pat)\1000$/; # Matches 'aa'; there are 1000 capture groups. /^$pat\1000$/; # Matches 'a@0'; there are 999 capture groups - # and \1000 is seen as \100 (a '@') and a '0'. + # and \1000 is seen as \100 (a '@') and a '0' =back +You can force a backreference interpretation always by using the C<\g{...}> +form. You can force an octal interpretation always by using the C<\o{...}> +form, or for numbers up through \077 (= 63 decimal), by using three digits, +beginning with a "0". + =head3 Hexadecimal escapes -Hexadecimal escapes start with C<\x> and are then either followed by a -two digit hexadecimal number, or a hexadecimal number of arbitrary length -surrounded by curly braces. The hexadecimal number is the code point of -the character you want to express. +Like octal escapes, there are two forms of hexadecimal escapes, but both start +with the same thing, C<\x>. This is followed by either exactly two hexadecimal +digits forming a number, or a hexadecimal number of arbitrary length surrounded +by curly braces. The hexadecimal number is the code point of the character you +want to express.
-Note that a character that is expressed as a hexadecimal escape is considered -as a character without special meaning by the regex engine, and will match +Note that a character expressed as one of these escapes is considered a +character without special meaning by the regex engine, and will match "as is". Mnemonic: heI<x>adecimal. @@ -268,7 +309,7 @@ Mnemonic: heI<x>adecimal. $str = "Perl"; $str =~ /\x50/; # Match, "\x50" is "P". - $str =~ /\x50+/; # Match, "\x50" is "P", it is repeated at least once. + $str =~ /\x50+/; # Match, "\x50" is "P", it is repeated at least once $str =~ /P\x2B/; # No match, "\x2B" is "+" and taken literally. /\x{2603}\x{2602}/ # Snowman with an umbrella. @@ -282,22 +323,29 @@ Mnemonic: heI<x>adecimal. A number of backslash sequences have to do with changing the character, or characters following them. C<\l> will lowercase the character following it, while C<\u> will uppercase (or, more accurately, titlecase) the -character following it. (They perform similar functionality as the -functions C<lcfirst> and C<ucfirst>). +character following it. They provide functionality similar to the +functions C<lcfirst> and C<ucfirst>. To uppercase or lowercase several characters, one might want to use C<\L> or C<\U>, which will lowercase/uppercase all characters following -them, until either the end of the pattern, or the next occurrence of -C<\E>, whatever comes first. They perform similar functionality as the -functions C<lc> and C<uc> do. +them, until either the end of the pattern or the next occurrence of +C<\E>, whichever comes first. They provide functionality similar to what +the functions C<lc> and C<uc> provide. -C<\Q> is used to escape all characters following, up to the next C<\E> -or the end of the pattern. C<\Q> adds a backslash to any character that -isn't a letter, digit or underscore. This will ensure that any character -between C<\Q> and C<\E> is matched literally, and will not be interpreted -by the regexp engine. 
+C<\Q> is used to quote (disable) pattern metacharacters, up to the next +C<\E> or the end of the pattern. C<\Q> adds a backslash to any character +that could have special meaning to Perl. In the ASCII range, it quotes +every character that isn't a letter, digit, or underscore. See +L<perlfunc/quotemeta> for details on what gets quoted for non-ASCII +code points. Using this ensures that any character between C<\Q> and +C<\E> will be matched literally, not interpreted as a metacharacter by +the regex engine. -Mnemonic: I<L>owercase, I<U>ppercase, I<Q>uotemeta, I<E>nd. +C<\F> can be used to casefold all characters following, up to the next C<\E> +or the end of the pattern. It provides functionality similar to +the C<fc> function. + +Mnemonic: I<L>owercase, I<U>ppercase, I<F>old-case, I<Q>uotemeta, I<E>nd. =head4 Examples @@ -316,15 +364,22 @@ the character classes are written as a backslash sequence. We will briefly discuss those here; full details of character classes can be found in L<perlrecharclass>. -C<\w> is a character class that matches any single I<word> character (letters, -digits, underscore). C<\d> is a character class that matches any decimal digit, -while the character class C<\s> matches any whitespace character. +C<\w> is a character class that matches any single I<word> character +(letters, digits, Unicode marks, and connector punctuation (like the +underscore)). C<\d> is a character class that matches any decimal +digit, while the character class C<\s> matches any whitespace character. New in perl 5.10.0 are the classes C<\h> and C<\v> which match horizontal and vertical whitespace characters. +The exact set of characters matched by C<\d>, C<\s>, and C<\w> varies +depending on various pragmas and regular expression modifiers. It is +possible to restrict the match to the ASCII range by using the C</a> +regular expression modifier. See L<perlrecharclass>.
+ The uppercase variants (C<\W>, C<\D>, C<\S>, C<\H>, and C<\V>) are -character classes that match any character that isn't a word character, -digit, whitespace, horizontal whitespace nor vertical whitespace. +character classes that match, respectively, any character that isn't a +word character, digit, whitespace, horizontal whitespace, or vertical +whitespace. Mnemonics: I<w>ord, I<d>igit, I<s>pace, I<h>orizontal, I<v>ertical. @@ -335,12 +390,11 @@ match a character that matches the given Unicode property; properties include things like "letter", or "thai character". Capitalizing the sequence to C<\PP> and C<\P{Property}> make the sequence match a character that doesn't match the given Unicode property. For more details, see -L<perlrecharclass/Backslashed sequences> and +L<perlrecharclass/Backslash sequences> and L<perlunicode/Unicode Character Properties>. Mnemonic: I<p>roperty. - =head2 Referencing If capturing parenthesis are used in a regular expression, we can refer @@ -352,41 +406,51 @@ absolutely, relatively, and by name. =head3 Absolute referencing -A backslash sequence that starts with a backslash and is followed by a -number is an absolute reference (but be aware of the caveat mentioned above). -If the number is I<N>, it refers to the Nth set of parentheses - whatever -has been matched by that set of parenthesis has to be matched by the C<\N> -as well. +Either C<\gI<N>> (starting in Perl 5.10.0), or C<\I<N>> (old-style) where I<N> +is a positive (unsigned) decimal number of any length is an absolute reference +to a capturing group. + +I<N> refers to the Nth set of parentheses, so C<\gI<N>> refers to whatever has +been matched by that set of parentheses. Thus C<\g1> refers to the first +capture group in the regex. + +The C<\gI<N>> form can be equivalently written as C<\g{I<N>}> +which avoids ambiguity when building a regex by concatenating shorter +strings. 
Otherwise if you had a regex C<qr/$a$b/>, and C<$a> contained +C<"\g1">, and C<$b> contained C<"37">, you would get C</\g137/> which is +probably not what you intended. + +In the C<\I<N>> form, I<N> must not begin with a "0", and there must be at +least I<N> capturing groups, or else I<N> is considered an octal escape +(but something like C<\18> is the same as C<\0018>; that is, the octal escape +C<"\001"> followed by a literal digit C<"8">). + +Mnemonic: I<g>roup. =head4 Examples - /(\w+) \1/; # Finds a duplicated word, (e.g. "cat cat"). - /(.)(.)\2\1/; # Match a four letter palindrome (e.g. "ABBA"). + /(\w+) \g1/; # Finds a duplicated word, (e.g. "cat cat"). + /(\w+) \1/; # Same thing; written old-style + /(.)(.)\g2\g1/; # Match a four letter palindrome (e.g. "ABBA"). =head3 Relative referencing -New in perl 5.10.0 is a different way of referring to capture buffers: C<\g>. -C<\g> takes a number as argument, with the number in curly braces (the -braces are optional). If the number (N) does not have a sign, it's a reference -to the Nth capture group (so C<\g{2}> is equivalent to C<\2> - except that -C<\g> always refers to a capture group and will never be seen as an octal -escape). If the number is negative, the reference is relative, referring to -the Nth group before the C<\g{-N}>. +C<\g-I<N>> (starting in Perl 5.10.0) is used for relative addressing. (It can +be written as C<\g{-I<N>}>.) It refers to the I<N>th group before the +C<\g{-I<N>}>. -The big advantage of C<\g{-N}> is that it makes it much easier to write +The big advantage of this form is that it makes it much easier to write patterns with references that can be interpolated in larger patterns, even if the larger pattern also contains capture groups. -Mnemonic: I<g>roup.
- =head4 Examples - /(A) # Buffer 1 - ( # Buffer 2 - (B) # Buffer 3 - \g{-1} # Refers to buffer 3 (B) - \g{-3} # Refers to buffer 1 (A) + /(A) # Group 1 + ( # Group 2 + (B) # Group 3 + \g{-1} # Refers to group 3 (B) + \g{-3} # Refers to group 1 (A) ) /x; # Matches "ABBA". @@ -395,17 +459,15 @@ Mnemonic: I<g>roup. =head3 Named referencing -Also new in perl 5.10.0 is the use of named capture buffers, which can be -referred to by name. This is done with C<\g{name}>, which is a -backreference to the capture buffer with the name I<name>. +C<\g{I<name>}> (starting in Perl 5.10.0) can be used to back refer to a +named capture group, dispensing completely with having to think about capture +buffer positions. To be compatible with .Net regular expressions, C<\g{name}> may also be written as C<\k{name}>, C<< \k<name> >> or C<\k'name'>. -Note that C<\g{}> has the potential to be ambiguous, as it could be a named -reference, or an absolute or relative reference (if its argument is numeric). -However, names are not allowed to start with digits, nor are they allowed to -contain a hyphen, so there is no ambiguity. +To prevent any ambiguity, I<name> must not start with a digit nor contain a +hyphen. =head4 Examples @@ -426,7 +488,7 @@ backslash sequences. =item \A C<\A> only matches at the beginning of the string. If the C</m> modifier -isn't used, then C</\A/> is equivalent with C</^/>. However, if the C</m> +isn't used, then C</\A/> is equivalent to C</^/>. However, if the C</m> modifier is used, then C</^/> matches internal newlines, but the meaning of C</\A/> isn't changed by the C</m> modifier. C<\A> matches at the beginning of the string regardless whether the C</m> modifier is used. @@ -434,26 +496,27 @@ of the string regardless whether the C</m> modifier is used. =item \z, \Z C<\z> and C<\Z> match at the end of the string. 
If the C</m> modifier isn't -used, then C</\Z/> is equivalent with C</$/>, that is, it matches at the -end of the string, or before the newline at the end of the string. If the +used, then C</\Z/> is equivalent to C</$/>; that is, it matches at the +end of the string, or one before the newline at the end of the string. If the C</m> modifier is used, then C</$/> matches at internal newlines, but the meaning of C</\Z/> isn't changed by the C</m> modifier. C<\Z> matches at the end of the string (or just before a trailing newline) regardless whether the C</m> modifier is used. -C<\z> is just like C<\Z>, except that it will not match before a trailing -newline. C<\z> will only match at the end of the string - regardless of the -modifiers used, and not before a newline. +C<\z> is just like C<\Z>, except that it does not match before a trailing +newline. C<\z> matches at the end of the string only, regardless of the +modifiers used, and not just before a newline. It is how to anchor the +match to the true end of the string under all conditions. =item \G -C<\G> is usually only used in combination with the C</g> modifier. If the -C</g> modifier is used (and the match is done in scalar context), Perl will -remember where in the source string the last match ended, and the next time, +C<\G> is usually used only in combination with the C</g> modifier. If the +C</g> modifier is used and the match is done in scalar context, Perl +remembers where in the source string the last match ended, and the next time, it will start the match from where it ended the previous time. -C<\G> matches the point where the previous match ended, or the beginning -of the string if there was no previous match. +C<\G> matches the point where the previous match on that string ended, +or the beginning of that string if there was no previous match. =for later add link to perlremodifiers @@ -466,7 +529,17 @@ matches at any place between characters where C<\b> doesn't match. 
C<\b> and C<\B> assume there's a non-word character before the beginning and after the end of the source string; so C<\b> will match at the beginning (or end) of the source string if the source string begins (or ends) with a word -character. Otherwise, C<\B> will match. +character. Otherwise, C<\B> will match. + +Do not use something like C<\b=head\d\b> and expect it to match the +beginning of a line. It can't, because for there to be a boundary before +the non-word "=", there must be a word character immediately previous. +All boundary determinations look for word characters alone, not for +non-word characters nor for string ends. It may help to understand how +C<\b> and C<\B> work by equating them as follows: + + \b really means (?:(?<=\w)(?!\w)|(?<!\w)(?=\w)) + \B really means (?:(?<=\w)(?=\w)|(?<!\w)(?!\w)) Mnemonic: I<b>oundary. @@ -494,7 +567,7 @@ Mnemonic: I<b>oundary. =head2 Misc Here we document the backslash sequences that don't fall in one of the -categories above. They are: +categories above. These are: =over 4 @@ -502,46 +575,59 @@ categories above. They are: C<\C> always matches a single octet, even if the source string is encoded in UTF-8 format, and the character to be matched is a multi-octet character. -C<\C> was introduced in perl 5.6. +C<\C> was introduced in perl 5.6. This is very dangerous, because it violates +the logical character abstraction and can cause UTF-8 sequences to become malformed. Mnemonic: oI<C>tet. =item \K -This is new in perl 5.10.0. Anything that is matched left of C<\K> is -not included in C<$&> - and will not be replaced if the pattern is -used in a substitution. This will allow you to write C<s/PAT1 \K PAT2/REPL/x> +This appeared in perl 5.10.0. Anything matched left of C<\K> is +not included in C<$&>, and will not be replaced if the pattern is +used in a substitution. This lets you write C<s/PAT1 \K PAT2/REPL/x> instead of C<s/(PAT1) PAT2/${1}REPL/x> or C<s/(?<=PAT1) PAT2/REPL/x>. Mnemonic: I<K>eep.
=item \N -This is a new experimental feature in perl 5.12.0. It matches any character -that is not a newline. It is a short-hand for writing C<[^\n]>, and is +This is an experimental feature new to perl 5.12.0. It matches any character +that is B<not> a newline. It is a short-hand for writing C<[^\n]>, and is identical to the C<.> metasymbol, except under the C</s> flag, which changes the meaning of C<.>, but not C<\N>. Note that C<\N{...}> can mean a -L<named or numbered character|/Named or numbered characters>. +L<named or numbered character +|/Named or numbered characters and character sequences>. Mnemonic: Complement of I<\n>. =item \R X<\R> -C<\R> matches a I<generic newline>, that is, anything that is considered -a newline by Unicode. This includes all characters matched by C<\v> -(vertical whitespace), and the multi character sequence C<"\x0D\x0A"> -(carriage return followed by a line feed, aka the network newline, or -the newline used in Windows text files). C<\R> is equivalent to -C<< (?>\x0D\x0A)|\v) >>. Since C<\R> can match a sequence of more than one -character, it cannot be put inside a bracketed character class; C</[\R]/> is an -error; use C<\v> instead. C<\R> was introduced in perl 5.10.0. +C<\R> matches a I<generic newline>; that is, anything considered a +linebreak sequence by Unicode. This includes all characters matched by +C<\v> (vertical whitespace), and the multi character sequence C<"\x0D\x0A"> +(carriage return followed by a line feed, sometimes called the network +newline; it's the end of line sequence used in Microsoft text files opened +in binary mode). C<\R> is equivalent to C<< (?>\x0D\x0A|\v) >>. (The +reason it doesn't backtrack is that the sequence is considered +inseparable. That means that + + "\x0D\x0A" =~ /^\R\x0A$/ # No match + +fails, because the C<\R> matches the entire string, and won't backtrack +to match just the C<"\x0D">.) 
Since +C<\R> can match a sequence of more than one character, it cannot be put +inside a bracketed character class; C</[\R]/> is an error; use C<\v> +instead. C<\R> was introduced in perl 5.10.0. + +Note that this does not respect any locale that might be in effect; it +matches according to the platform's native character set. Mnemonic: none really. C<\R> was picked because PCRE already uses C<\R>, and more importantly because Unicode recommends such a regular expression -metacharacter, and suggests C<\R> as the notation. +metacharacter, and suggests C<\R> as its notation. =item \X X<\X> @@ -561,15 +647,15 @@ Mnemonic: eI<X>tended Unicode character. =head4 Examples - "\x{256}" =~ /^\C\C$/; # Match as chr (256) takes 2 octets in UTF-8. + "\x{256}" =~ /^\C\C$/; # Match as chr (0x256) takes 2 octets in UTF-8. - $str =~ s/foo\Kbar/baz/g; # Change any 'bar' following a 'foo' to 'baz'. - $str =~ s/(.)\K\1//g; # Delete duplicated characters. + $str =~ s/foo\Kbar/baz/g; # Change any 'bar' following a 'foo' to 'baz' + $str =~ s/(.)\K\g1//g; # Delete duplicated characters. "\n" =~ /^\R$/; # Match, \n is a generic newline. "\r" =~ /^\R$/; # Match, \r is a generic newline. "\r\n" =~ /^\R$/; # Match, \r\n is a generic newline. - "P\x{0307}" =~ /^\X$/ # \X matches a P with a dot above. + "P\x{307}" =~ /^\X$/ # \X matches a P with a dot above. =cut diff --git a/gnu/usr.bin/perl/pod/perlrecharclass.pod b/gnu/usr.bin/perl/pod/perlrecharclass.pod index 7c920083814..06d206b2f8b 100644 --- a/gnu/usr.bin/perl/pod/perlrecharclass.pod +++ b/gnu/usr.bin/perl/pod/perlrecharclass.pod @@ -9,27 +9,29 @@ The top level documentation about Perl regular expressions is found in L<perlre>. This manual page discusses the syntax and use of character -classes in Perl Regular Expressions. +classes in Perl regular expressions. -A character class is a way of denoting a set of characters, +A character class is a way of denoting a set of characters in such a way that one character of the set is matched. 
-It's important to remember that matching a character class +It's important to remember that: matching a character class consumes exactly one character in the source string. (The source string is the string the regular expression is matched against.) There are three types of character classes in Perl regular -expressions: the dot, backslashed sequences, and the form enclosed in square +expressions: the dot, backslash sequences, and the form enclosed in square brackets. Keep in mind, though, that often the term "character class" is used -to mean just the bracketed form. This is true in other Perl documentation. +to mean just the bracketed form. Certainly, most Perl documentation does that. =head2 The dot The dot (or period), C<.> is probably the most used, and certainly the most well-known character class. By default, a dot matches any -character, except for the newline. The default can be changed to -add matching the newline with the I<single line> modifier: either -for the entire regular expression using the C</s> modifier, or -locally using C<(?s)>. +character, except for the newline. That default can be changed to +add matching the newline by using the I<single line> modifier: either +for the entire regular expression with the C</s> modifier, or +locally with C<(?s)>. (The experimental C<\N> backslash sequence, described +below, matches any character except newline without regard to the +I<single line> modifier.) Here are some examples: @@ -41,175 +43,289 @@ Here are some examples: "\n" =~ /(?s:.)/ # Match (local 'single line' modifier) "ab" =~ /^.$/ # No match (dot matches one character) -=head2 Backslashed sequences -X<\w> X<\W> X<\s> X<\S> X<\d> X<\D> X<\p> X<\P> +=head2 Backslash sequences +X<\w> X<\W> X<\s> X<\S> X<\d> X<\D> X<\p> X<\P> X<\N> X<\v> X<\V> X<\h> X<\H> X<word> X<whitespace> -Perl regular expressions contain many backslashed sequences that -constitute a character class. 
That is, they will match a single -character, if that character belongs to a specific set of characters -(defined by the sequence). A backslashed sequence is a sequence of -characters starting with a backslash. Not all backslashed sequences -are character classes; for a full list, see L<perlrebackslash>. +A backslash sequence is a sequence of characters, the first one of which is a +backslash. Perl ascribes special meaning to many such sequences, and some of +these are character classes. That is, they match a single character each, +provided that the character belongs to the specific set of characters defined +by the sequence. -Here's a list of the backslashed sequences that are character classes. They -are discussed in more detail below. +Here's a list of the backslash sequences that are character classes. They +are discussed in more detail below. (For the backslash sequences that aren't +character classes, see L<perlrebackslash>.) - \d Match a digit character. - \D Match a non-digit character. + \d Match a decimal digit character. + \D Match a non-decimal-digit character. \w Match a "word" character. \W Match a non-"word" character. \s Match a whitespace character. \S Match a non-whitespace character. \h Match a horizontal whitespace character. \H Match a character that isn't horizontal whitespace. - \N Match a character that isn't newline. Experimental. \v Match a vertical whitespace character. \V Match a character that isn't vertical whitespace. - \pP, \p{Prop} Match a character matching a Unicode property. - \PP, \P{Prop} Match a character that doesn't match a Unicode property. + \N Match a character that isn't a newline. Experimental. + \pP, \p{Prop} Match a character that has the given Unicode property. + \PP, \P{Prop} Match a character that doesn't have the Unicode property + +=head3 \N + +C<\N> is new in 5.12, and is experimental. It, like the dot, matches any +character that is not a newline. 
The difference is that C<\N> is not influenced +by the I<single line> regular expression modifier (see L</The dot> above). Note +that the form C<\N{...}> may mean something completely different. When the +C<{...}> is a L<quantifier|perlre/Quantifiers>, it means to match a non-newline +character that many times. For example, C<\N{3}> means to match 3 +non-newlines; C<\N{5,}> means to match 5 or more non-newlines. But if C<{...}> +is not a legal quantifier, it is presumed to be a named character. See +L<charnames> for those. For example, none of C<\N{COLON}>, C<\N{4F}>, and +C<\N{F4}> contain legal quantifiers, so Perl will try to find characters whose +names are respectively C<COLON>, C<4F>, and C<F4>. =head3 Digits -C<\d> matches a single character that is considered to be a I<digit>. What is -considered a digit depends on the internal encoding of the source string and -the locale that is in effect. If the source string is in UTF-8 format, C<\d> -not only matches the digits '0' - '9', but also Arabic, Devanagari and digits -from other languages. Otherwise, if there is a locale in effect, it will match -whatever characters the locale considers digits. Without a locale, C<\d> -matches the digits '0' to '9'. See L</Locale, EBCDIC, Unicode and UTF-8>. - -Any character that isn't matched by C<\d> will be matched by C<\D>. +C<\d> matches a single character considered to be a decimal I<digit>. +If the C</a> regular expression modifier is in effect, it matches [0-9]. +Otherwise, it +matches anything that is matched by C<\p{Digit}>, which includes [0-9]. +(An unlikely possible exception is that under locale matching rules, the +current locale might not have [0-9] matched by C<\d>, and/or might match +other characters whose code point is less than 256. Such a locale +definition would be in violation of the C language standard, but Perl +doesn't currently assume anything in regard to this.) 
+ +What this means is that unless the C</a> modifier is in effect C<\d> not +only matches the digits '0' - '9', but also Arabic, Devanagari, and +digits from other languages. This may cause some confusion, and some +security issues. + +Some digits that C<\d> matches look like some of the [0-9] ones, but +have different values. For example, BENGALI DIGIT FOUR (U+09EA) looks +very much like an ASCII DIGIT EIGHT (U+0038). An application that +is expecting only the ASCII digits might be misled, or if the match is +C<\d+>, the matched string might contain a mixture of digits from +different writing systems that look like they signify a number different +than they actually do. L<Unicode::UCD/num()> can +be used to safely +calculate the value, returning C<undef> if the input string contains +such a mixture. + +What C<\p{Digit}> means (and hence C<\d> except under the C</a> +modifier) is C<\p{General_Category=Decimal_Number}>, or synonymously, +C<\p{General_Category=Digit}>. Starting with Unicode version 4.1, this +is the same set of characters matched by C<\p{Numeric_Type=Decimal}>. +But Unicode also has a different property with a similar name, +C<\p{Numeric_Type=Digit}>, which matches a completely different set of +characters. These characters are things such as C<CIRCLED DIGIT ONE> +or subscripts, or are from writing systems that lack all ten digits. + +The design intent is for C<\d> to exactly match the set of characters +that can safely be used with "normal" big-endian positional decimal +syntax, where, for example 123 means one 'hundred', plus two 'tens', +plus three 'ones'. This positional notation does not necessarily apply +to characters that match the other type of "digit", +C<\p{Numeric_Type=Digit}>, and so C<\d> doesn't match them. + +The Tamil digits (U+0BE6 - U+0BEF) can also legally be +used in old-style Tamil numbers in which they would appear no more than +one in a row, separated by characters that mean "times 10", "times 100", +etc. 
(See L<http://www.unicode.org/notes/tn21>.) + +Any character not matched by C<\d> is matched by C<\D>. =head3 Word characters A C<\w> matches a single alphanumeric character (an alphabetic character, or a -decimal digit) or an underscore (C<_>), not a whole word. Use C<\w+> to match -a string of Perl-identifier characters (which isn't the same as matching an -English word). What is considered a word character depends on the internal -encoding of the string and the locale or EBCDIC code page that is in effect. If -it's in UTF-8 format, C<\w> matches those characters that are considered word -characters in the Unicode database. That is, it not only matches ASCII letters, -but also Thai letters, Greek letters, etc. If the source string isn't in UTF-8 -format, C<\w> matches those characters that are considered word characters by -the current locale or EBCDIC code page. Without a locale or EBCDIC code page, -C<\w> matches the ASCII letters, digits and the underscore. -See L</Locale, EBCDIC, Unicode and UTF-8>. - -Any character that isn't matched by C<\w> will be matched by C<\W>. +decimal digit) or a connecting punctuation character, such as an +underscore ("_"). It does not match a whole word. To match a whole +word, use C<\w+>. This isn't the same thing as matching an English word, but +in the ASCII range it is the same as a string of Perl-identifier +characters. + +=over + +=item If the C</a> modifier is in effect ... + +C<\w> matches the 63 characters [a-zA-Z0-9_]. + +=item otherwise ... + +=over + +=item For code points above 255 ... + +C<\w> matches the same as C<\p{Word}> matches in this range. That is, +it matches Thai letters, Greek letters, etc. This includes connector +punctuation (like the underscore) which connect two words together, or +diacritics, such as a C<COMBINING TILDE> and the modifier letters, which +are generally used to add auxiliary markings to letters. + +=item For code points below 256 ... + +=over + +=item if locale rules are in effect ... 
+ +C<\w> matches the platform's native underscore character plus whatever +the locale considers to be alphanumeric. + +=item if Unicode rules are in effect or if on an EBCDIC platform ... + +C<\w> matches exactly what C<\p{Word}> matches. + +=item otherwise ... + +C<\w> matches [a-zA-Z0-9_]. + +=back + +=back + +=back + +Which rules apply are determined as described in L<perlre/Which character set modifier is in effect?>. + +There are a number of security issues with the full Unicode list of word +characters. See L<http://unicode.org/reports/tr36>. + +Also, for a somewhat finer-grained set of characters that are in programming +language identifiers beyond the ASCII range, you may wish to instead use the +more customized L</Unicode Properties>, C<\p{ID_Start}>, +C<\p{ID_Continue}>, C<\p{XID_Start}>, and C<\p{XID_Continue}>. See +L<http://unicode.org/reports/tr31>. + +Any character not matched by C<\w> is matched by C<\W>. =head3 Whitespace -C<\s> matches any single character that is considered whitespace. In the ASCII -range, C<\s> matches the horizontal tab (C<\t>), the new line (C<\n>), the form -feed (C<\f>), the carriage return (C<\r>), and the space. (The vertical tab, -C<\cK> is not matched by C<\s>.) The exact set of characters matched by C<\s> -depends on whether the source string is in UTF-8 format and the locale or -EBCDIC code page that is in effect. If it's in UTF-8 format, C<\s> matches what -is considered whitespace in the Unicode database; the complete list is in the -table below. Otherwise, if there is a locale or EBCDIC code page in effect, -C<\s> matches whatever is considered whitespace by the current locale or EBCDIC -code page. Without a locale or EBCDIC code page, C<\s> matches the five -characters mentioned in the beginning of this paragraph. 
Perhaps the most -notable possible surprise is that C<\s> matches a non-breaking space only if -the non-breaking space is in a UTF-8 encoded string or the locale or EBCDIC -code page that is in effect has that character. -See L</Locale, EBCDIC, Unicode and UTF-8>. - -Any character that isn't matched by C<\s> will be matched by C<\S>. - -C<\h> will match any character that is considered horizontal whitespace; -this includes the space and the tab characters and 17 other characters that are -listed in the table below. C<\H> will match any character -that is not considered horizontal whitespace. - -C<\N> is new in 5.12, and is experimental. It, like the dot, will match any -character that is not a newline. The difference is that C<\N> will not be -influenced by the single line C</s> regular expression modifier. Note that -there is a second meaning of C<\N> when of the form C<\N{...}>. This form is -for named characters. See L<charnames> for those. If C<\N> is followed by an -opening brace and something that is not a quantifier, perl will assume that a -character name is coming, and not this meaning of C<\N>. For example, C<\N{3}> -means to match 3 non-newlines; C<\N{5,}> means to match 5 or more non-newlines, -but C<\N{4F}> and C<\N{F4}> are not legal quantifiers, and will cause perl to -look for characters named C<4F> or C<F4>, respectively (and won't find them, -thus raising an error, unless they have been defined using custom names). - -C<\v> will match any character that is considered vertical whitespace; -this includes the carriage return and line feed characters (newline) plus 5 -other characters listed in the table below. -C<\V> will match any character that is not considered vertical whitespace. +C<\s> matches any single character considered whitespace. + +=over + +=item If the C</a> modifier is in effect ... + +C<\s> matches the 5 characters [\t\n\f\r ]; that is, the horizontal tab, +the newline, the form feed, the carriage return, and the space. 
(Note +that it doesn't match the vertical tab, C<\cK> on ASCII platforms.) + +=item otherwise ... + +=over + +=item For code points above 255 ... + +C<\s> matches exactly the code points above 255 shown with an "s" column +in the table below. + +=item For code points below 256 ... + +=over + +=item if locale rules are in effect ... + +C<\s> matches whatever the locale considers to be whitespace. Note that +this is likely to include the vertical space, unlike non-locale C<\s> +matching. + +=item if Unicode rules are in effect or if on an EBCDIC platform ... + +C<\s> matches exactly the characters shown with an "s" column in the +table below. + +=item otherwise ... + +C<\s> matches [\t\n\f\r ]. +Note that this list doesn't include the non-breaking space. + +=back + +=back + +=back + +Which rules apply are determined as described in L<perlre/Which character set modifier is in effect?>. + +Any character not matched by C<\s> is matched by C<\S>. + +C<\h> matches any character considered horizontal whitespace; +this includes the platform's space and tab characters and several others +listed in the table below. C<\H> matches any character +not considered horizontal whitespace. They use the platform's native +character set, and do not consider any locale that may otherwise be in +use. + +C<\v> matches any character considered vertical whitespace; +this includes the platform's carriage return and line feed characters (newline) +plus several other characters, all listed in the table below. +C<\V> matches any character not considered vertical whitespace. +They use the platform's native character set, and do not consider any +locale that may otherwise be in use. C<\R> matches anything that can be considered a newline under Unicode rules. It's not a character class, as it can match a multi-character sequence. Therefore, it cannot be used inside a bracketed character -class; use C<\v> instead (vertical whitespace). +class; use C<\v> instead (vertical whitespace). 
It uses the platform's +native character set, and does not consider any locale that may +otherwise be in use. Details are discussed in L<perlrebackslash>. -Note that unlike C<\s>, C<\d> and C<\w>, C<\h> and C<\v> always match -the same characters, regardless whether the source string is in UTF-8 -format or not. The set of characters they match is also not influenced -by locale nor EBCDIC code page. +Note that unlike C<\s> (and C<\d> and C<\w>), C<\h> and C<\v> always match +the same characters, without regard to other factors, such as the active +locale or whether the source string is in UTF-8 format. -One might think that C<\s> is equivalent to C<[\h\v]>. This is not true. The -vertical tab (C<"\x0b">) is not matched by C<\s>, it is however considered -vertical whitespace. Furthermore, if the source string is not in UTF-8 format, -and any locale or EBCDIC code page that is in effect doesn't include them, the -next line (C<"\x85">) and the no-break space (C<"\xA0">) characters are not -matched by C<\s>, but are by C<\v> and C<\h> respectively. If the source -string is in UTF-8 format, both the next line and the no-break space are -matched by C<\s>. +One might think that C<\s> is equivalent to C<[\h\v]>. This is not true. +The difference is that the vertical tab (C<"\x0b">) is not matched by +C<\s>; it is however considered vertical whitespace. The following table is a complete listing of characters matched by -C<\s>, C<\h> and C<\v> as of Unicode 5.2. +C<\s>, C<\h> and C<\v> as of Unicode 6.0. -The first column gives the code point of the character (in hex format), +The first column gives the Unicode code point of the character (in hex format), the second column gives the (Unicode) name. The third column indicates by which class(es) the character is matched (assuming no locale or EBCDIC code page is in effect that changes the C<\s> matching). 
- 0x00009 CHARACTER TABULATION h s - 0x0000a LINE FEED (LF) vs - 0x0000b LINE TABULATION v - 0x0000c FORM FEED (FF) vs - 0x0000d CARRIAGE RETURN (CR) vs - 0x00020 SPACE h s - 0x00085 NEXT LINE (NEL) vs [1] - 0x000a0 NO-BREAK SPACE h s [1] - 0x01680 OGHAM SPACE MARK h s - 0x0180e MONGOLIAN VOWEL SEPARATOR h s - 0x02000 EN QUAD h s - 0x02001 EM QUAD h s - 0x02002 EN SPACE h s - 0x02003 EM SPACE h s - 0x02004 THREE-PER-EM SPACE h s - 0x02005 FOUR-PER-EM SPACE h s - 0x02006 SIX-PER-EM SPACE h s - 0x02007 FIGURE SPACE h s - 0x02008 PUNCTUATION SPACE h s - 0x02009 THIN SPACE h s - 0x0200a HAIR SPACE h s - 0x02028 LINE SEPARATOR vs - 0x02029 PARAGRAPH SEPARATOR vs - 0x0202f NARROW NO-BREAK SPACE h s - 0x0205f MEDIUM MATHEMATICAL SPACE h s - 0x03000 IDEOGRAPHIC SPACE h s + 0x0009 CHARACTER TABULATION h s + 0x000a LINE FEED (LF) vs + 0x000b LINE TABULATION v + 0x000c FORM FEED (FF) vs + 0x000d CARRIAGE RETURN (CR) vs + 0x0020 SPACE h s + 0x0085 NEXT LINE (NEL) vs [1] + 0x00a0 NO-BREAK SPACE h s [1] + 0x1680 OGHAM SPACE MARK h s + 0x180e MONGOLIAN VOWEL SEPARATOR h s + 0x2000 EN QUAD h s + 0x2001 EM QUAD h s + 0x2002 EN SPACE h s + 0x2003 EM SPACE h s + 0x2004 THREE-PER-EM SPACE h s + 0x2005 FOUR-PER-EM SPACE h s + 0x2006 SIX-PER-EM SPACE h s + 0x2007 FIGURE SPACE h s + 0x2008 PUNCTUATION SPACE h s + 0x2009 THIN SPACE h s + 0x200a HAIR SPACE h s + 0x2028 LINE SEPARATOR vs + 0x2029 PARAGRAPH SEPARATOR vs + 0x202f NARROW NO-BREAK SPACE h s + 0x205f MEDIUM MATHEMATICAL SPACE h s + 0x3000 IDEOGRAPHIC SPACE h s =over 4 =item [1] -NEXT LINE and NO-BREAK SPACE only match C<\s> if the source string is in -UTF-8 format, or the locale or EBCDIC code page that is in effect includes them. +NEXT LINE and NO-BREAK SPACE may or may not match C<\s> depending +on the rules in effect. See +L<the beginning of this section|/Whitespace>. =back -It is worth noting that C<\d>, C<\w>, etc, match single characters, not -complete numbers or words. 
To match a number (that consists of integers), -use C<\d+>; to match a word, use C<\w+>. - - =head3 Unicode Properties C<\pP> and C<\p{Prop}> are character classes to match characters that fit given @@ -217,24 +333,59 @@ Unicode properties. One letter property names can be used in the C<\pP> form, with the property name following the C<\p>, otherwise, braces are required. When using braces, there is a single form, which is just the property name enclosed in the braces, and a compound form which looks like C<\p{name=value}>, -which means to match if the property "name" for the character has the particular +which means to match if the property "name" for the character has that particular "value". For instance, a match for a number can be written as C</\pN/> or as C</\p{Number}/>, or as C</\p{Number=True}/>. Lowercase letters are matched by the property I<Lowercase_Letter> which -has as short form I<Ll>. They need the braces, so are written as C</\p{Ll}/> or +has the short form I<Ll>. They need the braces, so are written as C</\p{Ll}/> or C</\p{Lowercase_Letter}/>, or C</\p{General_Category=Lowercase_Letter}/> (the underscores are optional). C</\pLl/> is valid, but means something different. It matches a two character string: a letter (Unicode property C<\pL>), followed by a lowercase C<l>. -For more details, see L<perlunicode/Unicode Character Properties>; for a +If neither the C</a> modifier nor locale rules are in effect, the use of +a Unicode property will force the regular expression into using Unicode +rules. + +Note that almost all properties are immune to case-insensitive matching. +That is, adding a C</i> regular expression modifier does not change what +they match. There are two sets that are affected. The first set is +C<Uppercase_Letter>, +C<Lowercase_Letter>, +and C<Titlecase_Letter>, +all of which match C<Cased_Letter> under C</i> matching. 
+The second set is +C<Uppercase>, +C<Lowercase>, +and C<Titlecase>, +all of which match C<Cased> under C</i> matching. +(The difference between these sets is that some things, such as Roman +numerals, come in both upper and lower case, so they are C<Cased>, but +aren't considered to be letters, so they aren't C<Cased_Letter>s. They're +actually C<Letter_Number>s.) +This set also includes its subsets C<PosixUpper> and C<PosixLower>, both +of which under C</i> match C<PosixAlpha>. + +For more details on Unicode properties, see L<perlunicode/Unicode +Character Properties>; for a complete list of possible properties, see -L<perluniprops/Properties accessible through \p{} and \P{}>. +L<perluniprops/Properties accessible through \p{} and \P{}>, +which notes all forms that have C</i> differences. It is also possible to define your own properties. This is discussed in L<perlunicode/User-Defined Character Properties>. +Unicode properties are defined (surprise!) only on Unicode code points. +A warning is raised and all matches fail on non-Unicode code points +(those above the legal Unicode maximum of 0x10FFFF). This can be +somewhat surprising, + + chr(0x110000) =~ \p{ASCII_Hex_Digit=True} # Fails. + chr(0x110000) =~ \p{ASCII_Hex_Digit=False} # Also fails! + +Even though these two matches might be thought of as complements, they +are so only on Unicode code points. =head4 Examples @@ -259,17 +410,21 @@ L<perlunicode/User-Defined Character Properties>. # Thai Unicode class. "a" =~ /\P{Lao}/ # Match, as "a" is not a Laotian character. +It is worth emphasizing that C<\d>, C<\w>, etc, match single characters, not +complete numbers or words. To match a number (that consists of digits), +use C<\d+>; to match a word, use C<\w+>. But be aware of the security +considerations in doing so, as mentioned above. =head2 Bracketed Character Classes The third form of character class you can use in Perl regular expressions -is the bracketed form. 
In its simplest form, it lists the characters +is the bracketed character class. In its simplest form, it lists the characters that may be matched, surrounded by square brackets, like this: C<[aeiou]>. This matches one of C<a>, C<e>, C<i>, C<o> or C<u>. Like the other -character classes, exactly one character will be matched. To match +character classes, exactly one character is matched.* To match a longer string consisting of characters mentioned in the character -class, follow the character class with a quantifier. For instance, -C<[aeiou]+> matches a string of one or more lowercase ASCII vowels. +class, follow the character class with a L<quantifier|perlre/Quantifiers>. For +instance, C<[aeiou]+> matches one or more lowercase English vowels. Repeating a character in a character class has no effect; it's considered to be in the set only once. @@ -282,6 +437,19 @@ Examples: # a single character. "ae" =~ /^[aeiou]+$/ # Match, due to the quantifier. + ------- + +* There is an exception to a bracketed character class matching a +single character only. When the class is to match caselessly under C</i> +matching rules, and a character inside the class matches a +multiple-character sequence caselessly under Unicode rules, the class +(when not L<inverted|/Negation>) will also match that sequence. For +example, Unicode says that the letter C<LATIN SMALL LETTER SHARP S> +should match the sequence C<ss> under C</i> rules. Thus, + + 'ss' =~ /\A\N{LATIN SMALL LETTER SHARP S}\z/i # Matches + 'ss' =~ /\A[aeioust\N{LATIN SMALL LETTER SHARP S}]\z/i # Matches + =head3 Special Characters Inside a Bracketed Character Class Most characters that are meta characters in regular expressions (that @@ -297,7 +465,7 @@ escaped with a backslash, although this is sometimes not needed, in which case the backslash may be omitted. The sequence C<\b> is special inside a bracketed character class. 
While -outside the character class C<\b> is an assertion indicating a point +outside the character class, C<\b> is an assertion indicating a point that does not have either two word characters or two non-word characters on either side, inside a bracketed character class, C<\b> matches a backspace character. @@ -309,23 +477,28 @@ C<\e>, C<\f>, C<\n>, C<\N{I<NAME>}>, -C<\N{U+I<wide hex char>}>, +C<\N{U+I<hex char>}>, C<\r>, C<\t>, and C<\x> -are also special and have the same meanings as they do outside a bracketed character -class. +are also special and have the same meanings as they do outside a +bracketed character class. (However, inside a bracketed character +class, if C<\N{I<NAME>}> expands to a sequence of characters, only the first +one in the sequence is used, with a warning.) Also, a backslash followed by two or three octal digits is considered an octal number. -A C<[> is not special inside a character class, unless it's the start -of a POSIX character class (see below). It normally does not need escaping. +A C<[> is not special inside a character class, unless it's the start of a +POSIX character class (see L</POSIX Character Classes> below). It normally does +not need escaping. + +A C<]> is normally either the end of a POSIX character class (see +L</POSIX Character Classes> below), or it signals the end of the bracketed +character class. If you want to include a C<]> in the set of characters, you +must generally escape it. -A C<]> is normally either the end of a POSIX character class (see below), or it -signals the end of the bracketed character class. If you want to include a -C<]> in the set of characters, you must generally escape it. However, if the C<]> is the I<first> (or the second if the first character is a caret) character of a bracketed character class, it does not denote the end of the class (as you cannot have an empty class) @@ -346,26 +519,26 @@ Examples: =head3 Character Ranges It is not uncommon to want to match a range of characters. 
Luckily, instead -of listing all the characters in the range, one may use the hyphen (C<->). +of listing all characters in the range, one may use the hyphen (C<->). If inside a bracketed character class you have two characters separated -by a hyphen, it's treated as if all the characters between the two are in +by a hyphen, it's treated as if all characters between the two were in the class. For instance, C<[0-9]> matches any ASCII digit, and C<[a-m]> matches any lowercase letter from the first half of the ASCII alphabet. Note that the two characters on either side of the hyphen are not -necessary both letters or both digits. Any character is possible, +necessarily both letters or both digits. Any character is possible, although not advisable. C<['-?]> contains a range of characters, but -most people will not know which characters that will be. Furthermore, +most people will not know which characters that means. Furthermore, such ranges may lead to portability problems if the code has to run on a platform that uses a different character set, such as EBCDIC. If a hyphen in a character class cannot syntactically be part of a range, for instance because it is the first or the last character of the character class, -or if it immediately follows a range, the hyphen isn't special, and will be -considered a character that may be matched literally. You have to escape the -hyphen with a backslash if you want to have a hyphen in your set of characters -to be matched, and its position in the class is such that it could be -considered part of a range. +or if it immediately follows a range, the hyphen isn't special, and so is +considered a character to be matched literally. If you want a hyphen in +your set of characters to be matched and its position in the class is such +that it could be considered part of a range, you must escape that hyphen +with a backslash. 
Examples: @@ -383,13 +556,28 @@ Examples: It is also possible to instead list the characters you do not want to match. You can do so by using a caret (C<^>) as the first character in the -character class. For instance, C<[^a-z]> matches a character that is not a -lowercase ASCII letter. +character class. For instance, C<[^a-z]> matches any character that is not a +lowercase ASCII letter, which therefore includes more than a million +Unicode code points. The class is said to be "negated" or "inverted". This syntax makes the caret a special character inside a bracketed character class, but only if it is the first character of the class. So if you want -to have the caret as one of the characters you want to match, you either -have to escape the caret, or not list it first. +the caret as one of the characters to match, either escape the caret or +else don't list it first. + +In inverted bracketed character classes, Perl ignores the Unicode rules +that normally say that certain characters should match a sequence of +multiple characters under caseless C</i> matching. Following those +rules could lead to highly confusing situations: + + "ss" =~ /^[^\xDF]+$/ui; # Matches! + +This should match any sequences of characters that aren't C<\xDF> nor +what C<\xDF> matches under C</i>. C<"s"> isn't C<\xDF>, but Unicode +says that C<"ss"> is what C<\xDF> matches under C</i>. So which one +"wins"? Do you fail the match because the string has C<ss> or accept it +because it has an C<s> followed by another C<s>? Perl has chosen the +latter. Examples: @@ -401,15 +589,17 @@ Examples: =head3 Backslash Sequences You can put any backslash sequence character class (with the exception of -C<\N>) inside a bracketed character class, and it will act just -as if you put all the characters matched by the backslash sequence inside the -character class. For instance, C<[a-f\d]> will match any digit, or any of the -lowercase letters between 'a' and 'f' inclusive.
+C<\N> and C<\R>) inside a bracketed character class, and it will act just +as if you had put all characters matched by the backslash sequence inside the +character class. For instance, C<[a-f\d]> matches any decimal digit, or any +of the lowercase letters between 'a' and 'f' inclusive. + +C<\N> within a bracketed character class must be of the forms C<\N{I<name>}> +or C<\N{U+I<hex char>}>, and NOT be the form that matches non-newlines, +for the same reason that a dot C<.> inside a bracketed character class loses +its special meaning: it matches nearly anything, which generally isn't what you +want to happen. -C<\N> within a bracketed character class must be of the forms C<\N{I<name>}> or -C<\N{U+I<wide hex char>}> for the same reason that a dot C<.> inside a -bracketed character class loses its special meaning: it matches nearly -anything, which generally isn't what you want to happen. Examples: @@ -419,19 +609,21 @@ Examples: # character, nor a parenthesis. Backslash sequence character classes cannot form one of the endpoints -of a range. +of a range. Thus, you can't say: + + /[\p{Thai}-\d]/ # Wrong! -=head3 Posix Character Classes +=head3 POSIX Character Classes X<character class> X<\p> X<\p{}> X<alpha> X<alnum> X<ascii> X<blank> X<cntrl> X<digit> X<graph> X<lower> X<print> X<punct> X<space> X<upper> X<word> X<xdigit> -Posix character classes have the form C<[:class:]>, where I<class> is -name, and the C<[:> and C<:]> delimiters. Posix character classes only appear +POSIX character classes have the form C<[:class:]>, where I<class> is the +name, and C<[:> and C<:]> are the delimiters. POSIX character classes only appear I<inside> bracketed character classes, and are a convenient and descriptive -way of listing a group of characters, though they currently suffer from -portability issues (see below and L<Locale, EBCDIC, Unicode and UTF-8>). Be -careful about the syntax, +way of listing a group of characters.
+ +Be careful about the syntax, # Correct: $string =~ /[[:alpha:]]/ @@ -441,8 +633,8 @@ careful about the syntax, The latter pattern would be a character class consisting of a colon, and the letters C<a>, C<l>, C<p> and C<h>. -These character classes can be part of a larger bracketed character class. For -example, +POSIX character classes can be part of a larger bracketed character class. +For example, [01[:alpha:]%] @@ -451,7 +643,7 @@ is valid and matches '0', '1', any alphabetic character, and the percent sign. Perl recognizes the following POSIX character classes: alpha Any alphabetical character ("[A-Za-z]"). - alnum Any alphanumerical character. ("[A-Za-z0-9]") + alnum Any alphanumeric character. ("[A-Za-z0-9]") ascii Any character in the ASCII character set. blank A GNU extension, equal to a space or a horizontal tab ("\t"). cntrl Any control character. See Note [2] below. @@ -471,56 +663,34 @@ derived from official Unicode properties.) The table below shows the relation between POSIX character classes and these counterparts. One counterpart, in the column labelled "ASCII-range Unicode" in -the table will only match characters in the ASCII range. (On EBCDIC platforms, -they match those characters which have ASCII equivalents.) +the table, matches only characters in the ASCII character set. The other counterpart, in the column labelled "Full-range Unicode", matches any appropriate characters in the full Unicode character set. For example, -C<\p{Alpha}> will match not just the ASCII alphabetic characters, but any -character in the entire Unicode character set that is considered to be -alphabetic. - -(Each of the counterparts has various synonyms as well. -L<perluniprops/Properties accessible through \p{} and \P{}> lists all the -synonyms, plus all the characters matched by each of the ASCII-range -properties. For example C<\p{AHex}> is a synonym for C<\p{ASCII_Hex_Digit}>, -and any C<\p> property name can be prefixed with "Is" such as C<\p{IsAlpha}>.) 
- -Both the C<\p> forms are unaffected by any locale that is in effect, or whether -the string is in UTF-8 format or not, or whether the platform is EBCDIC or not. -In contrast, the POSIX character classes are affected. If the source string is -in UTF-8 format, the POSIX classes (with the exception of C<[[:punct:]]>, see -Note [5]) behave like their "Full-range" Unicode counterparts. If the source -string is not in UTF-8 format, and no locale is in effect, and the platform is -not EBCDIC, all the POSIX classes behave like their ASCII-range counterparts. -Otherwise, they behave based on the rules of the locale or EBCDIC code page. -It is proposed to change this behavior in a future release of Perl so that the -the UTF8ness of the source string will be irrelevant to the behavior of the -POSIX character classes. This means they will always behave in strict -accordance with the official POSIX standard. That is, if either locale or -EBCDIC code page is present, they will behave in accordance with those; if -absent, the classes will match only their ASCII-range counterparts. If you -disagree with this proposal, send email to C<perl5-porters@perl.org>. - - [[:...:]] ASCII-range Full-range backslash Note - Unicode Unicode sequence +C<\p{Alpha}> matches not just the ASCII alphabetic characters, but any +character in the entire Unicode character set considered alphabetic. +An entry in the column labelled "backslash sequence" is a (short) +equivalent. 
+ + [[:...:]] ASCII-range Full-range backslash Note + Unicode Unicode sequence ----------------------------------------------------- - alpha \p{PosixAlpha} \p{Alpha} - alnum \p{PosixAlnum} \p{Alnum} - ascii \p{ASCII} - blank \p{PosixBlank} \p{Blank} = [1] - \p{HorizSpace} \h [1] - cntrl \p{PosixCntrl} \p{Cntrl} [2] - digit \p{PosixDigit} \p{Digit} \d - graph \p{PosixGraph} \p{Graph} [3] - lower \p{PosixLower} \p{Lower} - print \p{PosixPrint} \p{Print} [4] - punct \p{PosixPunct} \p{Punct} [5] - \p{PerlSpace} \p{SpacePerl} \s [6] - space \p{PosixSpace} \p{Space} [6] - upper \p{PosixUpper} \p{Upper} - word \p{PerlWord} \p{Word} \w - xdigit \p{ASCII_Hex_Digit} \p{XDigit} + alpha \p{PosixAlpha} \p{XPosixAlpha} + alnum \p{PosixAlnum} \p{XPosixAlnum} + ascii \p{ASCII} + blank \p{PosixBlank} \p{XPosixBlank} \h [1] + or \p{HorizSpace} [1] + cntrl \p{PosixCntrl} \p{XPosixCntrl} [2] + digit \p{PosixDigit} \p{XPosixDigit} \d + graph \p{PosixGraph} \p{XPosixGraph} [3] + lower \p{PosixLower} \p{XPosixLower} + print \p{PosixPrint} \p{XPosixPrint} [4] + punct \p{PosixPunct} \p{XPosixPunct} [5] + \p{PerlSpace} \p{XPerlSpace} \s [6] + space \p{PosixSpace} \p{XPosixSpace} [6] + upper \p{PosixUpper} \p{XPosixUpper} + word \p{PosixWord} \p{XPosixWord} \w + xdigit \p{PosixXDigit} \p{XPosixXDigit} =over 4 @@ -531,47 +701,116 @@ C<\p{Blank}> and C<\p{HorizSpace}> are synonyms. =item [2] Control characters don't produce output as such, but instead usually control -the terminal somehow: for example newline and backspace are control characters. -In the ASCII range, characters whose ordinals are between 0 and 31 inclusive, +the terminal somehow: for example, newline and backspace are control characters. +In the ASCII range, characters whose code points are between 0 and 31 inclusive, plus 127 (C<DEL>) are control characters. 
On EBCDIC platforms, it is likely that the code page will define C<[[:cntrl:]]> to be the EBCDIC equivalents of the ASCII controls, plus the controls -that in Unicode have ordinals from 128 through 139. +that in Unicode have code points from 128 through 159. =item [3] Any character that is I<graphical>, that is, visible. This class consists -of all the alphanumerical characters and all punctuation characters. +of all alphanumeric characters and all punctuation characters. =item [4] -All printable characters, which is the set of all the graphical characters -plus whitespace characters that are not also controls. +All printable characters, which is the set of all graphical characters +plus those whitespace characters which are not also controls. =item [5] -C<\p{PosixPunct}> and C<[[:punct:]]> in the ASCII range match all the +C<\p{PosixPunct}> and C<[[:punct:]]> in the ASCII range match all non-controls, non-alphanumeric, non-space characters: C<[-!"#$%&'()*+,./:;<=E<gt>?@[\\\]^_`{|}~]> (although if a locale is in effect, it could alter the behavior of C<[[:punct:]]>). -When the matching string is in UTF-8 format, C<[[:punct:]]> matches the above -set, plus what C<\p{Punct}> matches. This is different than strictly matching -according to C<\p{Punct}>, because the above set includes characters that aren't -considered punctuation by Unicode, but rather "symbols". Another way to say it -is that for a UTF-8 string, C<[[:punct:]]> matches all the characters that -Unicode considers to be punctuation, plus all the ASCII-range characters that -Unicode considers to be symbols. +The similarly named property, C<\p{Punct}>, matches a somewhat different +set in the ASCII range, namely +C<[-!"#%&'()*,./:;?@[\\\]_{}]>. That is, it is missing the nine +characters C<[$+E<lt>=E<gt>^`|~]>. +This is because Unicode splits what POSIX considers to be punctuation into two +categories, Punctuation and Symbols.
+ +C<\p{XPosixPunct}> and (under Unicode rules) C<[[:punct:]]>, match what +C<\p{PosixPunct}> matches in the ASCII range, plus what C<\p{Punct}> +matches. This is different than strictly matching according to +C<\p{Punct}>. Another way to say it is that +if Unicode rules are in effect, C<[[:punct:]]> matches all characters +that Unicode considers punctuation, plus all ASCII-range characters that +Unicode considers symbols. =item [6] -C<\p{SpacePerl}> and C<\p{Space}> differ only in that C<\p{Space}> additionally +C<\p{SpacePerl}> and C<\p{Space}> differ only in that in non-locale +matching, C<\p{Space}> additionally matches the vertical tab, C<\cK>. Same for the two ASCII-only range forms. =back -=head4 Negation +There are various other synonyms that can be used besides the names +listed in the table. For example, C<\p{PosixAlpha}> can be written as +C<\p{Alpha}>. All are listed in +L<perluniprops/Properties accessible through \p{} and \P{}>, +plus all characters matched by each ASCII-range property. + +Both the C<\p> counterparts always assume Unicode rules are in effect. +On ASCII platforms, this means they assume that the code points from 128 +to 255 are Latin-1, and that means that using them under locale rules is +unwise unless the locale is guaranteed to be Latin-1 or UTF-8. In contrast, the +POSIX character classes are useful under locale rules. They are +affected by the actual rules in effect, as follows: + +=over + +=item If the C</a> modifier is in effect ... + +Each of the POSIX classes matches exactly the same as their ASCII-range +counterparts. + +=item otherwise ... + +=over + +=item For code points above 255 ... + +The POSIX class matches the same as its Full-range counterpart. + +=item For code points below 256 ... + +=over + +=item if locale rules are in effect ... + +The POSIX class matches according to the locale, except that +C<word> uses the platform's native underscore character, no matter what +the locale is.
+ +=item if Unicode rules are in effect or if on an EBCDIC platform ... + +The POSIX class matches the same as the Full-range counterpart. + +=item otherwise ... + +The POSIX class matches the same as the ASCII range counterpart. + +=back + +=back + +=back + +Which rules apply are determined as described in +L<perlre/Which character set modifier is in effect?>. + +It is proposed to change this behavior in a future release of Perl so that +whether or not Unicode rules are in effect would not change the +behavior: Outside of locale or an EBCDIC code page, the POSIX classes +would behave like their ASCII-range counterparts. If you wish to +comment on this proposal, send email to C<perl5-porters@perl.org>. + +=head4 Negation of POSIX character classes X<character class, negation> A Perl extension to the POSIX character class is the ability to @@ -581,17 +820,19 @@ Some examples: POSIX ASCII-range Full-range backslash Unicode Unicode sequence ----------------------------------------------------- - [[:^digit:]] \P{PosixDigit} \P{Digit} \D - [[:^space:]] \P{PosixSpace} \P{Space} - \P{PerlSpace} \P{SpacePerl} \S - [[:^word:]] \P{PerlWord} \P{Word} \W + [[:^digit:]] \P{PosixDigit} \P{XPosixDigit} \D + [[:^space:]] \P{PosixSpace} \P{XPosixSpace} + \P{PerlSpace} \P{XPerlSpace} \S + [[:^word:]] \P{PerlWord} \P{XPosixWord} \W -=head4 [= =] and [. .] +The backslash sequence can mean either ASCII- or Full-range Unicode, +depending on various factors as described in L<perlre/Which character set modifier is in effect?>. -Perl will recognize the POSIX character classes C<[=class=]>, and -C<[.class.]>, but does not (yet?) support them. Use of -such a construct will lead to an error. +=head4 [= =] and [. .] +Perl recognizes the POSIX character classes C<[=class=]> and +C<[.class.]>, but does not (yet?) support them. Any attempt to use +either construct raises an exception. =head4 Examples @@ -607,44 +848,3 @@ such a construct will lead to an error. # hex digit. 
The result matches all # characters except the letters 'a' to 'f' and # 'A' to 'F'. - - -=head2 Locale, EBCDIC, Unicode and UTF-8 - -Some of the character classes have a somewhat different behaviour depending -on the internal encoding of the source string, and the locale that is -in effect, and if the program is running on an EBCDIC platform. - -C<\w>, C<\d>, C<\s> and the POSIX character classes (and their negations, -including C<\W>, C<\D>, C<\S>) suffer from this behaviour. (Since the backslash -sequences C<\b> and C<\B> are defined in terms of C<\w> and C<\W>, they also are -affected.) - -The rule is that if the source string is in UTF-8 format, the character -classes match according to the Unicode properties. If the source string -isn't, then the character classes match according to whatever locale or EBCDIC -code page is in effect. If there is no locale nor EBCDIC, they match the ASCII -defaults (52 letters, 10 digits and underscore for C<\w>; 0 to 9 for C<\d>; -etc.). - -This usually means that if you are matching against characters whose C<ord()> -values are between 128 and 255 inclusive, your character class may match -or not depending on the current locale or EBCDIC code page, and whether the -source string is in UTF-8 format. The string will be in UTF-8 format if it -contains characters whose C<ord()> value exceeds 255. But a string may be in -UTF-8 format without it having such characters. See L<perluniprops/The -"Unicode Bug">. - -For portability reasons, it may be better to not use C<\w>, C<\d>, C<\s> -or the POSIX character classes, and use the Unicode properties instead. - -=head4 Examples - - $str = "\xDF"; # $str is not in UTF-8 format. - $str =~ /^\w/; # No match, as $str isn't in UTF-8 format. - $str .= "\x{0e0b}"; # Now $str is in UTF-8 format. - $str =~ /^\w/; # Match! $str is now in UTF-8 format. - chop $str; - $str =~ /^\w/; # Still a match! $str remains in UTF-8 format. 
- -=cut diff --git a/gnu/usr.bin/perl/pod/perlreftut.pod b/gnu/usr.bin/perl/pod/perlreftut.pod index 7898b6db53c..9565562711d 100644 --- a/gnu/usr.bin/perl/pod/perlreftut.pod +++ b/gnu/usr.bin/perl/pod/perlreftut.pod @@ -7,7 +7,7 @@ perlreftut - Mark's very short tutorial about references One of the most important new features in Perl 5 was the capability to manage complicated data structures like multidimensional arrays and nested hashes. To enable these, Perl 5 introduced a feature called -`references', and using references is the key to managing complicated, +'references', and using references is the key to managing complicated, structured data in Perl. Unfortunately, there's a lot of funny syntax to learn, and the main manual page can be hard to follow. The manual is quite complete, and sometimes people find that a problem, because @@ -402,7 +402,7 @@ This is Perl, so it does the exact right thing. It sees that you want to push C<Athens> onto an array that doesn't exist, so it helpfully makes a new, empty, anonymous array for you, installs it into C<%table>, and then pushes C<Athens> onto it. This is called -`autovivification'--bringing things to life automatically. Perl saw +'autovivification'--bringing things to life automatically. Perl saw that they key wasn't in the hash, so it created a new hash entry automatically. Perl saw that you wanted to use the hash value as an array, so it created a new empty array and installed a reference to it diff --git a/gnu/usr.bin/perl/pod/perlrequick.pod b/gnu/usr.bin/perl/pod/perlrequick.pod index 4b5e19a0fb1..bd44d013c54 100644 --- a/gnu/usr.bin/perl/pod/perlrequick.pod +++ b/gnu/usr.bin/perl/pod/perlrequick.pod @@ -19,7 +19,7 @@ contains that word: "Hello World" =~ /World/; # matches In this statement, C<World> is a regex and the C<//> enclosing -C</World/> tells perl to search a string for a match. The operator +C</World/> tells Perl to search a string for a match. 
The operator C<=~> associates the string with the regex match and produces a true value if the regex matched, or false if the regex did not match. In our case, C<World> matches the second word in C<"Hello World">, so the @@ -58,7 +58,7 @@ statement to be true: "Hello World" =~ /o W/; # matches, ' ' is an ordinary char "Hello World" =~ /World /; # doesn't match, no ' ' at end -perl will always match at the earliest possible point in the string: +Perl will always match at the earliest possible point in the string: "Hello World" =~ /o/; # matches 'o' in 'Hello' "That hat is red" =~ /hat/; # matches 'hat' in 'That' @@ -88,7 +88,7 @@ e.g., C<\x1B>: "1000\t2000" =~ m(0\t2) # matches "cat" =~ /\143\x61\x74/ # matches in ASCII, but a weird way to spell cat -Regexes are treated mostly as double quoted strings, so variable +Regexes are treated mostly as double-quoted strings, so variable substitution works: $foo = 'house'; @@ -161,7 +161,10 @@ character, or the match fails. Then /[^0-9]/; # matches a non-numeric character /[a^]at/; # matches 'aat' or '^at'; here '^' is ordinary -Perl has several abbreviations for common character classes: +Perl has several abbreviations for common character classes. (These +definitions are those that Perl uses in ASCII-safe mode with the C</a> modifier. +Otherwise they could match many more non-ASCII Unicode characters as +well. See L<perlrecharclass/Backslash sequences> for details.) =over 4 @@ -233,11 +236,11 @@ boundary. We can match different character strings with the B<alternation> metacharacter C<'|'>. To match C<dog> or C<cat>, we form the regex -C<dog|cat>. As before, perl will try to match the regex at the +C<dog|cat>. As before, Perl will try to match the regex at the earliest possible point in the string. At each character position, -perl will first try to match the first alternative, C<dog>. If -C<dog> doesn't match, perl will then try the next alternative, C<cat>. 
-If C<cat> doesn't match either, then the match fails and perl moves to +Perl will first try to match the first alternative, C<dog>. If +C<dog> doesn't match, Perl will then try the next alternative, C<cat>. +If C<cat> doesn't match either, then the match fails and Perl moves to the next position in the string. Some examples: "cats and dogs" =~ /cat|dog|bird/; # matches "cat" @@ -298,13 +301,13 @@ indicated below it: 1 2 34 Associated with the matching variables C<$1>, C<$2>, ... are -the B<backreferences> C<\1>, C<\2>, ... Backreferences are +the B<backreferences> C<\g1>, C<\g2>, ... Backreferences are matching variables that can be used I<inside> a regex: - /(\w\w\w)\s\1/; # find sequences like 'the the' in string + /(\w\w\w)\s\g1/; # find sequences like 'the the' in string -C<$1>, C<$2>, ... should only be used outside of a regex, and C<\1>, -C<\2>, ... only inside a regex. +C<$1>, C<$2>, ... should only be used outside of a regex, and C<\g1>, +C<\g2>, ... only inside a regex. =head2 Matching repetitions @@ -347,10 +350,10 @@ Here are some examples: /[a-z]+\s+\d*/; # match a lowercase word, at least some space, and # any number of digits - /(\w+)\s+\1/; # match doubled words of arbitrary length - $year =~ /\d{2,4}/; # make sure year is at least 2 but not more - # than 4 digits - $year =~ /\d{4}|\d{2}/; # better match; throw out 3 digit dates + /(\w+)\s+\g1/; # match doubled words of arbitrary length + $year =~ /^\d{2,4}$/; # make sure year is at least 2 but not more + # than 4 digits + $year =~ /^\d{4}$|^\d{2}$/; # better match; throw out 3 digit dates These quantifiers will try to match as much of the string as possible, while still allowing the regex to match. So we have @@ -368,22 +371,7 @@ no string left to it, so it matches 0 times. =head2 More matching There are a few more things you might want to know about matching -operators. 
In the code - - $pattern = 'Seuss'; - while (<>) { - print if /$pattern/; - } - -perl has to re-evaluate C<$pattern> each time through the loop. If -C<$pattern> won't be changing, use the C<//o> modifier, to only -perform variable substitutions once. If you don't want any -substitutions at all, use the special delimiter C<m''>: - - @pattern = ('Seuss'); - m/@pattern/; # matches 'Seuss' - m'@pattern'; # matches the literal string '@pattern' - +operators. The global modifier C<//g> allows the matching operator to match within a string as many times as possible. In scalar context, successive matches against a string will have C<//g> jump from match @@ -417,11 +405,11 @@ there are no groupings, a list of matches to the whole regex. So =head2 Search and replace Search and replace is performed using C<s/regex/replacement/modifiers>. -The C<replacement> is a Perl double quoted string that replaces in the +The C<replacement> is a Perl double-quoted string that replaces in the string whatever is matched with the C<regex>. The operator C<=~> is also used here to associate a string with C<s///>. If matching -against C<$_>, the S<C<$_ =~> > can be dropped. If there is a match, -C<s///> returns the number of substitutions made, otherwise it returns +against C<$_>, the S<C<$_ =~>> can be dropped. If there is a match, +C<s///> returns the number of substitutions made; otherwise it returns false. Here are a few examples: $x = "Time to feed the cat!"; @@ -440,6 +428,21 @@ of the regex in the string: $x = "I batted 4 for 4"; $x =~ s/4/four/g; # $x contains "I batted four for four" +The non-destructive modifier C<s///r> causes the result of the substitution +to be returned instead of modifying C<$_> (or whatever variable the +substitute was bound to with C<=~>): + + $x = "I like dogs."; + $y = $x =~ s/dogs/cats/r; + print "$x $y\n"; # prints "I like dogs. I like cats." 
+ + $x = "Cats are great."; + print $x =~ s/Cats/Dogs/r =~ s/Dogs/Frogs/r =~ s/Frogs/Hedgehogs/r, "\n"; + # prints "Hedgehogs are great." + + @foo = map { s/[a-z]/X/r } qw(a b c 1 2 3); + # @foo is now qw(X X X 1 2 3) + The evaluation modifier C<s///e> wraps an C<eval{...}> around the replacement string and the evaluated result is substituted for the matched substring. Some examples: @@ -454,7 +457,7 @@ matched substring. Some examples: The last example shows that C<s///> can use other delimiters, such as C<s!!!> and C<s{}{}>, and even C<s{}//>. If single quotes are used -C<s'''>, then the regex and replacement are treated as single quoted +C<s'''>, then the regex and replacement are treated as single-quoted strings. =head2 The split operator diff --git a/gnu/usr.bin/perl/pod/perlreref.pod b/gnu/usr.bin/perl/pod/perlreref.pod index 94ac5dcef5d..954a423759c 100644 --- a/gnu/usr.bin/perl/pod/perlreref.pod +++ b/gnu/usr.bin/perl/pod/perlreref.pod @@ -21,7 +21,7 @@ false if the match succeeds, and true if it fails. $var !~ /foo/; -C<m/pattern/msixpogc> searches a string for a pattern match, +C<m/pattern/msixpogcdual> searches a string for a pattern match, applying the given options. m Multiline mode - ^ and $ match internal lines @@ -33,21 +33,28 @@ applying the given options. o compile pattern Once g Global - all occurrences c don't reset pos on failed matches when using /g + a restrict \d, \s, \w and [:posix:] to match ASCII only + aa (two a's) also /i matches exclude ASCII/non-ASCII + l match according to current locale + u match according to Unicode rules + d match according to native rules unless something indicates + Unicode If 'pattern' is an empty string, the last I<successfully> matched regex is used. Delimiters other than '/' may be used for both this operator and the following ones. The leading C<m> can be omitted if the delimiter is '/'. 
-C<qr/pattern/msixpo> lets you store a regex in a variable, +C<qr/pattern/msixpodual> lets you store a regex in a variable, or pass one around. Modifiers as for C<m//>, and are stored within the regex. -C<s/pattern/replacement/msixpogce> substitutes matches of +C<s/pattern/replacement/msixpogcedual> substitutes matches of 'pattern' with 'replacement'. Modifiers as for C<m//>, -with one addition: +with two additions: e Evaluate 'replacement' as an expression + r Return substitution and leave the original string untouched. 'e' may be specified multiple times. 'replacement' is interpreted as a double quoted string unless a single-quote (C<'>) is the delimiter. @@ -57,25 +64,26 @@ delimiters can be used. Must be reset with reset(). =head2 SYNTAX - \ Escapes the character immediately following it - . Matches any single character except a newline (unless /s is used) - ^ Matches at the beginning of the string (or line, if /m is used) - $ Matches at the end of the string (or line, if /m is used) - * Matches the preceding element 0 or more times - + Matches the preceding element 1 or more times - ? Matches the preceding element 0 or 1 times - {...} Specifies a range of occurrences for the element preceding it - [...] Matches any one of the characters contained within the brackets - (...) Groups subexpressions for capturing to $1, $2... - (?:...) Groups subexpressions without capturing (cluster) - | Matches either the subexpression preceding or following it - \1, \2, \3 ... Matches the text from the Nth group - \g1 or \g{1}, \g2 ... Matches the text from the Nth group - \g-1 or \g{-1}, \g-2 ... Matches the text from the Nth previous group - \g{name} Named backreference - \k<name> Named backreference - \k'name' Named backreference - (?P=name) Named backreference (python syntax) + \ Escapes the character immediately following it + . 
Matches any single character except a newline (unless /s is + used) + ^ Matches at the beginning of the string (or line, if /m is used) + $ Matches at the end of the string (or line, if /m is used) + * Matches the preceding element 0 or more times + + Matches the preceding element 1 or more times + ? Matches the preceding element 0 or 1 times + {...} Specifies a range of occurrences for the element preceding it + [...] Matches any one of the characters contained within the brackets + (...) Groups subexpressions for capturing to $1, $2... + (?:...) Groups subexpressions without capturing (cluster) + | Matches either the subexpression preceding or following it + \g1 or \g{1}, \g2 ... Matches the text from the Nth group + \1, \2, \3 ... Matches the text from the Nth group + \g-1 or \g{-1}, \g-2 ... Matches the text from the Nth previous group + \g{name} Named backreference + \k<name> Named backreference + \k'name' Named backreference + (?P=name) Named backreference (python syntax) =head2 ESCAPE SEQUENCES @@ -87,17 +95,19 @@ These work as in normal strings. \n Newline \r Carriage return \t Tab - \037 Any octal ASCII value - \x7f Any hexadecimal ASCII value - \x{263a} A wide hexadecimal value + \037 Char whose ordinal is the 3 octal digits, max \777 + \o{2307} Char whose ordinal is the octal number, unrestricted + \x7f Char whose ordinal is the 2 hex digits, max \xFF + \x{263a} Char whose ordinal is the hex number, unrestricted \cx Control-x - \N{name} A named character + \N{name} A named Unicode character or character sequence \N{U+263D} A Unicode character by hex ordinal \l Lowercase next character \u Titlecase next character \L Lowercase until \E \U Uppercase until \E + \F Foldcase until \E \Q Disable pattern metacharacters until \E \E End modification @@ -126,9 +136,9 @@ and L<perlunicode> for details. 
\S A non-whitespace character \h An horizontal whitespace \H A non horizontal whitespace - \N A non newline (when not followed by '{NAME}'; experimental; not - valid in a character class; equivalent to [^\n]; it's like '.' - without /s modifier) + \N A non newline (when not followed by '{NAME}'; experimental; + not valid in a character class; equivalent to [^\n]; it's + like '.' without /s modifier) \v A vertical whitespace \V A non vertical whitespace \R A generic newline (?>\v|\x0D\x0A) @@ -142,27 +152,46 @@ and L<perlunicode> for details. POSIX character classes and their Unicode and Perl equivalents: - alnum IsAlnum Alphanumeric - alpha IsAlpha Alphabetic - ascii IsASCII Any ASCII char - blank IsSpace [ \t] Horizontal whitespace (GNU extension) - cntrl IsCntrl Control characters - digit IsDigit \d Digits - graph IsGraph Alphanumeric and punctuation - lower IsLower Lowercase chars (locale and Unicode aware) - print IsPrint Alphanumeric, punct, and space - punct IsPunct Punctuation - space IsSpace [\s\ck] Whitespace - IsSpacePerl \s Perl's whitespace definition - upper IsUpper Uppercase chars (locale and Unicode aware) - word IsWord \w Alphanumeric plus _ (Perl extension) - xdigit IsXDigit [0-9A-Fa-f] Hexadecimal digit + ASCII- Full- + POSIX range range backslash + [[:...:]] \p{...} \p{...} sequence Description + + ----------------------------------------------------------------------- + alnum PosixAlnum XPosixAlnum Alpha plus Digit + alpha PosixAlpha XPosixAlpha Alphabetic characters + ascii ASCII Any ASCII character + blank PosixBlank XPosixBlank \h Horizontal whitespace; + full-range also + written as + \p{HorizSpace} (GNU + extension) + cntrl PosixCntrl XPosixCntrl Control characters + digit PosixDigit XPosixDigit \d Decimal digits + graph PosixGraph XPosixGraph Alnum plus Punct + lower PosixLower XPosixLower Lowercase characters + print PosixPrint XPosixPrint Graph plus Print, but + not any Cntrls + punct PosixPunct XPosixPunct Punctuation and Symbols + in 
ASCII-range; just + punct outside it + space PosixSpace XPosixSpace [\s\cK] + PerlSpace XPerlSpace \s Perl's whitespace def'n + upper PosixUpper XPosixUpper Uppercase characters + word PosixWord XPosixWord \w Alnum + Unicode marks + + connectors, like '_' + (Perl extension) + xdigit ASCII_Hex_Digit XPosixDigit Hexadecimal digit, + ASCII-range is + [0-9A-Fa-f] + +Also, various synonyms like C<\p{Alpha}> for C<\p{XPosixAlpha}>; all listed +in L<perluniprops/Properties accessible through \p{} and \P{}> Within a character class: - POSIX traditional Unicode - [:digit:] \d \p{IsDigit} - [:^digit:] \D \P{IsDigit} + POSIX traditional Unicode + [:digit:] \d \p{Digit} + [:^digit:] \D \P{Digit} =head2 ANCHORS @@ -176,7 +205,6 @@ All are zero-width assertions. \Z Match string end (before optional newline) \z Match absolute string end \G Match where previous m//g left off - \K Keep the stuff left of the \K, don't include it in $& =head2 QUANTIFIERS @@ -222,6 +250,10 @@ There is no quantifier C<{,n}>. That's interpreted as a literal string. (?P>name) Recurse into a named subpattern (python syntax) (?(cond)yes|no) (?(cond)yes) Conditional expression, where "cond" can be: + (?=pat) look-ahead + (?!pat) negative look-ahead + (?<=pat) look-behind + (?<!pat) negative look-behind (N) subpattern N has matched something (<name>) named subpattern has matched something ('name') named subpattern has matched something @@ -257,8 +289,8 @@ specify the C</p> (preserve) modifier on your regular expression. $^R Holds the result of the last (?{...}) expr @- Offsets of starts of groups. $-[0] holds start of whole match @+ Offsets of ends of groups. $+[0] holds end of whole match - %+ Named capture buffers - %- Named capture buffers, as array refs + %+ Named capture groups + %- Named capture groups, as array refs Captured groups are numbered according to their I<opening> paren. @@ -268,6 +300,7 @@ Captured groups are numbered according to their I<opening> paren. 
lcfirst Lowercase first char of a string uc Uppercase a string ucfirst Titlecase first char of a string + fc Foldcase a string pos Return or set current match position quotemeta Quote metacharacters @@ -276,8 +309,9 @@ Captured groups are numbered according to their I<opening> paren. split Use a regex to split a string into parts -The first four of these are like the escape sequences C<\L>, C<\l>, -C<\U>, and C<\u>. For Titlecase, see L</Titlecase>. +The first five of these are like the escape sequences C<\L>, C<\l>, +C<\U>, C<\u>, and C<\F>. For Titlecase, see L</Titlecase>; for +Foldcase, see L</Foldcase>. =head2 TERMINOLOGY @@ -286,6 +320,12 @@ C<\U>, and C<\u>. For Titlecase, see L</Titlecase>. Unicode concept which most often is equal to uppercase, but for certain characters like the German "sharp s" there is a difference. +=head3 Foldcase + +Unicode form that is useful when comparing strings regardless of case, +as certain characters have complex one-to-many case mappings. Primarily a +variant of lowercase. + =head1 AUTHOR Iain Truskett. Updated by the Perl 5 Porters. @@ -339,7 +379,7 @@ debugging. =item * -L<perldebug/"Debugging regular expressions"> +L<perldebug/"Debugging Regular Expressions"> =item * diff --git a/gnu/usr.bin/perl/pod/perlretut.pod b/gnu/usr.bin/perl/pod/perlretut.pod index 0ff743838c4..a3ff6ad28c4 100644 --- a/gnu/usr.bin/perl/pod/perlretut.pod +++ b/gnu/usr.bin/perl/pod/perlretut.pod @@ -41,7 +41,7 @@ you master the first part, you will have all the tools needed to solve about 98% of your needs. The second part of the tutorial is for those comfortable with the basics and hungry for more power tools. It discusses the more advanced regular expression operators and -introduces the latest cutting edge innovations in 5.6.0. +introduces the latest cutting-edge innovations. A note: to save time, 'regular expression' is often abbreviated as regexp or regex.
Regexp is a more natural abbreviation than regex, but @@ -60,7 +60,7 @@ contains that word: "Hello World" =~ /World/; # matches What is this Perl statement all about? C<"Hello World"> is a simple -double quoted string. C<World> is the regular expression and the +double-quoted string. C<World> is the regular expression and the C<//> enclosing C</World/> tells Perl to search a string for a match. The operator C<=~> associates the string with the regexp match and produces a true value if the regexp matched, or false if the regexp @@ -176,7 +176,7 @@ In addition to the metacharacters, there are some ASCII characters which don't have printable character equivalents and are instead represented by I<escape sequences>. Common examples are C<\t> for a tab, C<\n> for a newline, C<\r> for a carriage return and C<\a> for a -bell. If your string is better thought of as a sequence of arbitrary +bell (or alert). If your string is better thought of as a sequence of arbitrary bytes, the octal escape sequence, e.g., C<\033>, or hexadecimal escape sequence, e.g., C<\x1B> may be a more natural representation for your bytes. Here are some examples of escapes: @@ -184,7 +184,8 @@ bytes. Here are some examples of escapes: "1000\t2000" =~ m(0\t2) # matches "1000\n2000" =~ /0\n20/ # matches "1000\t2000" =~ /\000\t2/ # doesn't match, "0" ne "\000" - "cat" =~ /\143\x61\x74/ # matches in ASCII, but a weird way to spell cat + "cat" =~ /\o{143}\x61\x74/ # matches in ASCII, but a weird way + # to spell cat If you've been around Perl a while, all this talk of escape sequences may seem familiar. Similar escape sequences are used in double-quoted @@ -286,7 +287,7 @@ Although one can already do quite a lot with the literal string regexps above, we've only scratched the surface of regular expression technology. 
In this and subsequent sections we will introduce regexp concepts (and associated metacharacter notations) that will allow a -regexp to not just represent a single character sequence, but a I<whole +regexp to represent not just a single character sequence, but a I<whole class> of them. One such concept is that of a I<character class>. A character class @@ -366,8 +367,9 @@ character, or the match fails. Then Now, even C<[0-9]> can be a bother to write multiple times, so in the interest of saving keystrokes and making regexps more readable, Perl has several abbreviations for common character classes, as shown below. -Since the introduction of Unicode, these character classes match more -than just a few characters in the ISO 8859-1 range. +Since the introduction of Unicode, unless the C<//a> modifier is in +effect, these character classes match more than just a few characters in +the ASCII range. =over 4 @@ -401,8 +403,22 @@ but also digits and characters from non-roman scripts The period '.' matches any character but "\n" (unless the modifier C<//s> is in effect, as explained below). +=item * + +\N, like the period, matches any character but "\n", but it does so +regardless of whether the modifier C<//s> is in effect. + =back +The C<//a> modifier, available starting in Perl 5.14, is used to +restrict the matches of \d, \s, and \w to just those in the ASCII range. +It is useful to keep your program from being needlessly exposed to full +Unicode (and its accompanying security considerations) when all you want +is to process English-like text. (The "a" may be doubled, C<//aa>, to +provide even more restrictions, preventing case-insensitive matching of +ASCII with non-ASCII characters; otherwise a Unicode "Kelvin Sign" +would caselessly match a "k" or "K".) + The C<\d\s\w\D\S\W> abbreviations can be used both inside and outside of character classes. Here are some in use: @@ -732,21 +748,21 @@ match). 
=head2 Backreferences Closely associated with the matching variables C<$1>, C<$2>, ... are -the I<backreferences> C<\1>, C<\2>,... Backreferences are simply +the I<backreferences> C<\g1>, C<\g2>,... Backreferences are simply matching variables that can be used I<inside> a regexp. This is a really nice feature; what matches later in a regexp is made to depend on what matched earlier in the regexp. Suppose we wanted to look for doubled words in a text, like 'the the'. The following regexp finds all 3-letter doubles with a space in between: - /\b(\w\w\w)\s\1\b/; + /\b(\w\w\w)\s\g1\b/; -The grouping assigns a value to \1, so that the same 3 letter sequence +The grouping assigns a value to \g1, so that the same 3-letter sequence is used for both parts. A similar task is to find words consisting of two identical parts: - % simple_grep '^(\w\w\w\w|\w\w\w|\w\w|\w)\1$' /usr/dict/words + % simple_grep '^(\w\w\w\w|\w\w\w|\w\w|\w)\g1$' /usr/dict/words beriberi booboo coco @@ -755,27 +771,27 @@ A similar task is to find words consisting of two identical parts: papa The regexp has a single grouping which considers 4-letter -combinations, then 3-letter combinations, etc., and uses C<\1> to look for -a repeat. Although C<$1> and C<\1> represent the same thing, care should be +combinations, then 3-letter combinations, etc., and uses C<\g1> to look for +a repeat. Although C<$1> and C<\g1> represent the same thing, care should be taken to use matched variables C<$1>, C<$2>,... only I<outside> a regexp -and backreferences C<\1>, C<\2>,... only I<inside> a regexp; not doing +and backreferences C<\g1>, C<\g2>,... only I<inside> a regexp; not doing so may lead to surprising and unsatisfactory results. =head2 Relative backreferences Counting the opening parentheses to get the correct number for a -backreference is errorprone as soon as there is more than one +backreference is error-prone as soon as there is more than one capturing group. 
A more convenient technique became available with Perl 5.10: relative backreferences. To refer to the immediately preceding capture group one now may write C<\g{-1}>, the next but last is available via C<\g{-2}>, and so on. Another good reason in addition to readability and maintainability -for using relative backreferences is illustrated by the following example, +for using relative backreferences is illustrated by the following example, where a simple pattern for matching peculiar strings is used: - $a99a = '([a-z])(\d)\2\1'; # matches a11a, g22g, x33x, etc. + $a99a = '([a-z])(\d)\g2\g1'; # matches a11a, g22g, x33x, etc. Now that we have this pattern stored as a handy string, we might feel tempted to use it as a part of some other pattern: @@ -799,18 +815,18 @@ using relative backreferences: =head2 Named backreferences -Perl 5.10 also introduced named capture buffers and named backreferences. +Perl 5.10 also introduced named capture groups and named backreferences. To attach a name to a capturing group, you write either C<< (?<name>...) >> or C<< (?'name'...) >>. The backreference may then be written as C<\g{name}>. It is permissible to attach the same name to more than one group, but then only the leftmost one of the eponymous set can be referenced. Outside of the pattern a named -capture buffer is accessible through the C<%+> hash. +capture group is accessible through the C<%+> hash. Assuming that we have to match calendar dates which may be given in one of the three formats yyyy-mm-dd, mm/dd/yyyy or dd.mm.yyyy, we can write three suitable patterns where we use 'd', 'm' and 'y' respectively as the -names of the buffers capturing the pertaining components of a date. The +names of the groups capturing the pertaining components of a date. 
The matching operation combines the three patterns as alternatives: $fmt1 = '(?<y>\d\d\d\d)-(?<m>\d\d)-(?<d>\d\d)'; @@ -838,7 +854,7 @@ Consider a pattern for matching a time of the day, civil or military style: Processing the results requires an additional if statement to determine whether C<$1> and C<$2> or C<$3> and C<$4> contain the goodies. It would -be easier if we could use buffer numbers 1 and 2 in second alternative as +be easier if we could use group numbers 1 and 2 in second alternative as well, and this is exactly what the parenthesized construct C<(?|...)>, set around an alternative achieves. Here is an extended version of the previous pattern: @@ -847,7 +863,7 @@ previous pattern: print "hour=$1 minute=$2 zone=$3\n"; } -Within the alternative numbering group, buffer numbers start at the same +Within the alternative numbering group, group numbers start at the same position for each alternative. After the group, numbering continues with one higher than the maximum reached across all the alternatives. @@ -896,15 +912,18 @@ C<@+> instead: $& is the same as substr( $x, $-[0], $+[0]-$-[0] ) $' is the same as substr( $x, $+[0] ) +As of Perl 5.10, the C<${^PREMATCH}>, C<${^MATCH}> and C<${^POSTMATCH}> +variables may be used. These are only set if the C</p> modifier is present. +Consequently they do not penalize the rest of the program. =head2 Non-capturing groupings A group that is required to bundle a set of alternatives may or may not be useful as a capturing group. If it isn't, it just creates a superfluous -addition to the set of available capture buffer values, inside as well as +addition to the set of available capture group values, inside as well as outside the regexp. Non-capturing groupings, denoted by C<(?:regexp)>, still allow the regexp to be treated as a single unit, but don't establish -a capturing buffer at the same time. Both capturing and non-capturing +a capturing group at the same time. 
Both capturing and non-capturing groupings are allowed to co-exist in the same regexp. Because there is no extraction, non-capturing groupings are faster than capturing groupings. Non-capturing groupings are also handy for choosing exactly @@ -924,7 +943,7 @@ elements gathered from a split operation where parentheses are required for some reason: $x = '12aba34ba5'; - @num = split /(a|b)+/, $x; # @num = ('12','a','34','b','5') + @num = split /(a|b)+/, $x; # @num = ('12','a','34','a','5') @num = split /(?:a|b)+/, $x; # @num = ('12','34','5') @@ -976,15 +995,16 @@ Here are some examples: /[a-z]+\s+\d*/; # match a lowercase word, at least one space, and # any number of digits - /(\w+)\s+\1/; # match doubled words of arbitrary length + /(\w+)\s+\g1/; # match doubled words of arbitrary length /y(es)?/i; # matches 'y', 'Y', or a case-insensitive 'yes' - $year =~ /\d{2,4}/; # make sure year is at least 2 but not more - # than 4 digits - $year =~ /\d{4}|\d{2}/; # better match; throw out 3 digit dates - $year =~ /\d{2}(\d{2})?/; # same thing written differently. However, - # this produces $1 and the other does not. - - % simple_grep '^(\w+)\1$' /usr/dict/words # isn't this easier? + $year =~ /^\d{2,4}$/; # make sure year is at least 2 but not more + # than 4 digits + $year =~ /^\d{4}$|^\d{2}$/; # better match; throw out 3-digit dates + $year =~ /^\d{2}(\d{2})?$/; # same thing written differently. However, + # this captures the last two digits in $1 + # and the other does not. + + % simple_grep '^(\w+)\g1$' /usr/dict/words # isn't this easier? beriberi booboo coco @@ -1017,9 +1037,9 @@ stop there, but that wouldn't give the longest possible string to the first quantifier C<.*>. Instead, the first quantifier C<.*> grabs as much of the string as possible while still having the regexp match. In this example, that means having the C<at> sequence with the final C<at> -in the string. The other important principle illustrated here is that +in the string. 
The other important principle illustrated here is that, when there are two or more elements in a regexp, the I<leftmost> -quantifier, if there is one, gets to grab as much the string as +quantifier, if there is one, gets to grab as much of the string as possible, leaving the rest of the regexp to fight over scraps. Thus in our example, the first quantifier C<.*> grabs most of the string, while the second quantifier C<.*> gets the empty string. Quantifiers that @@ -1417,7 +1437,7 @@ we can rewrite our 'extended' regexp in the more pleasing form If whitespace is mostly irrelevant, how does one include space characters in an extended regexp? The answer is to backslash it S<C<'\ '>> or put it in a character class S<C<[ ]>>. The same thing -goes for pound signs, use C<\#> or C<[#]>. For instance, Perl allows +goes for pound signs: use C<\#> or C<[#]>. For instance, Perl allows a space between the sign and the mantissa or integer, and we could add this to our regexp as follows: @@ -1496,31 +1516,6 @@ single line C<//s>, multi-line C<//m>, case-insensitive C<//i> and extended C<//x> modifiers. There are a few more things you might want to know about matching operators. -=head3 Optimizing pattern evaluation - -We pointed out earlier that variables in regexps are substituted -before the regexp is evaluated: - - $pattern = 'Seuss'; - while (<>) { - print if /$pattern/; - } - -This will print any lines containing the word C<Seuss>. It is not as -efficient as it could be, however, because Perl has to re-evaluate -(or compile) C<$pattern> each time through the loop. 
If C<$pattern> won't be -changing over the lifetime of the script, we can add the C<//o> -modifier, which directs Perl to only perform variable substitutions -once: - - #!/usr/bin/perl - # Improved simple_grep - $regexp = shift; - while (<>) { - print if /$regexp/o; # a good deal faster - } - - =head3 Prohibiting substitution If you change C<$pattern> after the first substitution happens, Perl @@ -1542,11 +1537,12 @@ the regexp in the I<last successful match> is used instead. So we have =head3 Global matching -The final two modifiers C<//g> and C<//c> concern multiple matches. +The final two modifiers we will discuss here, +C<//g> and C<//c>, concern multiple matches. The modifier C<//g> stands for global matching and allows the matching operator to match within a string as many times as possible. In scalar context, successive invocations against a string will have -`C<//g> jump from match to match, keeping track of position in the +C<//g> jump from match to match, keeping track of position in the string as it goes along. You can get or set the position with the C<pos()> function. @@ -1587,9 +1583,9 @@ there are no groupings, a list of matches to the whole regexp. So if we wanted just the words, we could use @words = ($x =~ /(\w+)/g); # matches, - # $word[0] = 'cat' - # $word[1] = 'dog' - # $word[2] = 'house' + # $words[0] = 'cat' + # $words[1] = 'dog' + # $words[2] = 'house' Closely associated with the C<//g> modifier is the C<\G> anchor. The C<\G> anchor matches at the point where the previous C<//g> match left @@ -1613,7 +1609,7 @@ bit at a time and use arbitrary Perl logic to decide what to do next. Currently, the C<\G> anchor is only fully supported when used to anchor to the start of the pattern. -C<\G> is also invaluable in processing fixed length records with +C<\G> is also invaluable in processing fixed-length records with regexps. 
Suppose we have a snippet of coding region DNA, encoded as base pair letters C<ATCGTTGAAT...> and we want to find all the stop codons C<TGA>. In a coding region, codons are 3-letter sequences, so @@ -1657,6 +1653,10 @@ which is the correct answer. This example illustrates that it is important not only to match what is desired, but to reject what is not desired. +(There are other regexp modifiers that are available, such as +C<//o>, but their specialized uses are beyond the +scope of this introduction.) + =head3 Search and replace Regular expressions also play a big role in I<search and replace> @@ -1664,11 +1664,11 @@ operations in Perl. Search and replace is accomplished with the C<s///> operator. The general form is C<s/regexp/replacement/modifiers>, with everything we know about regexps and modifiers applying in this case as well. The -C<replacement> is a Perl double quoted string that replaces in the +C<replacement> is a Perl double-quoted string that replaces in the string whatever is matched with the C<regexp>. The operator C<=~> is also used here to associate a string with C<s///>. If matching against C<$_>, the S<C<$_ =~>> can be dropped. If there is a match, -C<s///> returns the number of substitutions made, otherwise it returns +C<s///> returns the number of substitutions made; otherwise it returns false. Here are a few examples: $x = "Time to feed the cat!"; @@ -1682,7 +1682,7 @@ false. Here are a few examples: In the last example, the whole string was matched, but only the part inside the single quotes was grouped. With the C<s///> operator, the -matched variables C<$1>, C<$2>, etc. are immediately available for use +matched variables C<$1>, C<$2>, etc. are immediately available for use in the replacement expression, so we use C<$1> to replace the quoted string with just what was quoted.
With the global modifier, C<s///g> will search and replace all occurrences of the regexp in the string: @@ -1702,7 +1702,7 @@ the following program to replace it: $regexp = shift; $replacement = shift; while (<>) { - s/$regexp/$replacement/go; + s/$regexp/$replacement/g; print; } ^D @@ -1710,13 +1710,41 @@ the following program to replace it: % simple_replace regexp regex perlretut.pod In C<simple_replace> we used the C<s///g> modifier to replace all -occurrences of the regexp on each line and the C<s///o> modifier to -compile the regexp only once. As with C<simple_grep>, both the -C<print> and the C<s/$regexp/$replacement/go> use C<$_> implicitly. +occurrences of the regexp on each line. (Even though the regular +expression appears in a loop, Perl is smart enough to compile it +only once.) As with C<simple_grep>, both the +C<print> and the C<s/$regexp/$replacement/g> use C<$_> implicitly. + +If you don't want C<s///> to change your original variable you can use +the non-destructive substitute modifier, C<s///r>. This changes the +behavior so that C<s///r> returns the final substituted string +(instead of the number of substitutions): + + $x = "I like dogs."; + $y = $x =~ s/dogs/cats/r; + print "$x $y\n"; + +That example will print "I like dogs. I like cats". Notice the original +C<$x> variable has not been affected. The overall +result of the substitution is instead stored in C<$y>. If the +substitution doesn't affect anything then the original string is +returned: + + $x = "I like dogs."; + $y = $x =~ s/elephants/cougars/r; + print "$x $y\n"; # prints "I like dogs. I like dogs." + +One other interesting thing that the C<s///r> flag allows is chaining +substitutions: + + $x = "Cats are great."; + print $x =~ s/Cats/Dogs/r =~ s/Dogs/Frogs/r =~ s/Frogs/Hedgehogs/r, "\n"; + # prints "Hedgehogs are great." A modifier available specifically to search and replace is the -C<s///e> evaluation modifier. 
C<s///e> wraps an C<eval{...}> around -the replacement string and the evaluated result is substituted for the +C<s///e> evaluation modifier. C<s///e> treats the +replacement text as Perl code, rather than a double-quoted +string. The value that the code returns is substituted for the matched substring. C<s///e> is useful if you need to do a bit of computation in the process of replacing text. This example counts character frequencies in a line: @@ -1740,8 +1768,9 @@ This prints As with the match C<m//> operator, C<s///> can use other delimiters, such as C<s!!!> and C<s{}{}>, and even C<s{}//>. If single quotes are -used C<s'''>, then the regexp and replacement are treated as single -quoted strings and there are no substitutions. C<s///> in list context +used C<s'''>, then the regexp and replacement are +treated as single-quoted strings and there are no +variable substitutions. C<s///> in list context returns the same thing as in scalar context, i.e., the number of matches. @@ -1783,7 +1812,7 @@ an empty initial element to the list. If you have read this far, congratulations! You now have all the basic tools needed to use regular expressions to solve a wide range of text processing problems. If this is your first time through the tutorial, -why not stop here and play around with regexps a while... S<Part 2> +why not stop here and play around with regexps a while.... S<Part 2> concerns the more esoteric aspects of regular expressions and those concepts certainly aren't needed right at the start. @@ -1798,7 +1827,7 @@ too often on a hike, but when we are stuck, they can be invaluable. What follows are the more advanced, less used, or sometimes esoteric capabilities of Perl regexps. In Part 2, we will assume you are -comfortable with the basics and concentrate on the new features. +comfortable with the basics and concentrate on the advanced features. 
=head2 More on characters, strings, and character classes @@ -1839,21 +1868,27 @@ instance, It does not protect C<$> or C<@>, so that variables can still be substituted. +C<\Q>, C<\L>, C<\l>, C<\U>, C<\u> and C<\E> are actually part of +double-quotish syntax, and not part of regexp syntax proper. They will +work if they appear in a regular expression embedded directly in a +program, but not when contained in a string that is interpolated in a +pattern. + With the advent of 5.6.0, Perl regexps can handle more than just the standard ASCII character set. Perl now supports I<Unicode>, a standard for representing the alphabets from virtually all of the world's written languages, and a host of symbols. Perl's text strings are Unicode strings, so they can contain characters with a value (codepoint or character number) higher -than 255 +than 255. What does this mean for regexps? Well, regexp users don't need to know much about Perl's internal representation of strings. But they do need to know 1) how to represent Unicode characters in a regexp and 2) that a matching operation will treat the string to be searched as a sequence of characters, not bytes. The answer to 1) is that Unicode characters -greater than C<chr(255)> are represented using the C<\x{hex}> notation, -because the \0 octal and \x hex (without curly braces) don't go further -than 255. +greater than C<chr(255)> are represented using the C<\x{hex}> notation, because +\x hex (without curly braces) doesn't go further than 255. (Starting in Perl +5.14, if you're an octal fan, you can also use C<\o{oct}>.) /\x{263a}/; # match a Unicode smiley face :) @@ -1872,30 +1907,36 @@ specified in the Unicode standard. 
For instance, if we wanted to represent or match the astrological sign for the planet Mercury, we could use - use charnames ":full"; # use named chars with Unicode full names $x = "abc\N{MERCURY}def"; $x =~ /\N{MERCURY}/; # matches -One can also use short names or restrict names to a certain alphabet: +One can also use "short" names: - use charnames ':full'; print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n"; - - use charnames ":short"; print "\N{greek:Sigma} is an upper-case sigma.\n"; +You can also restrict names to a certain alphabet by specifying the +L<charnames> pragma: + use charnames qw(greek); print "\N{sigma} is Greek sigma\n"; -A list of full names is found in the file NamesList.txt in the -lib/perl5/X.X.X/unicore directory (where X.X.X is the perl -version number as it is installed on your system). - -The answer to requirement 2), as of 5.6.0, is that a regexp uses Unicode -characters. Internally, this is encoded to bytes using either UTF-8 or a -native 8 bit encoding, depending on the history of the string, but -conceptually it is a sequence of characters, not bytes. See -L<perlunitut> for a tutorial about that. +An index of character names is available on-line from the Unicode +Consortium, L<http://www.unicode.org/charts/charindex.html>; explanatory +material with links to other resources at +L<http://www.unicode.org/standard/where>. + +The answer to requirement 2) is, as of 5.6.0, that a regexp (mostly) +uses Unicode characters. (The "mostly" is for messy backward +compatibility reasons, but starting in Perl 5.14, any regex compiled in +the scope of a C<use feature 'unicode_strings'> (which is automatically +turned on within the scope of a C<use 5.012> or higher) will turn that +"mostly" into "always". If you want to handle Unicode properly, you +should ensure that C<'unicode_strings'> is turned on.) 
+Internally, this is encoded to bytes using either UTF-8 or a native 8 +bit encoding, depending on the history of the string, but conceptually +it is a sequence of characters, not bytes. See L<perlunitut> for a +tutorial about that. Let us now discuss Unicode character classes. Just as with Unicode characters, there are named Unicode character classes represented by the @@ -1903,13 +1944,14 @@ C<\p{name}> escape sequence. Closely associated is the C<\P{name}> character class, which is the negation of the C<\p{name}> class. For example, to match lower and uppercase characters, - use charnames ":full"; # use named chars with Unicode full names $x = "BOB"; $x =~ /^\p{IsUpper}/; # matches, uppercase char class $x =~ /^\P{IsUpper}/; # doesn't match, char class sans uppercase $x =~ /^\p{IsLower}/; # doesn't match, lowercase char class $x =~ /^\P{IsLower}/; # matches, char class sans lowercase +(The "Is" is optional.) + Here is the association between some Perl named classes and the traditional Unicode classes: @@ -1931,21 +1973,18 @@ traditional Unicode classes: IsWord /^[LMN]/ || $code eq "005F" IsXDigit $code =~ /^00(3[0-9]|[46][1-6])$/ -You can also use the official Unicode class names with the C<\p> and -C<\P>, like C<\p{L}> for Unicode 'letters', or C<\p{Lu}> for uppercase +You can also use the official Unicode class names with C<\p> and +C<\P>, like C<\p{L}> for Unicode 'letters', C<\p{Lu}> for uppercase letters, or C<\P{Nd}> for non-digits. If a C<name> is just one letter, the braces can be dropped. For instance, C<\pM> is the character class of Unicode 'marks', for example accent marks. For the full list see L<perlunicode>. -The Unicode has also been separated into various sets of characters +Unicode has also been separated into various sets of characters which you can test with C<\p{...}> (in) and C<\P{...}> (not in). 
To test whether a character is (or is not) an element of a script you would use the script name, for example C<\p{Latin}>, C<\p{Greek}>, -or C<\P{Katakana}>. Other sets are the Unicode blocks, the names -of which begin with "In". One such block is dedicated to mathematical -operators, and its pattern formula is <C\p{InMathematicalOperators>}>. -For the full list see L<perluniprops>. +or C<\P{Katakana}>. What we have described so far is the single form of the C<\p{...}> character classes. There is also a compound form which you may run into. These @@ -1959,7 +1998,7 @@ never have to use the compound forms, but sometimes it is necessary, and their use can make your code easier to understand. C<\X> is an abbreviation for a character class that comprises -a Unicode I<extended grapheme cluster>. This represents a "logical character", +a Unicode I<extended grapheme cluster>. This represents a "logical character": what appears to be a single character, but may be represented internally by more than one. As an example, using the Unicode full names, e.g., S<C<A + COMBINING RING>> is a grapheme cluster with base character C<A> and combining character @@ -1969,27 +2008,27 @@ as in the word Angstrom. For the full and latest information about Unicode see the latest Unicode standard, or the Unicode Consortium's website L<http://www.unicode.org> -As if all those classes weren't enough, Perl also defines POSIX style +As if all those classes weren't enough, Perl also defines POSIX-style character classes. These have the form C<[:name:]>, with C<name> the name of the POSIX class. The POSIX classes are C<alpha>, C<alnum>, C<ascii>, C<cntrl>, C<digit>, C<graph>, C<lower>, C<print>, C<punct>, C<space>, C<upper>, and C<xdigit>, and two extensions, C<word> (a Perl -extension to match C<\w>), and C<blank> (a GNU extension). If C<utf8> -is being used, then these classes are defined the same as their -corresponding Perl Unicode classes: C<[:upper:]> is the same as -C<\p{IsUpper}>, etc. 
The POSIX character classes, however, don't -require using C<utf8>. The C<[:digit:]>, C<[:word:]>, and +extension to match C<\w>), and C<blank> (a GNU extension). The C<//a> +modifier restricts these to matching just in the ASCII range; otherwise +they can match the same as their corresponding Perl Unicode classes: +C<[:upper:]> is the same as C<\p{IsUpper}>, etc. (There are some +exceptions and gotchas with this; see L<perlrecharclass> for a full +discussion.) The C<[:digit:]>, C<[:word:]>, and C<[:space:]> correspond to the familiar C<\d>, C<\w>, and C<\s> character classes. To negate a POSIX class, put a C<^> in front of -the name, so that, e.g., C<[:^digit:]> corresponds to C<\D> and under -C<utf8>, C<\P{IsDigit}>. The Unicode and POSIX character classes can +the name, so that, e.g., C<[:^digit:]> corresponds to C<\D> and, under +Unicode, C<\P{IsDigit}>. The Unicode and POSIX character classes can be used just like C<\d>, with the exception that POSIX character classes can only be used inside of a character class: /\s+[abc[:digit:]xyz]\s*/; # match a,b,c,x,y,z, or a digit /^=item\s[[:digit:]]/; # match '=item', # followed by a space and a digit - use charnames ":full"; /\s+[abc\p{IsDigit}xyz]\s+/; # match a,b,c,x,y,z, or a digit /^=item\s\p{IsDigit}/; # match '=item', # followed by a space and a digit @@ -1998,8 +2037,8 @@ Whew! That is all the rest of the characters and character classes. =head2 Compiling and saving regular expressions -In Part 1 we discussed the C<//o> modifier, which compiles a regexp -just once. This suggests that a compiled regexp is some data structure +In Part 1 we mentioned that Perl compiles a regexp into a compact +sequence of opcodes. Thus, a compiled regexp is a data structure that can be stored once and used again and again. 
The regexp quote C<qr//> does exactly that: C<qr/string/> compiles the C<string> as a regexp and transforms the result into a form that can be assigned to a @@ -2074,7 +2113,7 @@ multiple patterns: $pattern = join '|', @regexp; while ($line = <>) { - print $line if $line =~ /$pattern/o; + print $line if $line =~ /$pattern/; } ^D @@ -2128,8 +2167,8 @@ Starting with this section, we will be discussing Perl's set of I<extended patterns>. These are extensions to the traditional regular expression syntax that provide powerful new tools for pattern matching. We have already seen extensions in the form of the minimal -matching constructs C<??>, C<*?>, C<+?>, C<{n,m}?>, and C<{n,}?>. The -rest of the extensions below have the form C<(?char...)>, where the +matching constructs C<??>, C<*?>, C<+?>, C<{n,m}?>, and C<{n,}?>. Most +of the extensions below have the form C<(?char...)>, where the C<char> is a character that determines the type of extension. The first extension is an embedded comment C<(?#text)>. This embeds a @@ -2142,7 +2181,7 @@ example is This style of commenting has been largely superseded by the raw, freeform commenting that is allowed with the C<//x> modifier. -The modifiers C<//i>, C<//m>, C<//s> and C<//x> (or any +Most modifiers, such as C<//i>, C<//m>, C<//s> and C<//x> (or any combination thereof) can also be embedded in a regexp using C<(?i)>, C<(?m)>, C<(?s)>, and C<(?x)>. For instance, @@ -2200,8 +2239,8 @@ we have seen so far are the anchors. The anchor C<^> matches the beginning of the line, but doesn't eat any characters. Similarly, the word boundary anchor C<\b> matches wherever a character matching C<\w> is next to a character that doesn't, but it doesn't eat up any -characters itself. Anchors are examples of I<zero-width assertions>. -Zero-width, because they consume +characters itself. 
Anchors are examples of I<zero-width assertions>: +zero-width, because they consume no characters, and assertions, because they test some property of the string. In the context of our walk in the woods analogy to regexp matching, most regexp elements move us along a trail, but anchors have @@ -2347,9 +2386,9 @@ matched, otherwise the C<no-regexp> will be matched. The C<condition> can have several forms. The first form is simply an integer in parentheses C<(integer)>. It is true if the corresponding backreference C<\integer> matched earlier in the regexp. The same -thing can be done with a name associated with a capture buffer, written +thing can be done with a name associated with a capture group, written as C<< (<name>) >> or C<< ('name') >>. The second form is a bare -zero width assertion C<(?...)>, either a lookahead, a lookbehind, or a +zero-width assertion C<(?...)>, either a lookahead, a lookbehind, or a code assertion (discussed in the next section). The third set of forms provides tests that return true if the expression is executed within a recursion (C<(R)>) or is being called from some capturing group, @@ -2360,7 +2399,7 @@ The integer or name form of the C<condition> allows us to choose, with more flexibility, what to match based on what matched earlier in the regexp. This searches for words of the form C<"$x$x"> or C<"$x$y$y$x">: - % simple_grep '^(\w+)(\w+)?(?(2)\2\1|\1)$' /usr/dict/words + % simple_grep '^(\w+)(\w+)?(?(2)\g2\g1|\g1)$' /usr/dict/words beriberi coco couscous @@ -2441,8 +2480,8 @@ have the full pattern: In C<(?...)> both absolute and relative backreferences may be used. The entire pattern can be reinserted with C<(?R)> or C<(?0)>. -If you prefer to name your buffers, you can use C<(?&name)> to -recurse into that buffer. +If you prefer to name your groups, you can use C<(?&name)> to +recurse into that group. =head2 A bit of magic: executing Perl code in a regular expression @@ -2683,28 +2722,29 @@ detailed description. 
Below is just one example, illustrating the control verb C<(*FAIL)>, which may be abbreviated as C<(*F)>. If this is inserted in a regexp -it will cause to fail, just like at some mismatch between the pattern -and the string. Processing of the regexp continues like after any "normal" +it will cause it to fail, just as it would at some +mismatch between the pattern and the string. Processing +of the regexp continues as it would after any "normal" failure, so that, for instance, the next position in the string or another alternative will be tried. As failing to match doesn't preserve capture -buffers or produce results, it may be necessary to use this in +groups or produce results, it may be necessary to use this in combination with embedded code. %count = (); - "supercalifragilisticexpialidoceous" =~ - /([aeiou])(?{ $count{$1}++; })(*FAIL)/oi; + "supercalifragilisticexpialidocious" =~ + /([aeiou])(?{ $count{$1}++; })(*FAIL)/i; printf "%3d '%s'\n", $count{$_}, $_ for (sort keys %count); The pattern begins with a class matching a subset of letters. Whenever this matches, a statement like C<$count{'a'}++;> is executed, incrementing the letter's counter. Then C<(*FAIL)> does what it says, and -the regexp engine proceeds according to the book: as long as the end of -the string hasn't been reached, the position is advanced before looking +the regexp engine proceeds according to the book: as long as the end of +the string hasn't been reached, the position is advanced before looking for another vowel. Thus, match or no match makes no difference, and the regexp engine proceeds until the entire string has been inspected. (It's remarkable that an alternative solution using something like - $count{lc($_)}++ for split('', "supercalifragilisticexpialidoceous"); + $count2{lc($_)}++ for split('', "supercalifragilisticexpialidocious"); printf "%3d '%s'\n", $count2{$_}, $_ for ( qw{ a e i o u } ); is considerably slower.) @@ -2730,6 +2770,15 @@ performing some other processing.
Both C<taint> and C<eval> pragmas are lexically scoped, which means they are in effect only until the end of the block enclosing the pragmas. + use re '/m'; # or any other flags + $multiline_string =~ /^foo/; # /m is implied + +The C<re '/flags'> pragma (introduced in Perl +5.14) turns on the given regular expression flags +until the end of the lexical scope. See +L<re/"'E<sol>flags' mode"> for more +detail. + use re 'debug'; /^(.*)$/s; # output debugging info @@ -2743,7 +2792,7 @@ information is displayed in color on terminals that can display termcap color sequences. Here is example output: % perl -e 'use re "debug"; "abc" =~ /a*b+c/;' - Compiling REx `a*b+c' + Compiling REx 'a*b+c' size 9 first at 1 1: STAR(4) 2: EXACT <a>(0) @@ -2751,11 +2800,11 @@ termcap color sequences. Here is example output: 5: EXACT <b>(0) 7: EXACT <c>(9) 9: END(0) - floating `bc' at 0..2147483647 (checking floating) minlen 2 - Guessing start of match, REx `a*b+c' against `abc'... - Found floating substr `bc' at offset 1... + floating 'bc' at 0..2147483647 (checking floating) minlen 2 + Guessing start of match, REx 'a*b+c' against 'abc'... + Found floating substr 'bc' at offset 1... Guessed: match at offset 0 - Matching REx `a*b+c' against `abc' + Matching REx 'a*b+c' against 'abc' Setting an EVAL scope, savestack=3 0 <> <abc> | 1: STAR EXACT <a> can match 1 times out of 32767... @@ -2766,13 +2815,13 @@ termcap color sequences. Here is example output: 2 <ab> <c> | 7: EXACT <c> 3 <abc> <> | 9: END Match successful! - Freeing REx: `a*b+c' + Freeing REx: 'a*b+c' If you have gotten this far into the tutorial, you can probably guess what the different parts of the debugging output tell you. The first part - Compiling REx `a*b+c' + Compiling REx 'a*b+c' size 9 first at 1 1: STAR(4) 2: EXACT <a>(0) @@ -2786,15 +2835,15 @@ starred object, in this case C<'a'>, and if it matches, goto line 4, i.e., C<PLUS(7)>. 
The middle lines describe some heuristics and optimizations performed before a match: - floating `bc' at 0..2147483647 (checking floating) minlen 2 - Guessing start of match, REx `a*b+c' against `abc'... - Found floating substr `bc' at offset 1... + floating 'bc' at 0..2147483647 (checking floating) minlen 2 + Guessing start of match, REx 'a*b+c' against 'abc'... + Found floating substr 'bc' at offset 1... Guessed: match at offset 0 Then the match is executed and the remaining lines describe the process: - Matching REx `a*b+c' against `abc' + Matching REx 'a*b+c' against 'abc' Setting an EVAL scope, savestack=3 0 <> <abc> | 1: STAR EXACT <a> can match 1 times out of 32767... @@ -2805,13 +2854,13 @@ process: 2 <ab> <c> | 7: EXACT <c> 3 <abc> <> | 9: END Match successful! - Freeing REx: `a*b+c' + Freeing REx: 'a*b+c' Each step is of the form S<C<< n <x> <y> >>>, with C<< <x> >> the part of the string matched and C<< <y> >> the part not yet matched. The S<C<< | 1: STAR >>> says that Perl is at line number 1 -n the compilation list above. See -L<perldebguts/"Debugging regular expressions"> for much more detail. +in the compilation list above. See +L<perldebguts/"Debugging Regular Expressions"> for much more detail. An alternative method of debugging regexps is to embed C<print> statements within the regexp. This provides a blow-by-blow account of diff --git a/gnu/usr.bin/perl/pod/perlsource.pod b/gnu/usr.bin/perl/pod/perlsource.pod new file mode 100644 index 00000000000..16252eb3f07 --- /dev/null +++ b/gnu/usr.bin/perl/pod/perlsource.pod @@ -0,0 +1,223 @@ +=encoding utf8 + +=for comment +Consistent formatting of this file is achieved with: + perl ./Porting/podtidy pod/perlsource.pod + +=head1 NAME + +perlsource - A guide to the Perl source tree + +=head1 DESCRIPTION + +This document describes the layout of the Perl source tree. If you're +hacking on the Perl core, this will help you find what you're looking +for. 
+ +=head1 FINDING YOUR WAY AROUND + +The Perl source tree is big. Here's some of the things you'll find in +it: + +=head2 C code + +The C source code and header files mostly live in the root of the +source tree. There are a few platform-specific directories which +contain C code. In addition, some of the modules shipped with Perl +include C or XS code. + +See L<perlinterp> for more details on the files that make up the Perl +interpreter, as well as details on how it works. + +=head2 Core modules + +Modules shipped as part of the Perl core live in four subdirectories. +Two of these directories contain modules that live in the core, and two +contain modules that can also be released separately on CPAN. Modules +which can be released on CPAN are known as "dual-life" modules. + +=over 4 + +=item * F<lib/> + +This directory contains pure-Perl modules which are only released as +part of the core. This directory contains I<all> of the modules and +their tests, unlike other core modules. + +=item * F<ext/> + +This directory contains XS-using modules which are only released as +part of the core. These modules generally have their own F<Makefile.PL> and +are laid out more like a typical CPAN module. + +=item * F<dist/> + +This directory is for dual-life modules where the blead source is +canonical. Note that some modules in this directory may not yet have +been released separately on CPAN. + +=item * F<cpan/> + +This directory contains dual-life modules where the CPAN module is +canonical. Do not patch these modules directly! Changes to these +modules should be submitted to the maintainer of the CPAN module. Once +those changes are applied and released, the new version of the module +will be incorporated into the core. + +=back + +For some dual-life modules, it has not yet been determined if the CPAN +version or the blead source is canonical. Until that is done, those +modules should be in F<cpan/>. + +=head2 Tests + +The Perl core has an extensive test suite.
If you add new tests (or new +modules with tests), you may need to update the F<t/TEST> file so that +the tests are run. + +=over 4 + +=item * Module tests + +Tests for core modules in the F<lib/> directory are right next to the +module itself. For example, we have F<lib/strict.pm> and +F<lib/strict.t>. + +Tests for modules in F<ext/> and the dual-life modules are in F<t/> +subdirectories for each module, like a standard CPAN distribution. + +=item * F<t/base/> + +Tests for the absolute basic functionality of Perl. This includes +C<if>, basic file reads and writes, simple regexes, etc. These are run +first in the test suite and if any of them fail, something is I<really> +broken. + +=item * F<t/cmd/> + +Tests for basic control structures, C<if/else>, C<while>, subroutines, +etc. + +=item * F<t/comp/> + +Tests for basic issues of how Perl parses and compiles itself. + +=item * F<t/io/> + +Tests for built-in IO functions, including command line arguments. + +=item * F<t/mro/> + +Tests for perl's method resolution order implementations (see L<mro>). + +=item * F<t/op/> + +Tests for perl's built-in functions that don't fit into any of the +other directories. + +=item * F<t/re/> + +Tests for regex related functions or behaviour. (These used to live in +t/op). + +=item * F<t/run/> + +Tests for features of how perl actually runs, including exit codes and +handling of PERL* environment variables. + +=item * F<t/uni/> + +Tests for the core support of Unicode. + +=item * F<t/win32/> + +Windows-specific tests. + +=item * F<t/porting/> + +Tests the state of the source tree for various common errors. For +example, it tests that everyone who is listed in the git log has a +corresponding entry in the F<AUTHORS> file. + +=item * F<t/lib/> + +The old home for the module tests; you shouldn't put anything new in +here. There are still some bits and pieces hanging around in here that +need to be moved. Perhaps you could move them? Thanks!
+ +=item * F<t/x2p> + +A test suite for the s2p converter. + +=back + +=head2 Documentation + +All of the core documentation intended for end users lives in F<pod/>. +Individual modules in F<lib/>, F<ext/>, F<dist/>, and F<cpan/> usually +have their own documentation, either in the F<Module.pm> file or an +accompanying F<Module.pod> file. + +Finally, documentation intended for core Perl developers lives in the +F<Porting/> directory. + +=head2 Hacking tools and documentation + +The F<Porting> directory contains a grab bag of code and documentation +intended to help porters work on Perl. Some of the highlights include: + +=over 4 + +=item * F<check*> + +These are scripts which will check the source for things like ANSI C +violations, POD encoding issues, etc. + +=item * F<Maintainers>, F<Maintainers.pl>, and F<Maintainers.pm> + +These files contain information on who maintains which modules. Run +C<perl Porting/Maintainers -M Module::Name> to find out more +information about a dual-life module. + +=item * F<podtidy> + +Tidies a pod file. It's a good idea to run this on a pod file you've +patched. + +=back + +=head2 Build system + +The Perl build system starts with the F<Configure> script in the root +directory. + +Platform-specific pieces of the build system also live in +platform-specific directories like F<win32/>, F<vms/>, etc. + +The F<Configure> script is ultimately responsible for generating a +F<Makefile>. + +The build system that Perl uses is called metaconfig. This system is +maintained separately from the Perl core. + +The metaconfig system has its own git repository. Please see its README +file in L<http://perl5.git.perl.org/metaconfig.git/> for more details. + +The F<Cross> directory contains various files related to +cross-compiling Perl. See F<Cross/README> for more details. + +=head2 F<AUTHORS> + +This file lists everyone who's contributed to Perl. If you submit a +patch, you should add your name to this file as part of the patch.
+ +=head2 F<MANIFEST> + +The F<MANIFEST> file in the root of the source tree contains a list of +every file in the Perl core, as well as a brief description of each +file. + +You can get an overview of all the files with this command: + + % perl -lne 'print if /^[^\/]+\.[ch]\s+/' MANIFEST diff --git a/gnu/usr.bin/perl/pod/perltodo.pod b/gnu/usr.bin/perl/pod/perltodo.pod index 0a03bf41752..524a501681f 100644 --- a/gnu/usr.bin/perl/pod/perltodo.pod +++ b/gnu/usr.bin/perl/pod/perltodo.pod @@ -4,1299 +4,7 @@ perltodo - Perl TO-DO List =head1 DESCRIPTION -This is a list of wishes for Perl. The most up to date version of this file -is at http://perl5.git.perl.org/perl.git/blob_plain/HEAD:/pod/perltodo.pod - -The tasks we think are smaller or easier are listed first. Anyone is welcome -to work on any of these, but it's a good idea to first contact -I<perl5-porters@perl.org> to avoid duplication of effort, and to learn from -any previous attempts. By all means contact a pumpking privately first if you -prefer. - -Whilst patches to make the list shorter are most welcome, ideas to add to -the list are also encouraged. Check the perl5-porters archives for past -ideas, and any discussion about them. One set of archives may be found at: - - http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/ - -What can we offer you in return? Fame, fortune, and everlasting glory? Maybe -not, but if your patch is incorporated, then we'll add your name to the -F<AUTHORS> file, which ships in the official distribution. How many other -programming languages offer you 1 line of immortality? - -=head1 Tasks that only need Perl knowledge - -=head2 Improve Porting/cmpVERSION.pl to work from git tags - -See F<Porting/release_managers_guide.pod> for a bit more detail. - -=head2 Migrate t/ from custom TAP generation - -Many tests below F<t/> still generate TAP by "hand", rather than using library -functions. 
As explained in L<perlhack/Writing a test>, tests in F<t/> are -written in a particular way to test that more complex constructions actually -work before using them routinely. Hence they don't use C<Test::More>, but -instead there is an intentionally simpler library, F<t/test.pl>. However, -quite a few tests in F<t/> have not been refactored to use it. Refactoring -any of these tests, one at a time, is a useful thing TODO. - -The subdirectories F<base>, F<cmd> and F<comp>, that contain the most -basic tests, should be excluded from this task. - -=head2 Test that regen.pl was run - -There are various generated files shipped with the perl distribution, for -things like header files generate from data. The generation scripts are -written in perl, and all can be run by F<regen.pl>. However, because they're -written in perl, we can't run them before we've built perl. We can't run them -as part of the F<Makefile>, because changing files underneath F<make> confuses -it completely, and we don't want to run them automatically anyway, as they -change files shipped by the distribution, something we seek not do to. - -If someone changes the data, but forgets to re-run F<regen.pl> then the -generated files are out of sync. It would be good to have a test in -F<t/porting> that checks that the generated files are in sync, and fails -otherwise, to alert someone before they make a poor commit. I suspect that this -would require adapting the scripts run from F<regen.pl> to have dry-run -options, and invoking them with these, or by refactoring them into a library -that does the generation, which can be called by the scripts, and by the test. - -=head2 Automate perldelta generation - -The perldelta file accompanying each release summaries the major changes. 
-It's mostly manually generated currently, but some of that could be -automated with a bit of perl, specifically the generation of - -=over - -=item Modules and Pragmata - -=item New Documentation - -=item New Tests - -=back - -See F<Porting/how_to_write_a_perldelta.pod> for details. - -=head2 Remove duplication of test setup. - -Schwern notes, that there's duplication of code - lots and lots of tests have -some variation on the big block of C<$Is_Foo> checks. We can safely put this -into a file, change it to build an C<%Is> hash and require it. Maybe just put -it into F<test.pl>. Throw in the handy tainting subroutines. - -=head2 POD -E<gt> HTML conversion in the core still sucks - -Which is crazy given just how simple POD purports to be, and how simple HTML -can be. It's not actually I<as> simple as it sounds, particularly with the -flexibility POD allows for C<=item>, but it would be good to improve the -visual appeal of the HTML generated, and to avoid it having any validation -errors. See also L</make HTML install work>, as the layout of installation tree -is needed to improve the cross-linking. - -The addition of C<Pod::Simple> and its related modules may make this task -easier to complete. - -=head2 Make ExtUtils::ParseXS use strict; - -F<lib/ExtUtils/ParseXS.pm> contains this line - - # use strict; # One of these days... - -Simply uncomment it, and fix all the resulting issues :-) - -The more practical approach, to break the task down into manageable chunks, is -to work your way though the code from bottom to top, or if necessary adding -extra C<{ ... }> blocks, and turning on strict within them. - -=head2 Make Schwern poorer - -We should have tests for everything. When all the core's modules are tested, -Schwern has promised to donate to $500 to TPF. We may need volunteers to -hold him upside down and shake vigorously in order to actually extract the -cash. 
- -=head2 Improve the coverage of the core tests - -Use Devel::Cover to ascertain the core modules' test coverage, then add -tests that are currently missing. - -=head2 test B - -A full test suite for the B module would be nice. - -=head2 A decent benchmark - -C<perlbench> seems impervious to any recent changes made to the perl core. It -would be useful to have a reasonable general benchmarking suite that roughly -represented what current perl programs do, and measurably reported whether -tweaks to the core improve, degrade or don't really affect performance, to -guide people attempting to optimise the guts of perl. Gisle would welcome -new tests for perlbench. - -=head2 fix tainting bugs - -Fix the bugs revealed by running the test suite with the C<-t> switch (via -C<make test.taintwarn>). - -=head2 Dual life everything - -As part of the "dists" plan, anything that doesn't belong in the smallest perl -distribution needs to be dual lifed. Anything else can be too. Figure out what -changes would be needed to package that module and its tests up for CPAN, and -do so. Test it with older perl releases, and fix the problems you find. - -To make a minimal perl distribution, it's useful to look at -F<t/lib/commonsense.t>. - -=head2 Move dual-life pod/*.PL into ext - -Nearly all the dual-life modules have been moved to F<ext>. However, we -still need to move F<pod/*.PL> into their respective directories -in F<ext/>. They're referenced by (at least) C<plextract> in F<Makefile.SH> -and C<utils> in F<win32/Makefile> and F<win32/makefile.ml>, and listed -explicitly in F<win32/pod.mak>, F<vms/descrip_mms.template> and F<utils.lst> - -=head2 POSIX memory footprint - -Ilya observed that use POSIX; eats memory like there's no tomorrow, and at -various times worked to cut it down. There is probably still fat to cut out - -for example POSIX passes Exporter some very memory hungry data structures. 
- -=head2 embed.pl/makedef.pl - -There is a script F<embed.pl> that generates several header files to prefix -all of Perl's symbols in a consistent way, to provide some semblance of -namespace support in C<C>. Functions are declared in F<embed.fnc>, variables -in F<interpvar.h>. Quite a few of the functions and variables -are conditionally declared there, using C<#ifdef>. However, F<embed.pl> -doesn't understand the C macros, so the rules about which symbols are present -when is duplicated in F<makedef.pl>. Writing things twice is bad, m'kay. -It would be good to teach C<embed.pl> to understand the conditional -compilation, and hence remove the duplication, and the mistakes it has caused. - -=head2 use strict; and AutoLoad - -Currently if you write - - package Whack; - use AutoLoader 'AUTOLOAD'; - use strict; - 1; - __END__ - sub bloop { - print join (' ', No, strict, here), "!\n"; - } - -then C<use strict;> isn't in force within the autoloaded subroutines. It would -be more consistent (and less surprising) to arrange for all lexical pragmas -in force at the __END__ block to be in force within each autoloaded subroutine. - -There's a similar problem with SelfLoader. - -=head2 profile installman - -The F<installman> script is slow. All it is doing text processing, which we're -told is something Perl is good at. So it would be nice to know what it is doing -that is taking so much CPU, and where possible address it. - -=head2 enable lexical enabling/disabling of inidvidual warnings - -Currently, warnings can only be enabled or disabled by category. There -are times when it would be useful to quash a single warning, not a -whole category. - -=head1 Tasks that need a little sysadmin-type knowledge - -Or if you prefer, tasks that you would learn from, and broaden your skills -base... - -=head2 make HTML install work - -There is an C<installhtml> target in the Makefile. It's marked as -"experimental". 
It would be good to get this tested, make it work reliably, and -remove the "experimental" tag. This would include - -=over 4 - -=item 1 - -Checking that cross linking between various parts of the documentation works. -In particular that links work between the modules (files with POD in F<lib/>) -and the core documentation (files in F<pod/>) - -=item 2 - -Work out how to split C<perlfunc> into chunks, preferably one per function -group, preferably with general case code that could be used elsewhere. -Challenges here are correctly identifying the groups of functions that go -together, and making the right named external cross-links point to the right -page. Things to be aware of are C<-X>, groups such as C<getpwnam> to -C<endservent>, two or more C<=items> giving the different parameter lists, such -as - - =item substr EXPR,OFFSET,LENGTH,REPLACEMENT - =item substr EXPR,OFFSET,LENGTH - =item substr EXPR,OFFSET - -and different parameter lists having different meanings. (eg C<select>) - -=back - -=head2 compressed man pages - -Be able to install them. This would probably need a configure test to see how -the system does compressed man pages (same directory/different directory? -same filename/different filename), as well as tweaking the F<installman> script -to compress as necessary. - -=head2 Add a code coverage target to the Makefile - -Make it easy for anyone to run Devel::Cover on the core's tests. The steps -to do this manually are roughly - -=over 4 - -=item * - -do a normal C<Configure>, but include Devel::Cover as a module to install -(see F<INSTALL> for how to do this) - -=item * - - make perl - -=item * - - cd t; HARNESS_PERL_SWITCHES=-MDevel::Cover ./perl -I../lib harness - -=item * - -Process the resulting Devel::Cover database - -=back - -This just give you the coverage of the F<.pm>s. 
To also get the C level -coverage you need to - -=over 4 - -=item * - -Additionally tell C<Configure> to use the appropriate C compiler flags for -C<gcov> - -=item * - - make perl.gcov - -(instead of C<make perl>) - -=item * - -After running the tests run C<gcov> to generate all the F<.gcov> files. -(Including down in the subdirectories of F<ext/> - -=item * - -(From the top level perl directory) run C<gcov2perl> on all the C<.gcov> files -to get their stats into the cover_db directory. - -=item * - -Then process the Devel::Cover database - -=back - -It would be good to add a single switch to C<Configure> to specify that you -wanted to perform perl level coverage, and another to specify C level -coverage, and have C<Configure> and the F<Makefile> do all the right things -automatically. - -=head2 Make Config.pm cope with differences between built and installed perl - -Quite often vendors ship a perl binary compiled with their (pay-for) -compilers. People install a free compiler, such as gcc. To work out how to -build extensions, Perl interrogates C<%Config>, so in this situation -C<%Config> describes compilers that aren't there, and extension building -fails. This forces people into choosing between re-compiling perl themselves -using the compiler they have, or only using modules that the vendor ships. - -It would be good to find a way teach C<Config.pm> about the installation setup, -possibly involving probing at install time or later, so that the C<%Config> in -a binary distribution better describes the installed machine, when the -installed machine differs from the build machine in some significant way. - -=head2 linker specification files - -Some platforms mandate that you provide a list of a shared library's external -symbols to the linker, so the core already has the infrastructure in place to -do this for generating shared perl libraries. 
My understanding is that the -GNU toolchain can accept an optional linker specification file, and restrict -visibility just to symbols declared in that file. It would be good to extend -F<makedef.pl> to support this format, and to provide a means within -C<Configure> to enable it. This would allow Unix users to test that the -export list is correct, and to build a perl that does not pollute the global -namespace with private symbols. - -=head2 Cross-compile support - -Currently C<Configure> understands C<-Dusecrosscompile> option. This option -arranges for building C<miniperl> for TARGET machine, so this C<miniperl> is -assumed then to be copied to TARGET machine and used as a replacement of full -C<perl> executable. - -This could be done little differently. Namely C<miniperl> should be built for -HOST and then full C<perl> with extensions should be compiled for TARGET. -This, however, might require extra trickery for %Config: we have one config -first for HOST and then another for TARGET. Tools like MakeMaker will be -mightily confused. Having around two different types of executables and -libraries (HOST and TARGET) makes life interesting for Makefiles and -shell (and Perl) scripts. There is $Config{run}, normally empty, which -can be used as an execution wrapper. Also note that in some -cross-compilation/execution environments the HOST and the TARGET do -not see the same filesystem(s), the $Config{run} may need to do some -file/directory copying back and forth. - -=head2 roffitall - -Make F<pod/roffitall> be updated by F<pod/buildtoc>. - -=head2 Split "linker" from "compiler" - -Right now, Configure probes for two commands, and sets two variables: - -=over 4 - -=item * C<cc> (in F<cc.U>) - -This variable holds the name of a command to execute a C compiler which -can resolve multiple global references that happen to have the same -name. Usual values are F<cc> and F<gcc>. -Fervent ANSI compilers may be called F<c89>. AIX has F<xlc>. 
- -=item * C<ld> (in F<dlsrc.U>) - -This variable indicates the program to be used to link -libraries for dynamic loading. On some systems, it is F<ld>. -On ELF systems, it should be C<$cc>. Mostly, we'll try to respect -the hint file setting. - -=back - -There is an implicit historical assumption from around Perl5.000alpha -something, that C<$cc> is also the correct command for linking object files -together to make an executable. This may be true on Unix, but it's not true -on other platforms, and there are a maze of work arounds in other places (such -as F<Makefile.SH>) to cope with this. - -Ideally, we should create a new variable to hold the name of the executable -linker program, probe for it in F<Configure>, and centralise all the special -case logic there or in hints files. - -A small bikeshed issue remains - what to call it, given that C<$ld> is already -taken (arguably for the wrong thing now, but on SunOS 4.1 it is the command -for creating dynamically-loadable modules) and C<$link> could be confused with -the Unix command line executable of the same name, which does something -completely different. Andy Dougherty makes the counter argument "In parrot, I -tried to call the command used to link object files and libraries into an -executable F<link>, since that's what my vaguely-remembered DOS and VMS -experience suggested. I don't think any real confusion has ensued, so it's -probably a reasonable name for perl5 to use." - -"Alas, I've always worried that introducing it would make things worse, -since now the module building utilities would have to look for -C<$Config{link}> and institute a fall-back plan if it weren't found." -Although I can see that as confusing, given that C<$Config{d_link}> is true -when (hard) links are available. - -=head2 Configure Windows using PowerShell - -Currently, Windows uses hard-coded config files based to build the -config.h for compiling Perl. 
Makefiles are also hard-coded and need to be -hand edited prior to building Perl. While this makes it easy to create a perl.exe -that works across multiple Windows versions, being able to accurately -configure a perl.exe for a specific Windows versions and VS C++ would be -a nice enhancement. With PowerShell available on Windows XP and up, this -may now be possible. Step 1 might be to investigate whether this is possible -and use this to clean up our current makefile situation. Step 2 would be to -see if there would be a way to use our existing metaconfig units to configure a -Windows Perl or whether we go in a separate direction and make it so. Of -course, we all know what step 3 is. - -=head2 decouple -g and -DDEBUGGING - -Currently F<Configure> automatically adds C<-DDEBUGGING> to the C compiler -flags if it spots C<-g> in the optimiser flags. The pre-processor directive -C<DEBUGGING> enables F<perl>'s command line C<-D> options, but in the process -makes F<perl> slower. It would be good to disentangle this logic, so that -C-level debugging with C<-g> and Perl level debugging with C<-D> can easily -be enabled independently. - -=head1 Tasks that need a little C knowledge - -These tasks would need a little C knowledge, but don't need any specific -background or experience with XS, or how the Perl interpreter works - -=head2 Weed out needless PERL_UNUSED_ARG - -The C code uses the macro C<PERL_UNUSED_ARG> to stop compilers warning about -unused arguments. Often the arguments can't be removed, as there is an -external constraint that determines the prototype of the function, so this -approach is valid. However, there are some cases where C<PERL_UNUSED_ARG> -could be removed. Specifically - -=over 4 - -=item * - -The prototypes of (nearly all) static functions can be changed - -=item * - -Unused arguments generated by short cut macros are wasteful - the short cut -macro used can be changed. 
- -=back - -=head2 Modernize the order of directories in @INC - -The way @INC is laid out by default, one cannot upgrade core (dual-life) -modules without overwriting files. This causes problems for binary -package builders. One possible proposal is laid out in this -message: -L<http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2002-04/msg02380.html>. - -=head2 -Duse32bit* - -Natively 64-bit systems need neither -Duse64bitint nor -Duse64bitall. -On these systems, it might be the default compilation mode, and there -is currently no guarantee that passing no use64bitall option to the -Configure process will build a 32bit perl. Implementing -Duse32bit* -options would be nice for perl 5.12. - -=head2 Profile Perl - am I hot or not? - -The Perl source code is stable enough that it makes sense to profile it, -identify and optimise the hotspots. It would be good to measure the -performance of the Perl interpreter using free tools such as cachegrind, -gprof, and dtrace, and work to reduce the bottlenecks they reveal. - -As part of this, the idea of F<pp_hot.c> is that it contains the I<hot> ops, -the ops that are most commonly used. The idea is that by grouping them, their -object code will be adjacent in the executable, so they have a greater chance -of already being in the CPU cache (or swapped in) due to being near another op -already in use. - -Except that it's not clear if these really are the most commonly used ops. So -as part of exercising your skills with coverage and profiling tools you might -want to determine what ops I<really> are the most commonly used. And in turn -suggest evictions and promotions to achieve a better F<pp_hot.c>. - -One piece of Perl code that might make a good testbed is F<installman>. - -=head2 Allocate OPs from arenas - -Currently all new OP structures are individually malloc()ed and free()d. 
-All C<malloc> implementations have space overheads, and are now as fast as -custom allocates so it would both use less memory and less CPU to allocate -the various OP structures from arenas. The SV arena code can probably be -re-used for this. - -Note that Configuring perl with C<-Accflags=-DPL_OP_SLAB_ALLOC> will use -Perl_Slab_alloc() to pack optrees into a contiguous block, which is -probably superior to the use of OP arenas, esp. from a cache locality -standpoint. See L<Profile Perl - am I hot or not?>. - -=head2 Improve win32/wince.c - -Currently, numerous functions look virtually, if not completely, -identical in both C<win32/wince.c> and C<win32/win32.c> files, which can't -be good. - -=head2 Use secure CRT functions when building with VC8 on Win32 - -Visual C++ 2005 (VC++ 8.x) deprecated a number of CRT functions on the basis -that they were "unsafe" and introduced differently named secure versions of -them as replacements, e.g. instead of writing - - FILE* f = fopen(__FILE__, "r"); - -one should now write - - FILE* f; - errno_t err = fopen_s(&f, __FILE__, "r"); - -Currently, the warnings about these deprecations have been disabled by adding --D_CRT_SECURE_NO_DEPRECATE to the CFLAGS. It would be nice to remove that -warning suppressant and actually make use of the new secure CRT functions. - -There is also a similar issue with POSIX CRT function names like fileno having -been deprecated in favour of ISO C++ conformant names like _fileno. These -warnings are also currently suppressed by adding -D_CRT_NONSTDC_NO_DEPRECATE. It -might be nice to do as Microsoft suggest here too, although, unlike the secure -functions issue, there is presumably little or no benefit in this case. - -=head2 Fix POSIX::access() and chdir() on Win32 - -These functions currently take no account of DACLs and therefore do not behave -correctly in situations where access is restricted by DACLs (as opposed to the -read-only attribute). 
- -Furthermore, POSIX::access() behaves differently for directories having the -read-only attribute set depending on what CRT library is being used. For -example, the _access() function in the VC6 and VC7 CRTs (wrongly) claim that -such directories are not writable, whereas in fact all directories are writable -unless access is denied by DACLs. (In the case of directories, the read-only -attribute actually only means that the directory cannot be deleted.) This CRT -bug is fixed in the VC8 and VC9 CRTs (but, of course, the directory may still -not actually be writable if access is indeed denied by DACLs). - -For the chdir() issue, see ActiveState bug #74552: -http://bugs.activestate.com/show_bug.cgi?id=74552 - -Therefore, DACLs should be checked both for consistency across CRTs and for -the correct answer. - -(Note that perl's -w operator should not be modified to check DACLs. It has -been written so that it reflects the state of the read-only attribute, even -for directories (whatever CRT is being used), for symmetry with chmod().) - -=head2 strcat(), strcpy(), strncat(), strncpy(), sprintf(), vsprintf() - -Maybe create a utility that checks after each libperl.a creation that -none of the above (nor sprintf(), vsprintf(), or *SHUDDER* gets()) -ever creep back to libperl.a. - - nm libperl.a | ./miniperl -alne '$o = $F[0] if /:$/; print "$o $F[1]" if $F[0] eq "U" && $F[1] =~ /^(?:strn?c(?:at|py)|v?sprintf|gets)$/' - -Note, of course, that this will only tell whether B<your> platform -is using those naughty interfaces. - -=head2 -D_FORTIFY_SOURCE=2, -fstack-protector - -Recent glibcs support C<-D_FORTIFY_SOURCE=2> and recent gcc -(4.1 onwards?) supports C<-fstack-protector>, both of which give -protection against various kinds of buffer overflow problems. -These should probably be used for compiling Perl whenever available, -Configure and/or hints files should be adjusted to probe for the -availability of these features and enable them as appropriate. 
- -=head2 Arenas for GPs? For MAGIC? - -C<struct gp> and C<struct magic> are both currently allocated by C<malloc>. -It might be a speed or memory saving to change to using arenas. Or it might -not. It would need some suitable benchmarking first. In particular, C<GP>s -can probably be changed with minimal compatibility impact (probably nothing -outside of the core, or even outside of F<gv.c> allocates them), but they -probably aren't allocated/deallocated often enough for a speed saving. Whereas -C<MAGIC> is allocated/deallocated more often, but in turn, is also something -more externally visible, so changing the rules here may bite external code. - -=head2 Shared arenas - -Several SV body structs are now the same size, notably PVMG and PVGV, PVAV and -PVHV, and PVCV and PVFM. It should be possible to allocate and return same -sized bodies from the same actual arena, rather than maintaining one arena for -each. This could save 4-6K per thread, of memory no longer tied up in the -not-yet-allocated part of an arena. - - -=head1 Tasks that need a knowledge of XS - -These tasks would need C knowledge, and roughly the level of knowledge of -the perl API that comes from writing modules that use XS to interface to -C. - -=head2 Write an XS cookbook - -Create pod/perlxscookbook.pod with short, task-focused 'recipes' in XS that -demonstrate common tasks and good practices. (Some of these might be -extracted from perlguts.) The target audience should be XS novices, who need -more examples than perlguts but something less overwhelming than perlapi. -Recipes should provide "one pretty good way to do it" instead of TIMTOWTDI. - -Rather than focusing on interfacing Perl to C libraries, such a cookbook -should probably focus on how to optimize Perl routines by re-writing them -in XS. This will likely be more motivating to those who mostly work in -Perl but are looking to take the next step into XS. 
- -Deconstructing and explaining some simpler XS modules could be one way to -bootstrap a cookbook. (List::Util? Class::XSAccessor? Tree::Ternary_XS?) -Another option could be deconstructing the implementation of some simpler -functions in op.c. - -=head2 Allow XSUBs to inline themselves as OPs - -For a simple XSUB, often the subroutine dispatch takes more time than the -XSUB itself. The tokeniser already has the ability to inline constant -subroutines - it would be good to provide a way to inline other subroutines. - -Specifically, simplest approach looks to be to allow an XSUB to provide an -alternative implementation of itself as a custom OP. A new flag bit in -C<CvFLAGS()> would signal to the peephole optimiser to take an optree -such as this: - - b <@> leave[1 ref] vKP/REFC ->(end) - 1 <0> enter ->2 - 2 <;> nextstate(main 1 -e:1) v:{ ->3 - a <2> sassign vKS/2 ->b - 8 <1> entersub[t2] sKS/TARG,1 ->9 - - <1> ex-list sK ->8 - 3 <0> pushmark s ->4 - 4 <$> const(IV 1) sM ->5 - 6 <1> rv2av[t1] lKM/1 ->7 - 5 <$> gv(*a) s ->6 - - <1> ex-rv2cv sK ->- - 7 <$> gv(*x) s/EARLYCV ->8 - - <1> ex-rv2sv sKRM*/1 ->a - 9 <$> gvsv(*b) s ->a - -perform the symbol table lookup of C<rv2cv> and C<gv(*x)>, locate the -pointer to the custom OP that provides the direct implementation, and re- -write the optree something like: - - b <@> leave[1 ref] vKP/REFC ->(end) - 1 <0> enter ->2 - 2 <;> nextstate(main 1 -e:1) v:{ ->3 - a <2> sassign vKS/2 ->b - 7 <1> custom_x -> 8 - - <1> ex-list sK ->7 - 3 <0> pushmark s ->4 - 4 <$> const(IV 1) sM ->5 - 6 <1> rv2av[t1] lKM/1 ->7 - 5 <$> gv(*a) s ->6 - - <1> ex-rv2cv sK ->- - - <$> ex-gv(*x) s/EARLYCV ->7 - - <1> ex-rv2sv sKRM*/1 ->a - 8 <$> gvsv(*b) s ->a - -I<i.e.> the C<gv(*)> OP has been nulled and spliced out of the execution -path, and the C<entersub> OP has been replaced by the custom op. - -This approach should provide a measurable speed up to simple XSUBs inside -tight loops. 
Initially one would have to write the OP alternative -implementation by hand, but it's likely that this should be reasonably -straightforward for the type of XSUB that would benefit the most. Longer -term, once the run-time implementation is proven, it should be possible to -progressively update ExtUtils::ParseXS to generate OP implementations for -some XSUBs. - -=head2 Remove the use of SVs as temporaries in dump.c - -F<dump.c> contains debugging routines to dump out the contents of perl data -structures, such as C<SV>s, C<AV>s and C<HV>s. Currently, the dumping code -B<uses> C<SV>s for its temporary buffers, which was a logical initial -implementation choice, as they provide ready made memory handling. - -However, they also lead to a lot of confusion when it happens that what you're -trying to debug is seen by the code in F<dump.c>, correctly or incorrectly, as -a temporary scalar it can use for a temporary buffer. It's also not possible -to dump scalars before the interpreter is properly set up, such as during -ithreads cloning. It would be good to progressively replace the use of scalars -as string accumulation buffers with something much simpler, directly allocated -by C<malloc>. The F<dump.c> code is (or should be) only producing 7 bit -US-ASCII, so output character sets are not an issue. - -Producing and proving an internal simple buffer allocation would make it easier -to re-write the internals of the PerlIO subsystem to avoid using C<SV>s for -B<its> buffers, use of which can cause problems similar to those of F<dump.c>, -at similar times. - -=head2 safely supporting POSIX SA_SIGINFO - -Some years ago Jarkko supplied patches to provide support for the POSIX -SA_SIGINFO feature in Perl, passing the extra data to the Perl signal handler. - -Unfortunately, it only works with "unsafe" signals, because under safe -signals, by the time Perl gets to run the signal handler, the extra -information has been lost.
Moreover, it's not easy to store it somewhere, -as you can't call mutexes, or do anything else fancy, from inside a signal -handler. - -So it strikes me that we could provide safe SA_SIGINFO support - -=over 4 - -=item 1 - -Provide global variables for two file descriptors - -=item 2 - -When the first request is made via C<sigaction> for C<SA_SIGINFO>, create a -pipe, store the reader in one, the writer in the other - -=item 3 - -In the "safe" signal handler (C<Perl_csighandler()>/C<S_raise_signal()>), if -the C<siginfo_t> pointer is non-C<NULL>, and the writer file handle is open, - -=over 8 - -=item 1 - -serialise signal number, C<struct siginfo_t> (or at least the parts we care -about) into a small auto char buff - -=item 2 - -C<write()> that (non-blocking) to the writer fd - -=over 12 - -=item 1 - -if it writes 100%, flag the signal in a counter of "signals on the pipe" akin -to the current per-signal-number counts - -=item 2 - -if it writes 0%, assume the pipe is full. Flag the data as lost? - -=item 3 - -if it writes partially, croak a panic, as your OS is broken. - -=back - -=back - -=item 4 - -in the regular C<PERL_ASYNC_CHECK()> processing, if there are "signals on -the pipe", read the data out, deserialise, build the Perl structures on -the stack (code in C<Perl_sighandler()>, the "unsafe" handler), and call as -usual. - -=back - -I think that this gets us decent C<SA_SIGINFO> support, without the current risk -of running Perl code inside the signal handler context. (With all the dangers -of things like C<malloc> corruption that that currently offers us) - -For more information see the thread starting with this message: -http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2008-03/msg00305.html - -=head2 autovivification - -Make all autovivification consistent w.r.t LVALUE/RVALUE and strict/no strict; - -This task is incremental - even a little bit of work on it will help.
- -=head2 Unicode in Filenames - -chdir, chmod, chown, chroot, exec, glob, link, lstat, mkdir, open, -opendir, qx, readdir, readlink, rename, rmdir, stat, symlink, sysopen, -system, truncate, unlink, utime, -X. All these could potentially accept -Unicode filenames either as input or output (and in the case of system -and qx Unicode in general, as input or output to/from the shell). -Whether a filesystem - an operating system pair understands Unicode in -filenames varies. - -Known combinations that have some level of understanding include -Microsoft NTFS, Apple HFS+ (In Mac OS 9 and X) and Apple UFS (in Mac -OS X), NFS v4 is rumored to be Unicode, and of course Plan 9. How to -create Unicode filenames, what forms of Unicode are accepted and used -(UCS-2, UTF-16, UTF-8), what (if any) is the normalization form used, -and so on, varies. Finding the right level of interfacing to Perl -requires some thought. Remember that an OS does not implicate a -filesystem. - -(The Windows -C command flag "wide API support" has been at least -temporarily retired in 5.8.1, and the -C has been repurposed, see -L<perlrun>.) - -Most probably the right way to do this would be this: -L</"Virtualize operating system access">. - -=head2 Unicode in %ENV - -Currently the %ENV entries are always byte strings. -See L</"Virtualize operating system access">. - -=head2 Unicode and glob() - -Currently glob patterns and filenames returned from File::Glob::glob() -are always byte strings. See L</"Virtualize operating system access">. - -=head2 use less 'memory' - -Investigate trade offs to switch out perl's choices on memory usage. -Particularly perl should be able to give memory back. - -This task is incremental - even a little bit of work on it will help. - -=head2 Re-implement C<:unique> in a way that is actually thread-safe - -The old implementation made bad assumptions on several levels. A good 90% -solution might be just to make C<:unique> work to share the string buffer -of SvPVs. 
That way large constant strings can be shared between ithreads, -such as the configuration information in F<Config>. - -=head2 Make tainting consistent - -Tainting would be easier to use if it didn't take documented shortcuts and -allow taint to "leak" everywhere within an expression. - -=head2 readpipe(LIST) - -system() accepts a LIST syntax (and a PROGRAM LIST syntax) to avoid -running a shell. readpipe() (the function behind qx//) could be similarly -extended. - -=head2 Audit the code for destruction ordering assumptions - -Change 25773 notes - - /* Need to check SvMAGICAL, as during global destruction it may be that - AvARYLEN(av) has been freed before av, and hence the SvANY() pointer - is now part of the linked list of SV heads, rather than pointing to - the original body. */ - /* FIXME - audit the code for other bugs like this one. */ - -adding the C<SvMAGICAL> check to - - if (AvARYLEN(av) && SvMAGICAL(AvARYLEN(av))) { - MAGIC *mg = mg_find (AvARYLEN(av), PERL_MAGIC_arylen); - -Go through the core and look for similar assumptions that SVs have particular -types, as all bets are off during global destruction. - -=head2 Extend PerlIO and PerlIO::Scalar - -PerlIO::Scalar doesn't know how to truncate(). Implementing this -would require extending the PerlIO vtable. - -Similarly the PerlIO vtable doesn't know about formats (write()), or -about stat(), or chmod()/chown(), utime(), or flock(). - -(For PerlIO::Scalar it's hard to see what e.g. mode bits or ownership -would mean.) - -PerlIO doesn't do directories or symlinks, either: mkdir(), rmdir(), -opendir(), closedir(), seekdir(), rewinddir(), glob(); symlink(), -readlink(). - -See also L</"Virtualize operating system access">. - -=head2 -C on the #! line - -It should be possible to make -C work correctly if found on the #! line, -given that all perl command line options are strict ASCII, and -C changes -only the interpretation of non-ASCII characters, and not for the script file -handle. 
To make it work needs some investigation of the ordering of function -calls during startup, and (by implication) a bit of tweaking of that order. - -=head2 Organize error messages - -Perl's diagnostics (error messages, see L<perldiag>) could use -reorganizing and formalizing so that each error message has its -stable-for-all-eternity unique id, categorized by severity, type, and -subsystem. (The error messages would be listed in a datafile outside -of the Perl source code, and the source code would only refer to the -messages by the id.) This clean-up and regularizing should apply -for all croak() messages. - -This would enable all sorts of things: easier translation/localization -of the messages (though please do keep in mind the caveats of -L<Locale::Maketext> about too straightforward approaches to -translation), filtering by severity, and instead of grepping for a -particular error message one could look for a stable error id. (Of -course, changing the error messages by default would break all the -existing software depending on some particular error message...) - -This kind of functionality is known as I<message catalogs>. Look for -inspiration for example in the catgets() system, possibly even use it -if available-- but B<only> if available, all platforms will B<not> -have catgets(). - -For the really pure at heart, consider extending this item to cover -also the warning messages (see L<perllexwarn>, C<warnings.pl>). - -=head1 Tasks that need a knowledge of the interpreter - -These tasks would need C knowledge, and knowledge of how the interpreter works, -or a willingness to learn. - -=head2 forbid labels with keyword names - -Currently C<goto keyword> "computes" the label value: - - $ perl -e 'goto print' - Can't find label 1 at -e line 1. - -It is controversial if the right way to avoid the confusion is to forbid -labels with keyword names, or if it would be better to always treat -bareword expressions after a "goto" as a label and never as a keyword. 
- -=head2 truncate() prototype - -The prototype of truncate() is currently C<$$>. It should probably -be C<*$> instead. (This is changed in F<opcode.pl>) - -=head2 decapsulation of smart match argument - -Currently C<$foo ~~ $object> will die with the message "Smart matching a -non-overloaded object breaks encapsulation". It would be nice to allow -this to be bypassed by explicitly using the syntax C<$foo ~~ %$object> or -C<$foo ~~ @$object>. - -=head2 error reporting of [$a ; $b] - -Using C<;> inside brackets is a syntax error, and we don't propose to change -that by giving it any meaning. However, it's not reported very helpfully: - - $ perl -e '$a = [$b; $c];' - syntax error at -e line 1, near "$b;" - syntax error at -e line 1, near "$c]" - Execution of -e aborted due to compilation errors. - -It should be possible to hook into the tokeniser or the lexer, so that when a -C<;> is parsed where it is not legal as a statement terminator (ie inside -C<{}> used as a hashref, C<[]> or C<()>) it issues an error something like -I<';' isn't legal inside an expression - if you need multiple statements use a -do {...} block>. See the thread starting at -http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2008-09/msg00573.html - -=head2 lexicals used only once - -This warns: - - $ perl -we '$pie = 42' - Name "main::pie" used only once: possible typo at -e line 1. - -This does not: - - $ perl -we 'my $pie = 42' - -Logically all lexicals used only once should warn, if the user asks for -warnings. An unworked RT ticket (#5087) has been open for almost seven -years for this discrepancy. - -=head2 UTF-8 revamp - -The handling of Unicode is unclean in many places. For example, the regexp -engine matches in Unicode semantics whenever the string or the pattern is -flagged as UTF-8, but that should not be dependent on an internal storage -detail of the string. - -=head2 Properly Unicode safe tokeniser and pads. - -The tokeniser isn't actually very UTF-8 clean.
C<use utf8;> is a hack - -variable names are stored in stashes as raw bytes, without the utf-8 flag -set. The pad API only takes a C<char *> pointer, so that's all bytes too. The -tokeniser ignores the UTF-8-ness of C<PL_rsfp>, or any SVs returned from -source filters. All this could be fixed. - -=head2 state variable initialization in list context - -Currently this is illegal: - - state ($a, $b) = foo(); - -In Perl 6, C<state ($a) = foo();> and C<(state $a) = foo();> have different -semantics, which is tricky to implement in Perl 5 as currently they produce -the same opcode trees. The Perl 6 design is firm, so it would be good to -implement the necessary code in Perl 5. There are comments in -C<Perl_newASSIGNOP()> that show the code paths taken by various assignment -constructions involving state variables. - -=head2 Implement $value ~~ 0 .. $range - -It would be nice to extend the syntax of the C<~~> operator to also -understand numeric (and maybe alphanumeric) ranges. - -=head2 A does() built-in - -Like ref(), only useful. It would call the C<DOES> method on objects; it -would also tell whether something can be dereferenced as an -array/hash/etc., or used as a regexp, etc. -L<http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2007-03/msg00481.html> - -=head2 Tied filehandles and write() don't mix - -There is no method on tied filehandles to allow them to be called back by -formats. - -=head2 Propagate compilation hints to the debugger - -Currently a debugger started with -dE on the command-line doesn't see the -features enabled by -E. More generally hints (C<$^H> and C<%^H>) aren't -propagated to the debugger. Probably it would be a good thing to propagate -hints from the innermost non-C<DB::> scope: this would make code eval'ed -in the debugger see the features (and strictures, etc.) currently in -scope. 
- -=head2 Attach/detach debugger from running program - -The old perltodo notes "With C<gdb>, you can attach the debugger to a running -program if you pass the process ID. It would be good to do this with the Perl -debugger on a running Perl program, although I'm not sure how it would be -done." ssh and screen do this with named pipes in /tmp. Maybe we can too. - -=head2 LVALUE functions for lists - -The old perltodo notes that lvalue functions don't work for list or hash -slices. This would be good to fix. - -=head2 regexp optimiser optional - -The regexp optimiser is not optional. It should be configurable, to allow -its performance to be measured, and its bugs to be easily demonstrated. - -=head2 C</w> regex modifier - -That flag would enable matching whole words, and also interpolating -arrays as alternations. With it, C</P/w> would be roughly equivalent to: - - do { local $"='|'; /\b(?:P)\b/ } - -See L<http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2007-01/msg00400.html> -for the discussion. - -=head2 optional optimizer - -Make the peephole optimizer optional. Currently it performs two tasks as -it walks the optree - genuine peephole optimisations, and necessary fixups of -ops. It would be good to find an efficient way to switch out the -optimisations whilst keeping the fixups. - -=head2 You WANT *how* many - -Currently contexts are void, scalar and list. split has a special mechanism in -place to pass in the number of return values wanted. It would be useful to -have a general mechanism for this, backwards compatible and little speed hit. -This would allow proposals such as short circuiting sort to be implemented -as a module on CPAN. - -=head2 lexical aliases - -Allow lexical aliases (maybe via the syntax C<my \$alias = \$foo>). - -=head2 entersub XS vs Perl - -At the moment pp_entersub is huge, and has code to deal with entering both -perl and XS subroutines.
Subroutine implementations rarely change between -perl and XS at run time, so investigate using 2 ops to enter subs (one for -XS, one for perl) and swap between if a sub is redefined. - -=head2 Self-ties - -Self-ties are currently illegal because they caused too many segfaults. Maybe -the causes of these could be tracked down and self-ties on all types -reinstated. - -=head2 Optimize away @_ - -The old perltodo notes "Look at the "reification" code in C<av.c>". - -=head2 Virtualize operating system access - -Implement a set of "vtables" that virtualizes operating system access -(open(), mkdir(), unlink(), readdir(), getenv(), etc.) At the very -least these interfaces should take SVs as "name" arguments instead of -bare char pointers; probably the most flexible and extensible way -would be for the Perl-facing interfaces to accept HVs. The system -needs to be per-operating-system and per-file-system -hookable/filterable, preferably both from XS and Perl level -(L<perlport/"Files and Filesystems"> is good reading at this point, -in fact, all of L<perlport> is.) - -This has actually already been implemented (but only for Win32), -take a look at F<iperlsys.h> and F<win32/perlhost.h>. While all Win32 -variants go through a set of "vtables" for operating system access, -non-Win32 systems currently go straight for the POSIX/Unix-style -system/library call. Similar system as for Win32 should be -implemented for all platforms. The existing Win32 implementation -probably does not need to survive alongside this proposed new -implementation, the approaches could be merged. - -What would this give us? One often-asked-for feature this would -enable is using Unicode for filenames, and other "names" like %ENV, -usernames, hostnames, and so forth. -(See L<perlunicode/"When Unicode Does Not Happen">.) 
- -But this kind of virtualization would also allow for things like -virtual filesystems, virtual networks, and "sandboxes" (though as long -as dynamic loading of random object code is allowed, not very safe -sandboxes since external code of course know not of Perl's vtables). -An example of a smaller "sandbox" is that this feature can be used to -implement per-thread working directories: Win32 already does this. - -See also L</"Extend PerlIO and PerlIO::Scalar">. - -=head2 Investigate PADTMP hash pessimisation - -The peephole optimiser converts constants used for hash key lookups to shared -hash key scalars. Under ithreads, something is undoing this work. -See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2007-09/msg00793.html - -=head2 Store the current pad in the OP slab allocator - -=for clarification -I hope that I got that "current pad" part correct - -Currently we leak ops in various cases of parse failure. I suggested that we -could solve this by always using the op slab allocator, and walking it to -free ops. Dave comments that as some ops are already freed during optree -creation one would have to mark which ops are freed, and not double free them -when walking the slab. He notes that one problem with this is that for some ops -you have to know which pad was current at the time of allocation, which does -change. I suggested storing a pointer to the current pad in the memory allocated -for the slab, and swapping to a new slab each time the pad changes. Dave thinks -that this would work. - -=head2 repack the optree - -Repacking the optree after execution order is determined could allow -removal of NULL ops, and optimal ordering of OPs with respect to cache-line -filling. The slab allocator could be reused for this purpose. 
I think that -the best way to do this is to make it an optional step just before the -completed optree is attached to anything else, and to use the slab allocator -unchanged, so that freeing ops is identical whether or not this step runs. -Note that the slab allocator allocates ops downwards in memory, so one would -have to actually "allocate" the ops in reverse-execution order to get them -contiguous in memory in execution order. - -See http://www.nntp.perl.org/group/perl.perl5.porters/2007/12/msg131975.html - -Note that running this copy, and then freeing all the old location ops would -cause their slabs to be freed, which would eliminate possible memory wastage if -the previous suggestion is implemented, and we swap slabs more frequently. - -=head2 eliminate incorrect line numbers in warnings - -This code - - use warnings; - my $undef; - - if ($undef == 3) { - } elsif ($undef == 0) { - } - -used to produce this output: - - Use of uninitialized value in numeric eq (==) at wrong.pl line 4. - Use of uninitialized value in numeric eq (==) at wrong.pl line 4. - -where the line of the second warning was misreported - it should be line 5. -Rafael fixed this - the problem arose because there was no nextstate OP -between the execution of the C<if> and the C<elsif>, hence C<PL_curcop> still -reports that the currently executing line is line 4. The solution was to inject -a nextstate OPs for each C<elsif>, although it turned out that the nextstate -OP needed to be a nulled OP, rather than a live nextstate OP, else other line -numbers became misreported. (Jenga!) - -The problem is more general than C<elsif> (although the C<elsif> case is the -most common and the most confusing). Ideally this code - - use warnings; - my $undef; - - my $a = $undef + 1; - my $b - = $undef - + 1; - -would produce this output - - Use of uninitialized value $undef in addition (+) at wrong.pl line 4. - Use of uninitialized value $undef in addition (+) at wrong.pl line 7. 
- -(rather than lines 4 and 5), but this would seem to require every OP to carry -(at least) line number information. - -What might work is to have an optional line number in memory just before the -BASEOP structure, with a flag bit in the op to say whether it's present. -Initially during compile every OP would carry its line number. Then add a late -pass to the optimiser (potentially combined with L</repack the optree>) which -looks at the two ops on every edge of the graph of the execution path. If -the line number changes, flag the destination OP with this information. -Once all paths are traced, replace every op with the flag with a -nextstate-light op (that just updates C<PL_curcop>), which in turn then passes -control on to the true op. All ops would then be replaced by variants that -do not store the line number. (Which, logically, is why it would work best in -conjunction with L</repack the optree>, as that is already copying/reallocating -all the OPs) - -(Although I should note that we're not certain that doing this for the general -case is worth it) - -=head2 optimize tail-calls - -Tail-calls present an opportunity for broadly applicable optimization; -anywhere that C<< return foo(...) >> is called, the outer return can -be replaced by a goto, and foo will return directly to the outer -caller, saving (conservatively) 25% of perl's call&return cost, which -is relatively higher than in C. The Scheme language is known to do -this heavily. B::Concise provides good insight into where this -optimization is possible, ie anywhere entersub,leavesub op-sequence -occurs. - - perl -MO=Concise,-exec,a,b,-main -e 'sub a{ 1 }; sub b {a()}; b(2)' - -Bottom line on this is probably a new pp_tailcall function which -combines the code in pp_entersub, pp_leavesub. This should probably -be done 1st in XS, and using B::Generate to patch the new OP into the -optrees.
- -=head1 Big projects - -Tasks that will get your name mentioned in the description of the "Highlights -of 5.12" - -=head2 make ithreads more robust - -Generally make ithreads more robust. See also L</iCOW> - -This task is incremental - even a little bit of work on it will help, and -will be greatly appreciated. - -One bit would be to write the missing code in sv.c:Perl_dirp_dup. - -Fix Perl_sv_dup, et al so that threads can return objects. - -=head2 iCOW - -Sarathy and Arthur have a proposal for an improved Copy On Write which -specifically will be able to COW new ithreads. If this can be implemented -it would be a good thing. - -=head2 (?{...}) closures in regexps - -Fix (or rewrite) the implementation of the C</(?{...})/> closures. - -=head2 A re-entrant regexp engine - -This will allow the use of a regex from inside (?{ }), (??{ }) and -(?(?{ })|) constructs. - -=head2 Add class set operations to regexp engine - -Apparently these are quite useful. Anyway, Jeffrey Friedl wants them. - -demerphq has this on his todo list, but right at the bottom. - - -=head1 Tasks for microperl - - -[ Each and every one of these may be obsolete, but they were listed - in the old Todo.micro file] - - -=head2 make creating uconfig.sh automatic - -=head2 make creating Makefile.micro automatic - -=head2 do away with fork/exec/wait? - -(system, popen should be enough?) - -=head2 some of the uconfig.sh really needs to be probed (using cc) in buildtime: - -(uConfigure? :-) native datatype widths and endianness come to mind - +We no longer install the Perl 5 to-do list as a manpage, as installing a +snapshot that becomes increasingly out of date isn't that useful to anyone.
+The current Perl 5 to-do list is maintained in the git repository, and can +be viewed at L<http://perl5.git.perl.org/perl.git/blob/HEAD:/Porting/todo.pod> diff --git a/gnu/usr.bin/perl/pod/perltooc.pod b/gnu/usr.bin/perl/pod/perltooc.pod index 06f697cdef1..35163255300 100644 --- a/gnu/usr.bin/perl/pod/perltooc.pod +++ b/gnu/usr.bin/perl/pod/perltooc.pod @@ -1,1342 +1,12 @@ +=encoding utf8 + =head1 NAME -perltooc - Tom's OO Tutorial for Class Data in Perl +perltooc - This document has been deleted =head1 DESCRIPTION -When designing an object class, you are sometimes faced with the situation -of wanting common state shared by all objects of that class. -Such I<class attributes> act somewhat like global variables for the entire -class, but unlike program-wide globals, class attributes have meaning only to -the class itself. - -Here are a few examples where class attributes might come in handy: - -=over 4 - -=item * - -to keep a count of the objects you've created, or how many are -still extant. - -=item * - -to extract the name or file descriptor for a logfile used by a debugging -method. - -=item * - -to access collective data, like the total amount of cash dispensed by -all ATMs in a network in a given day. - -=item * - -to access the last object created by a class, or the most accessed object, -or to retrieve a list of all objects. - -=back - -Unlike a true global, class attributes should not be accessed directly. -Instead, their state should be inspected, and perhaps altered, only -through the mediated access of I<class methods>. These class attributes -accessor methods are similar in spirit and function to accessors used -to manipulate the state of instance attributes on an object. They provide a -clear firewall between interface and implementation. - -You should allow access to class attributes through either the class -name or any object of that class. 
If we assume that $an_object is of -type Some_Class, and the &Some_Class::population_count method accesses -class attributes, then these two invocations should both be possible, -and almost certainly equivalent. - - Some_Class->population_count() - $an_object->population_count() - -The question is, where do you store the state which that method accesses? -Unlike more restrictive languages like C++, where these are called -static data members, Perl provides no syntactic mechanism to declare -class attributes, any more than it provides a syntactic mechanism to -declare instance attributes. Perl provides the developer with a broad -set of powerful but flexible features that can be uniquely crafted to -the particular demands of the situation. - -A class in Perl is typically implemented in a module. A module consists -of two complementary feature sets: a package for interfacing with the -outside world, and a lexical file scope for privacy. Either of these -two mechanisms can be used to implement class attributes. That means you -get to decide whether to put your class attributes in package variables -or to put them in lexical variables. - -And those aren't the only decisions to make. If you choose to use package -variables, you can make your class attribute accessor methods either ignorant -of inheritance or sensitive to it. If you choose lexical variables, -you can elect to permit access to them from anywhere in the entire file -scope, or you can limit direct data access exclusively to the methods -implementing those attributes. - -=head1 Class Data in a Can - -One of the easiest ways to solve a hard problem is to let someone else -do it for you! In this case, Class::Data::Inheritable (available on a -CPAN near you) offers a canned solution to the class data problem -using closures. So before you wade into this document, consider -having a look at that module. 
- - -=head1 Class Data as Package Variables - -Because a class in Perl is really just a package, using package variables -to hold class attributes is the most natural choice. This makes it simple -for each class to have its own class attributes. Let's say you have a class -called Some_Class that needs a couple of different attributes that you'd -like to be global to the entire class. The simplest thing to do is to -use package variables like $Some_Class::CData1 and $Some_Class::CData2 -to hold these attributes. But we certainly don't want to encourage -outsiders to touch those data directly, so we provide methods -to mediate access. - -In the accessor methods below, we'll for now just ignore the first -argument--that part to the left of the arrow on method invocation, which -is either a class name or an object reference. - - package Some_Class; - sub CData1 { - shift; # XXX: ignore calling class/object - $Some_Class::CData1 = shift if @_; - return $Some_Class::CData1; - } - sub CData2 { - shift; # XXX: ignore calling class/object - $Some_Class::CData2 = shift if @_; - return $Some_Class::CData2; - } - -This technique is highly legible and should be completely straightforward -to even the novice Perl programmer. By fully qualifying the package -variables, they stand out clearly when reading the code. Unfortunately, -if you misspell one of these, you've introduced an error that's hard -to catch. It's also somewhat disconcerting to see the class name itself -hard-coded in so many places. - -Both these problems can be easily fixed. Just add the C<use strict> -pragma, then pre-declare your package variables. (The C<our> operator -will be new in 5.6, and will work for package globals just like C<my> -works for scoped lexicals.) 
- - package Some_Class; - use strict; - our($CData1, $CData2); # our() is new to perl5.6 - sub CData1 { - shift; # XXX: ignore calling class/object - $CData1 = shift if @_; - return $CData1; - } - sub CData2 { - shift; # XXX: ignore calling class/object - $CData2 = shift if @_; - return $CData2; - } - - -As with any other global variable, some programmers prefer to start their -package variables with capital letters. This helps clarity somewhat, but -by no longer fully qualifying the package variables, their significance -can be lost when reading the code. You can fix this easily enough by -choosing better names than were used here. - -=head2 Putting All Your Eggs in One Basket - -Just as the mindless enumeration of accessor methods for instance attributes -grows tedious after the first few (see L<perltoot>), so too does the -repetition begin to grate when listing out accessor methods for class -data. Repetition runs counter to the primary virtue of a programmer: -Laziness, here manifesting as that innate urge every programmer feels -to factor out duplicate code whenever possible. - -Here's what to do. First, make just one hash to hold all class attributes. - - package Some_Class; - use strict; - our %ClassData = ( # our() is new to perl5.6 - CData1 => "", - CData2 => "", - ); - -Using closures (see L<perlref>) and direct access to the package symbol -table (see L<perlmod>), now clone an accessor method for each key in -the %ClassData hash. Each of these methods is used to fetch or store -values to the specific, named class attribute. - - for my $datum (keys %ClassData) { - no strict "refs"; # to register new methods in package - *$datum = sub { - shift; # XXX: ignore calling class/object - $ClassData{$datum} = shift if @_; - return $ClassData{$datum}; - } - } - -It's true that you could work out a solution employing an &AUTOLOAD -method, but this approach is unlikely to prove satisfactory. 
Your -function would have to distinguish between class attributes and object -attributes; it could interfere with inheritance; and it would have to -be careful about DESTROY. Such complexity is uncalled for in most cases, -and certainly in this one. - -You may wonder why we're rescinding strict refs for the loop. We're -manipulating the package's symbol table to introduce new function names -using symbolic references (indirect naming), which the strict pragma -would otherwise forbid. Normally, symbolic references are a dodgy -notion at best. This isn't just because they can be used accidentally -when you aren't meaning to. It's also because for most uses -to which beginning Perl programmers attempt to put symbolic references, -we have much better approaches, like nested hashes or hashes of arrays. -But there's nothing wrong with using symbolic references to manipulate -something that is meaningful only from the perspective of the package -symbol table, like method names or package variables. In other -words, when you want to refer to the symbol table, use symbol references. - -Clustering all the class attributes in one place has several advantages. -They're easy to spot, initialize, and change. The aggregation also -makes them convenient to access externally, such as from a debugger -or a persistence package. The only possible problem is that we don't -automatically know the name of each class's class object, should it have -one. This issue is addressed below in L<"The Eponymous Meta-Object">. - -=head2 Inheritance Concerns - -Suppose you have an instance of a derived class, and you access class -data using an inherited method call. Should that end up referring -to the base class's attributes, or to those in the derived class? -How would it work in the earlier examples? The derived class inherits -all the base class's methods, including those that access class attributes. -But what package are the class attributes stored in?
- -The answer is that, as written, class attributes are stored in the package into -which those methods were compiled. When you invoke the &CData1 method -on the name of the derived class or on one of that class's objects, the -version shown above is still run, so you'll access $Some_Class::CData1--or -in the method cloning version, C<$Some_Class::ClassData{CData1}>. - -Think of these class methods as executing in the context of their base -class, not in that of their derived class. Sometimes this is exactly -what you want. If Feline subclasses Carnivore, then the population of -Carnivores in the world should go up when a new Feline is born. -But what if you wanted to figure out how many Felines you have apart -from Carnivores? The current approach doesn't support that. - -You'll have to decide on a case-by-case basis whether it makes any sense -for class attributes to be package-relative. If you want it to be so, -then stop ignoring the first argument to the function. Either it will -be a package name if the method was invoked directly on a class name, -or else it will be an object reference if the method was invoked on an -object reference. In the latter case, the ref() function provides the -class of that object. - - package Some_Class; - sub CData1 { - my $obclass = shift; - my $class = ref($obclass) || $obclass; - my $varname = $class . "::CData1"; - no strict "refs"; # to access package data symbolically - $$varname = shift if @_; - return $$varname; - } - -And then do likewise for all other class attributes (such as CData2, -etc.) that you wish to access as package variables in the invoking package -instead of the compiling package as we had previously. - -Once again we temporarily disable the strict references ban, because -otherwise we couldn't use the fully-qualified symbolic name for -the package global. 
This is perfectly reasonable: since all package -variables by definition live in a package, there's nothing wrong with -accessing them via that package's symbol table. That's what it's there -for (well, somewhat). - -What about just using a single hash for everything and then cloning -methods? What would that look like? The only difference would be the -closure used to produce new method entries for the class's symbol table. - - no strict "refs"; - *$datum = sub { - my $obclass = shift; - my $class = ref($obclass) || $obclass; - my $varname = $class . "::ClassData"; - $varname->{$datum} = shift if @_; - return $varname->{$datum}; - } - -=head2 The Eponymous Meta-Object - -It could be argued that the %ClassData hash in the previous example is -neither the most imaginative nor the most intuitive of names. Is there -something else that might make more sense, be more useful, or both? - -As it happens, yes, there is. For the "class meta-object", we'll use -a package variable of the same name as the package itself. Within the -scope of a package Some_Class declaration, we'll use the eponymously -named hash %Some_Class as that class's meta-object. (Using an eponymously -named hash is somewhat reminiscent of classes that name their constructors -eponymously in the Python or C++ fashion. That is, class Some_Class would -use &Some_Class::Some_Class as a constructor, probably even exporting that -name as well. The StrNum class in Recipe 13.14 in I<The Perl Cookbook> -does this, if you're looking for an example.) - -This predictable approach has many benefits, including having a well-known -identifier to aid in debugging, transparent persistence, -or checkpointing. It's also the obvious name for monadic classes and -translucent attributes, discussed later. - -Here's an example of such a class. Notice how the name of the -hash storing the meta-object is the same as the name of the package -used to implement the class. 
- - package Some_Class; - use strict; - - # create class meta-object using that most perfect of names - our %Some_Class = ( # our() is new to perl5.6 - CData1 => "", - CData2 => "", - ); - - # this accessor is calling-package-relative - sub CData1 { - my $obclass = shift; - my $class = ref($obclass) || $obclass; - no strict "refs"; # to access eponymous meta-object - $class->{CData1} = shift if @_; - return $class->{CData1}; - } - - # but this accessor is not - sub CData2 { - shift; # XXX: ignore calling class/object - no strict "refs"; # to access eponymous meta-object - __PACKAGE__ -> {CData2} = shift if @_; - return __PACKAGE__ -> {CData2}; - } - -In the second accessor method, the __PACKAGE__ notation was used for -two reasons. First, to avoid hardcoding the literal package name -in the code in case we later want to change that name. Second, to -clarify to the reader that what matters here is the package currently -being compiled into, not the package of the invoking object or class. -If the long sequence of non-alphabetic characters bothers you, you can -always put the __PACKAGE__ in a variable first. - - sub CData2 { - shift; # XXX: ignore calling class/object - no strict "refs"; # to access eponymous meta-object - my $class = __PACKAGE__; - $class->{CData2} = shift if @_; - return $class->{CData2}; - } - -Even though we're using symbolic references for good not evil, some -folks tend to become unnerved when they see so many places with strict -ref checking disabled. Given a symbolic reference, you can always -produce a real reference (the reverse is not true, though). So we'll -create a subroutine that does this conversion for us. If invoked as a -function of no arguments, it returns a reference to the compiling class's -eponymous hash. Invoked as a class method, it returns a reference to -the eponymous hash of its caller. And when invoked as an object method, -this function returns a reference to the eponymous hash for whatever -class the object belongs to. 
- - package Some_Class; - use strict; - - our %Some_Class = ( # our() is new to perl5.6 - CData1 => "", - CData2 => "", - ); - - # tri-natured: function, class method, or object method - sub _classobj { - my $obclass = shift || __PACKAGE__; - my $class = ref($obclass) || $obclass; - no strict "refs"; # to convert sym ref to real one - return \%$class; - } - - for my $datum (keys %{ _classobj() } ) { - # turn off strict refs so that we can - # register a method in the symbol table - no strict "refs"; - *$datum = sub { - use strict "refs"; - my $self = shift->_classobj(); - $self->{$datum} = shift if @_; - return $self->{$datum}; - } - } - -=head2 Indirect References to Class Data - -A reasonably common strategy for handling class attributes is to store -a reference to each package variable on the object itself. This is -a strategy you've probably seen before, such as in L<perltoot> and -L<perlbot>, but there may be variations in the example below that you -haven't thought of before. - - package Some_Class; - our($CData1, $CData2); # our() is new to perl5.6 - - sub new { - my $obclass = shift; - return bless my $self = { - ObData1 => "", - ObData2 => "", - CData1 => \$CData1, - CData2 => \$CData2, - } => (ref $obclass || $obclass); - } - - sub ObData1 { - my $self = shift; - $self->{ObData1} = shift if @_; - return $self->{ObData1}; - } - - sub ObData2 { - my $self = shift; - $self->{ObData2} = shift if @_; - return $self->{ObData2}; - } - - sub CData1 { - my $self = shift; - my $dataref = ref $self - ? $self->{CData1} - : \$CData1; - $$dataref = shift if @_; - return $$dataref; - } - - sub CData2 { - my $self = shift; - my $dataref = ref $self - ? $self->{CData2} - : \$CData2; - $$dataref = shift if @_; - return $$dataref; - } - -As written above, a derived class will inherit these methods, which -will consequently access package variables in the base class's package. -This is not necessarily expected behavior in all circumstances. 
Here's an -example that uses a variable meta-object, taking care to access the -proper package's data. - - package Some_Class; - use strict; - - our %Some_Class = ( # our() is new to perl5.6 - CData1 => "", - CData2 => "", - ); - - sub _classobj { - my $self = shift; - my $class = ref($self) || $self; - no strict "refs"; - # get (hard) ref to eponymous meta-object - return \%$class; - } - - sub new { - my $obclass = shift; - my $classobj = $obclass->_classobj(); - bless my $self = { - ObData1 => "", - ObData2 => "", - CData1 => \$classobj->{CData1}, - CData2 => \$classobj->{CData2}, - } => (ref $obclass || $obclass); - return $self; - } - - sub ObData1 { - my $self = shift; - $self->{ObData1} = shift if @_; - return $self->{ObData1}; - } - - sub ObData2 { - my $self = shift; - $self->{ObData2} = shift if @_; - return $self->{ObData2}; - } - - sub CData1 { - my $self = shift; - $self = $self->_classobj() unless ref $self; - my $dataref = $self->{CData1}; - $$dataref = shift if @_; - return $$dataref; - } - - sub CData2 { - my $self = shift; - $self = $self->_classobj() unless ref $self; - my $dataref = $self->{CData2}; - $$dataref = shift if @_; - return $$dataref; - } - -Not only are we now strict refs clean, using an eponymous meta-object -seems to make the code cleaner. Unlike the previous version, this one -does something interesting in the face of inheritance: it accesses the -class meta-object in the invoking class instead of the one into which -the method was initially compiled. - -You can easily access data in the class meta-object, making -it easy to dump the complete class state using an external mechanism such -as when debugging or implementing a persistent class. This works because -the class meta-object is a package variable, has a well-known name, and -clusters all its data together. (Transparent persistence -is not always feasible, but it's certainly an appealing idea.) 
- -There's still no check that object accessor methods have not been -invoked on a class name. If strict ref checking is enabled, you'd -blow up. If not, then you get the eponymous meta-object. What you do -with--or about--this is up to you. The next two sections demonstrate -innovative uses for this powerful feature. - -=head2 Monadic Classes - -Some of the standard modules shipped with Perl provide class interfaces -without any attribute methods whatsoever. The most commonly used module -not numbered amongst the pragmata, the Exporter module, is a class with -neither constructors nor attributes. Its job is simply to provide a -standard interface for modules wishing to export part of their namespace -into that of their caller. Modules use the Exporter's &import method by -setting their inheritance list in their package's @ISA array to mention -"Exporter". But class Exporter provides no constructor, so you can't -have several instances of the class. In fact, you can't have any--it -just doesn't make any sense. All you get is its methods. Its interface -contains no statefulness, so state data is wholly superfluous. - -Another sort of class that pops up from time to time is one that supports -a unique instance. Such classes are called I<monadic classes>, or less -formally, I<singletons> or I<highlander classes>. - -If a class is monadic, where do you store its state, that is, -its attributes? How do you make sure that there's never more than -one instance? While you could merely use a slew of package variables, -it's a lot cleaner to use the eponymously named hash. 
Here's a complete -example of a monadic class: - - package Cosmos; - %Cosmos = (); - - # accessor method for "name" attribute - sub name { - my $self = shift; - $self->{name} = shift if @_; - return $self->{name}; - } - - # read-only accessor method for "birthday" attribute - sub birthday { - my $self = shift; - die "can't reset birthday" if @_; # XXX: croak() is better - return $self->{birthday}; - } - - # accessor method for "stars" attribute - sub stars { - my $self = shift; - $self->{stars} = shift if @_; - return $self->{stars}; - } - - # oh my - one of our stars just went out! - sub supernova { - my $self = shift; - my $count = $self->stars(); - $self->stars($count - 1) if $count > 0; - } - - # constructor/initializer method - fix by reboot - sub bigbang { - my $self = shift; - %$self = ( - name => "the world according to tchrist", - birthday => time(), - stars => 0, - ); - return $self; # yes, it's probably a class. SURPRISE! - } - - # After the class is compiled, but before any use or require - # returns, we start off the universe with a bang. - __PACKAGE__ -> bigbang(); - -Hold on, that doesn't look like anything special. Those attribute -accessors look no different than they would if this were a regular class -instead of a monadic one. The crux of the matter is there's nothing -that says that $self must hold a reference to a blessed object. It merely -has to be something you can invoke methods on. Here the package name -itself, Cosmos, works as an object. Look at the &supernova method. Is that -a class method or an object method? The answer is that static analysis -cannot reveal the answer. Perl doesn't care, and neither should you. -In the three attribute methods, C<%$self> is really accessing the %Cosmos -package variable. - -If like Stephen Hawking, you posit the existence of multiple, sequential, -and unrelated universes, then you can invoke the &bigbang method yourself -at any time to start everything all over again. 
You might think of -&bigbang as more of an initializer than a constructor, since the function -doesn't allocate new memory; it only initializes what's already there. -But like any other constructor, it does return a scalar value to use -for later method invocations. - -Imagine that some day in the future, you decide that one universe just -isn't enough. You could write a new class from scratch, but you already -have an existing class that does what you want--except that it's monadic, -and you want more than just one cosmos. - -That's what code reuse via subclassing is all about. Look how short -the new code is: - - package Multiverse; - use Cosmos; - @ISA = qw(Cosmos); - - sub new { - my $protoverse = shift; - my $class = ref($protoverse) || $protoverse; - my $self = {}; - return bless($self, $class)->bigbang(); - } - 1; - -Because we were careful to be good little creators when we designed our -Cosmos class, we can now reuse it without touching a single line of code -when it comes time to write our Multiverse class. The same code that -worked when invoked as a class method continues to work perfectly well -when invoked against separate instances of a derived class. - -The astonishing thing about the Cosmos class above is that the value -returned by the &bigbang "constructor" is not a reference to a blessed -object at all. It's just the class's own name. A class name is, for -virtually all intents and purposes, a perfectly acceptable object. -It has state, behavior, and identity, the three crucial components -of an object system. It even manifests inheritance, polymorphism, -and encapsulation. And what more can you ask of an object? - -To understand object orientation in Perl, it's important to recognize the -unification of what other programming languages might think of as class -methods and object methods into just plain methods. 
"Class methods" -and "object methods" are distinct only in the compartmentalizing mind -of the Perl programmer, not in the Perl language itself. - -Along those same lines, a constructor is nothing special either, which -is one reason why Perl has no pre-ordained name for them. "Constructor" -is just an informal term loosely used to describe a method that returns -a scalar value that you can make further method calls against. So long -as it's either a class name or an object reference, that's good enough. -It doesn't even have to be a reference to a brand new object. - -You can have as many--or as few--constructors as you want, and you can -name them whatever you care to. Blindly and obediently using new() -for each and every constructor you ever write is to speak Perl with -such a severe C++ accent that you do a disservice to both languages. -There's no reason to insist that each class have but one constructor, -or that a constructor be named new(), or that a constructor be -used solely as a class method and not an object method. - -The next section shows how useful it can be to further distance ourselves -from any formal distinction between class method calls and object method -calls, both in constructors and in accessor methods. - -=head2 Translucent Attributes - -A package's eponymous hash can be used for more than just containing -per-class, global state data. It can also serve as a sort of template -containing default settings for object attributes. These default -settings can then be used in constructors for initialization of a -particular object. The class's eponymous hash can also be used to -implement I<translucent attributes>. A translucent attribute is one -that has a class-wide default. Each object can set its own value for the -attribute, in which case C<< $object->attribute() >> returns that value. -But if no value has been set, then C<< $object->attribute() >> returns -the class-wide default. 
- -We'll apply something of a copy-on-write approach to these translucent -attributes. If you're just fetching values from them, you get -translucency. But if you store a new value to them, that new value is -set on the current object. On the other hand, if you use the class as -an object and store the attribute value directly on the class, then the -meta-object's value changes, and later fetch operations on objects with -uninitialized values for those attributes will retrieve the meta-object's -new values. Objects with their own initialized values, however, won't -see any change. - -Let's look at some concrete examples of using these properties before we -show how to implement them. Suppose that a class named Vermin -had a translucent data attribute called "color". First you set the color -in the meta-object, then you create three objects using a constructor -that happens to be named &spawn. - - use Vermin; - Vermin->color("vermilion"); - - $ob1 = Vermin->spawn(); # so that's where Jedi come from - $ob2 = Vermin->spawn(); - $ob3 = Vermin->spawn(); - - print $ob3->color(); # prints "vermilion" - -Each of these objects' colors is now "vermilion", because that's the -meta-object's value for that attribute, and these objects do not have -individual color values set. - -Changing the attribute on one object has no effect on other objects -previously created. - - $ob3->color("chartreuse"); - print $ob3->color(); # prints "chartreuse" - print $ob1->color(); # prints "vermilion", translucently - -If you now use $ob3 to spawn off another object, the new object will -take the color its parent held, which now happens to be "chartreuse". -That's because the constructor uses the invoking object as its template -for initializing attributes. When that invoking object is the -class name, the object used as a template is the eponymous meta-object.
-When the invoking object is a reference to an instantiated object, the -&spawn constructor uses that existing object as a template. - - $ob4 = $ob3->spawn(); # $ob3 now template, not %Vermin - print $ob4->color(); # prints "chartreuse" - -Any actual values set on the template object will be copied to the -new object. But attributes undefined in the template object, being -translucent, will remain undefined and consequently translucent in the -new one as well. - -Now let's change the color attribute on the entire class: - - Vermin->color("azure"); - print $ob1->color(); # prints "azure" - print $ob2->color(); # prints "azure" - print $ob3->color(); # prints "chartreuse" - print $ob4->color(); # prints "chartreuse" - -That color change took effect only in the first pair of objects, which -were still translucently accessing the meta-object's values. The second -pair had per-object initialized colors, and so didn't change. - -One important question remains. Changes to the meta-object are reflected -in translucent attributes in the entire class, but what about -changes to discrete objects? If you change the color of $ob3, does the -value of $ob4 see that change? Or vice-versa. If you change the color -of $ob4, does then the value of $ob3 shift? - - $ob3->color("amethyst"); - print $ob3->color(); # prints "amethyst" - print $ob4->color(); # hmm: "chartreuse" or "amethyst"? - -While one could argue that in certain rare cases it should, let's not -do that. Good taste aside, we want the answer to the question posed in -the comment above to be "chartreuse", not "amethyst". So we'll treat -these attributes similar to the way process attributes like environment -variables, user and group IDs, or the current working directory are -treated across a fork(). You can change only yourself, but you will see -those changes reflected in your unspawned children. Changes to one object -will propagate neither up to the parent nor down to any existing child objects. 
-Those objects made later, however, will see the changes. - -If you have an object with an actual attribute value, and you want to -make that object's attribute value translucent again, what do you do? -Let's design the class so that when you invoke an accessor method with -C<undef> as its argument, that attribute returns to translucency. - - $ob4->color(undef); # back to "azure" - -Here's a complete implementation of Vermin as described above. - - package Vermin; - - # here's the class meta-object, eponymously named. - # it holds all class attributes, and also all instance attributes - # so the latter can be used for both initialization - # and translucency. - - our %Vermin = ( # our() is new to perl5.6 - PopCount => 0, # capital for class attributes - color => "beige", # small for instance attributes - ); - - # constructor method - # invoked as class method or object method - sub spawn { - my $obclass = shift; - my $class = ref($obclass) || $obclass; - my $self = {}; - bless($self, $class); - $class->{PopCount}++; - # init fields from invoking object, or omit if - # invoking object is the class to provide translucency - %$self = %$obclass if ref $obclass; - return $self; - } - - # translucent accessor for "color" attribute - # invoked as class method or object method - sub color { - my $self = shift; - my $class = ref($self) || $self; - - # handle class invocation - unless (ref $self) { - $class->{color} = shift if @_; - return $class->{color} - } - - # handle object invocation - $self->{color} = shift if @_; - if (defined $self->{color}) { # not exists! 
- return $self->{color}; - } else { - return $class->{color}; - } - } - - # accessor for "PopCount" class attribute - # invoked as class method or object method - # but uses object solely to locate meta-object - sub population { - my $obclass = shift; - my $class = ref($obclass) || $obclass; - return $class->{PopCount}; - } - - # instance destructor - # invoked only as object method - sub DESTROY { - my $self = shift; - my $class = ref $self; - $class->{PopCount}--; - } - -Here are a couple of helper methods that might be convenient. They aren't -accessor methods at all. They're used to detect accessibility of data -attributes. The &is_translucent method determines whether a particular -object attribute is coming from the meta-object. The &has_attribute -method detects whether a class implements a particular property at all. -It could also be used to distinguish undefined properties from non-existent -ones. - - # detect whether an object attribute is translucent - # (typically?) invoked only as object method - sub is_translucent { - my($self, $attr) = @_; - return !defined $self->{$attr}; - } - - # test for presence of attribute in class - # invoked as class method or object method - sub has_attribute { - my($self, $attr) = @_; - my $class = ref($self) || $self; - return exists $class->{$attr}; - } - -If you prefer to install your accessors more generically, you can make -use of the upper-case versus lower-case convention to register into the -package appropriate methods cloned from generic closures. - - for my $datum (keys %{ +__PACKAGE__ }) { - *$datum = ($datum =~ /^[A-Z]/) - ? sub { # install class accessor - my $obclass = shift; - my $class = ref($obclass) || $obclass; - return $class->{$datum}; - } - : sub { # install translucent accessor - my $self = shift; - my $class = ref($self) || $self; - unless (ref $self) { - $class->{$datum} = shift if @_; - return $class->{$datum} - } - $self->{$datum} = shift if @_; - return defined $self->{$datum} - ? 
$self -> {$datum} - : $class -> {$datum} - } - } - -Translations of this closure-based approach into C++, Java, and Python -have been left as exercises for the reader. Be sure to send us mail as -soon as you're done. - -=head1 Class Data as Lexical Variables - -=head2 Privacy and Responsibility - -Unlike conventions used by some Perl programmers, in the previous -examples, we didn't prefix the package variables used for class attributes -with an underscore, nor did we do so for the names of the hash keys used -for instance attributes. You don't need little markers on data names to -suggest nominal privacy on attribute variables or hash keys, because these -are B<already> notionally private! Outsiders have no business whatsoever -playing with anything within a class save through the mediated access of -its documented interface; in other words, through method invocations. -And not even through just any method, either. Methods that begin with -an underscore are traditionally considered off-limits outside the class. -If outsiders skip the documented method interface to poke around the -internals of your class and end up breaking something, that's not your -fault--it's theirs. - -Perl believes in individual responsibility rather than mandated control. -Perl respects you enough to let you choose your own preferred level of -pain, or of pleasure. Perl believes that you are creative, intelligent, -and capable of making your own decisions--and fully expects you to -take complete responsibility for your own actions. In a perfect world, -these admonitions alone would suffice, and everyone would be intelligent, -responsible, happy, and creative. And careful. One probably shouldn't -forget careful, and that's a good bit harder to expect. Even Einstein -would take wrong turns by accident and end up lost in the wrong part -of town. - -Some folks get the heebie-jeebies when they see package variables -hanging out there for anyone to reach over and alter them. 
Some folks -live in constant fear that someone somewhere might do something wicked. -The solution to that problem is simply to fire the wicked, of course. -But unfortunately, it's not as simple as all that. These cautious -types are also afraid that they or others will do something not so -much wicked as careless, whether by accident or out of desperation. -If we fire everyone who ever gets careless, pretty soon there won't be -anybody left to get any work done. - -Whether it's needless paranoia or sensible caution, this uneasiness can -be a problem for some people. We can take the edge off their discomfort -by providing the option of storing class attributes as lexical variables -instead of as package variables. The my() operator is the source of -all privacy in Perl, and it is a powerful form of privacy indeed. - -It is widely perceived, and indeed has often been written, that Perl -provides no data hiding, that it affords the class designer no privacy -nor isolation, merely a rag-tag assortment of weak and unenforceable -social conventions instead. This perception is demonstrably false and -easily disproven. In the next section, we show how to implement forms -of privacy that are far stronger than those provided in nearly any -other object-oriented language. - -=head2 File-Scoped Lexicals - -A lexical variable is visible only through the end of its static scope. -That means that the only code able to access that variable is code -residing textually below the my() operator through the end of its block -if it has one, or through the end of the current file if it doesn't. - -Starting again with our simplest example given at the start of this -document, we replace our() variables with my() versions. 
- - package Some_Class; - my($CData1, $CData2); # file scope, not in any package - sub CData1 { - shift; # XXX: ignore calling class/object - $CData1 = shift if @_; - return $CData1; - } - sub CData2 { - shift; # XXX: ignore calling class/object - $CData2 = shift if @_; - return $CData2; - } - -So much for that old $Some_Class::CData1 package variable and its brethren! -Those are gone now, replaced with lexicals. No one outside the -scope can reach in and alter the class state without resorting to the -documented interface. Not even subclasses or superclasses of -this one have unmediated access to $CData1. They have to invoke the &CData1 -method against Some_Class or an instance thereof, just like anybody else. - -To be scrupulously honest, that last statement assumes you haven't packed -several classes together into the same file scope, nor strewn your class -implementation across several different files. Accessibility of those -variables is based uniquely on the static file scope. It has nothing to -do with the package. That means that code in a different file but -the same package (class) could not access those variables, yet code in the -same file but a different package (class) could. There are sound reasons -why we usually suggest a one-to-one mapping between files and packages -and modules and classes. You don't have to stick to this suggestion if -you really know what you're doing, but you're apt to confuse yourself -otherwise, especially at first. - -If you'd like to aggregate your class attributes into one lexically scoped, -composite structure, you're perfectly free to do so. 
- - package Some_Class; - my %ClassData = ( - CData1 => "", - CData2 => "", - ); - sub CData1 { - shift; # XXX: ignore calling class/object - $ClassData{CData1} = shift if @_; - return $ClassData{CData1}; - } - sub CData2 { - shift; # XXX: ignore calling class/object - $ClassData{CData2} = shift if @_; - return $ClassData{CData2}; - } - -To make this more scalable as other class attributes are added, we can -again register closures into the package symbol table to create accessor -methods for them. - - package Some_Class; - my %ClassData = ( - CData1 => "", - CData2 => "", - ); - for my $datum (keys %ClassData) { - no strict "refs"; - *$datum = sub { - shift; # XXX: ignore calling class/object - $ClassData{$datum} = shift if @_; - return $ClassData{$datum}; - }; - } - -Requiring even your own class to use accessor methods like anybody else is -probably a good thing. But demanding and expecting that everyone else, -be they subclass or superclass, friend or foe, will all come to your -object through mediation is more than just a good idea. It's absolutely -critical to the model. Let there be in your mind no such thing as -"public" data, nor even "protected" data, which is a seductive but -ultimately destructive notion. Both will come back to bite at you. -That's because as soon as you take that first step out of the solid -position in which all state is considered completely private, save from the -perspective of its own accessor methods, you have violated the envelope. -And, having pierced that encapsulating envelope, you shall doubtless -someday pay the price when future changes in the implementation break -unrelated code. Considering that avoiding this infelicitous outcome was -precisely why you consented to suffer the slings and arrows of obsequious -abstraction by turning to object orientation in the first place, such -breakage seems unfortunate in the extreme. 
- -=head2 More Inheritance Concerns - -Suppose that Some_Class were used as a base class from which to derive -Another_Class. If you invoke a &CData method on the derived class or -on an object of that class, what do you get? Would the derived class -have its own state, or would it piggyback on its base class's versions -of the class attributes? - -The answer is that under the scheme outlined above, the derived class -would B<not> have its own state data. As before, whether you consider -this a good thing or a bad one depends on the semantics of the classes -involved. - -The cleanest, sanest, simplest way to address per-class state in a -lexical is for the derived class to override its base class's version -of the method that accesses the class attributes. Since the actual method -called is the one in the object's derived class if this exists, you -automatically get per-class state this way. Any urge to provide an -unadvertised method to sneak out a reference to the %ClassData hash -should be strenuously resisted. - -As with any other overridden method, the implementation in the -derived class always has the option of invoking its base class's -version of the method in addition to its own. Here's an example: - - package Another_Class; - @ISA = qw(Some_Class); - - my %ClassData = ( - CData1 => "", - ); - - sub CData1 { - my($self, $newvalue) = @_; - if (@_ > 1) { - # set locally first - $ClassData{CData1} = $newvalue; - - # then pass the buck up to the first - # overridden version, if there is one - if ($self->can("SUPER::CData1")) { - $self->SUPER::CData1($newvalue); - } - } - return $ClassData{CData1}; - } - -Those dabbling in multiple inheritance might be concerned -about there being more than one override. - - for my $parent (@ISA) { - my $methname = $parent . 
"::CData1"; - if ($self->can($methname)) { - $self->$methname($newvalue); - } - } - -Because the &UNIVERSAL::can method returns a reference -to the function directly, you can use this directly -for a significant performance improvement: - - for my $parent (@ISA) { - if (my $coderef = $self->can($parent . "::CData1")) { - $self->$coderef($newvalue); - } - } - -If you override C<UNIVERSAL::can> in your own classes, be sure to return the -reference appropriately. - -=head2 Locking the Door and Throwing Away the Key - -As currently implemented, any code within the same scope as the -file-scoped lexical %ClassData can alter that hash directly. Is that -ok? Is it acceptable or even desirable to allow other parts of the -implementation of this class to access class attributes directly? - -That depends on how careful you want to be. Think back to the Cosmos -class. If the &supernova method had directly altered $Cosmos::Stars or -C<$Cosmos::Cosmos{stars}>, then we wouldn't have been able to reuse the -class when it came to inventing a Multiverse. So letting even the class -itself access its own class attributes without the mediating intervention of -properly designed accessor methods is probably not a good idea after all. - -Restricting access to class attributes from the class itself is usually -not enforceable even in strongly object-oriented languages. But in Perl, -you can. - -Here's one way: - - package Some_Class; - - { # scope for hiding $CData1 - my $CData1; - sub CData1 { - shift; # XXX: unused - $CData1 = shift if @_; - return $CData1; - } - } - - { # scope for hiding $CData2 - my $CData2; - sub CData2 { - shift; # XXX: unused - $CData2 = shift if @_; - return $CData2; - } - } - -No one--absolutely no one--is allowed to read or write the class -attributes without the mediation of the managing accessor method, since -only that method has access to the lexical variable it's managing. 
-This use of mediated access to class attributes is a form of privacy far -stronger than most OO languages provide. - -The repetition of code used to create per-datum accessor methods chafes -at our Laziness, so we'll again use closures to create similar -methods. - - package Some_Class; - - { # scope for ultra-private meta-object for class attributes - my %ClassData = ( - CData1 => "", - CData2 => "", - ); - - for my $datum (keys %ClassData ) { - no strict "refs"; - *$datum = sub { - use strict "refs"; - my ($self, $newvalue) = @_; - $ClassData{$datum} = $newvalue if @_ > 1; - return $ClassData{$datum}; - } - } - - } - -The closure above can be modified to take inheritance into account using -the &UNIVERSAL::can method and SUPER as shown previously. - -=head2 Translucency Revisited - -The Vermin class demonstrates translucency using a package variable, -eponymously named %Vermin, as its meta-object. If you prefer to -use absolutely no package variables beyond those necessary to appease -inheritance or possibly the Exporter, this strategy is closed to you. -That's too bad, because translucent attributes are an appealing -technique, so it would be valuable to devise an implementation using -only lexicals. - -There's a second reason why you might wish to avoid the eponymous -package hash. If you use class names with double-colons in them, you -would end up poking around somewhere you might not have meant to poke. - - package Vermin; - $class = "Vermin"; - $class->{PopCount}++; - # accesses $Vermin::Vermin{PopCount} - - package Vermin::Noxious; - $class = "Vermin::Noxious"; - $class->{PopCount}++; - # accesses $Vermin::Noxious{PopCount} - -In the first case, because the class name had no double-colons, we got -the hash in the current package. But in the second case, instead of -getting some hash in the current package, we got the hash %Noxious in -the Vermin package. (The noxious vermin just invaded another package and -sprayed their data around it. 
:-) Perl doesn't support relative packages -in its naming conventions, so any double-colons trigger a fully-qualified -lookup instead of just looking in the current package. - -In practice, it is unlikely that the Vermin class had an existing -package variable named %Noxious that you just blew away. If you're -still mistrustful, you could always stake out your own territory -where you know the rules, such as using Eponymous::Vermin::Noxious or -Hieronymus::Vermin::Boschious or Leave_Me_Alone::Vermin::Noxious as class -names instead. Sure, it's in theory possible that someone else has -a class named Eponymous::Vermin with its own %Noxious hash, but this -kind of thing is always true. There's no arbiter of package names. -It's always the case that globals like @Cwd::ISA would collide if more -than one class uses the same Cwd package. - -If this still leaves you with an uncomfortable twinge of paranoia, -we have another solution for you. There's nothing that says that you -have to have a package variable to hold a class meta-object, either for -monadic classes or for translucent attributes. Just code up the methods -so that they access a lexical instead. - -Here's another implementation of the Vermin class with semantics identical -to those given previously, but this time using no package variables. - - package Vermin; - - - # Here's the class meta-object, eponymously named. - # It holds all class data, and also all instance data - # so the latter can be used for both initialization - # and translucency. it's a template. 
- my %ClassData = ( - PopCount => 0, # capital for class attributes - color => "beige", # small for instance attributes - ); - - # constructor method - # invoked as class method or object method - sub spawn { - my $obclass = shift; - my $class = ref($obclass) || $obclass; - my $self = {}; - bless($self, $class); - $ClassData{PopCount}++; - # init fields from invoking object, or omit if - # invoking object is the class to provide translucency - %$self = %$obclass if ref $obclass; - return $self; - } - - # translucent accessor for "color" attribute - # invoked as class method or object method - sub color { - my $self = shift; - - # handle class invocation - unless (ref $self) { - $ClassData{color} = shift if @_; - return $ClassData{color} - } - - # handle object invocation - $self->{color} = shift if @_; - if (defined $self->{color}) { # not exists! - return $self->{color}; - } else { - return $ClassData{color}; - } - } - - # class attribute accessor for "PopCount" attribute - # invoked as class method or object method - sub population { - return $ClassData{PopCount}; - } - - # instance destructor; invoked only as object method - sub DESTROY { - $ClassData{PopCount}--; - } - - # detect whether an object attribute is translucent - # (typically?) invoked only as object method - sub is_translucent { - my($self, $attr) = @_; - $self = \%ClassData if !ref $self; - return !defined $self->{$attr}; - } - - # test for presence of attribute in class - # invoked as class method or object method - sub has_attribute { - my($self, $attr) = @_; - return exists $ClassData{$attr}; - } - -=head1 NOTES - -Inheritance is a powerful but subtle device, best used only after careful -forethought and design. Aggregation instead of inheritance is often a -better approach. - -You can't use file-scoped lexicals in conjunction with the SelfLoader -or the AutoLoader, because they alter the lexical scope in which the -module's methods wind up getting compiled. 
- -The usual mealy-mouthed package-munging doubtless applies to setting -up names of object attributes. For example, C<< $self->{ObData1} >> -should probably be C<< $self->{ __PACKAGE__ . "_ObData1" } >>, but that -would just confuse the examples. - -=head1 SEE ALSO - -L<perltoot>, L<perlobj>, L<perlmod>, and L<perlbot>. - -The Tie::SecureHash and Class::Data::Inheritable modules from CPAN are -worth checking out. - -=head1 AUTHOR AND COPYRIGHT - -Copyright (c) 1999 Tom Christiansen. -All rights reserved. - -This documentation is free; you can redistribute it and/or modify it -under the same terms as Perl itself. - -Irrespective of its distribution, all code examples in this file -are hereby placed into the public domain. You are permitted and -encouraged to use this code in your own programs for fun -or for profit as you see fit. A simple comment in the code giving -credit would be courteous but is not required. - -=head1 ACKNOWLEDGEMENTS - -Russ Allbery, Jon Orwant, Randy Ray, Larry Rosler, Nat Torkington, -and Stephen Warren all contributed suggestions and corrections to this -piece. Thanks especially to Damian Conway for his ideas and feedback, -and without whose indirect prodding I might never have taken the time -to show others how much Perl has to offer in the way of objects once -you start thinking outside the tiny little box that today's "popular" -object-oriented languages enforce. - -=head1 HISTORY +For information on OO programming with Perl, please see L<perlootut> +and L<perlobj>. -Last edit: Sun Feb 4 20:50:28 EST 2001 +=cut diff --git a/gnu/usr.bin/perl/pod/perlunicode.pod b/gnu/usr.bin/perl/pod/perlunicode.pod index 1f4be434da0..77daca34a7d 100644 --- a/gnu/usr.bin/perl/pod/perlunicode.pod +++ b/gnu/usr.bin/perl/pod/perlunicode.pod @@ -11,29 +11,36 @@ implement the Unicode standard or the accompanying technical reports from cover to cover, Perl does support many Unicode features. 
People who want to learn to use Unicode in Perl, should probably read -L<the Perl Unicode tutorial, perlunitut|perlunitut>, before reading +the L<Perl Unicode tutorial, perlunitut|perlunitut> and +L<perluniintro>, before reading this reference document. +Also, the use of Unicode may present security issues that aren't obvious. +Read L<Unicode Security Considerations|http://www.unicode.org/reports/tr36>. + =over 4 +=item Safest if you "use feature 'unicode_strings'" + +In order to preserve backward compatibility, Perl does not turn +on full internal Unicode support unless the pragma +C<use feature 'unicode_strings'> is specified. (This is automatically +selected if you use C<use 5.012> or higher.) Failure to do this can +trigger unexpected surprises. See L</The "Unicode Bug"> below. + +This pragma doesn't affect I/O, and there are still several places +where Unicode isn't fully supported, such as in filenames. + =item Input and Output Layers Perl knows when a filehandle uses Perl's internal Unicode encodings (UTF-8, or UTF-EBCDIC if in EBCDIC) if the filehandle is opened with -the ":utf8" layer. Other encodings can be converted to Perl's +the ":encoding(utf8)" layer. Other encodings can be converted to Perl's encoding on input or from Perl's encoding on output by use of the ":encoding(...)" layer. See L<open>. To indicate that Perl source itself is in UTF-8, use C<use utf8;>. -=item Regular Expressions - -The regular expression compiler produces polymorphic opcodes. That is, -the pattern adapts to the data and automatically switches to the Unicode -character scheme when presented with data that is internally encoded in -UTF-8, or instead uses a traditional byte scheme when presented with -byte data. - =item C<use utf8> still needed to enable UTF-8/UTF-EBCDIC in scripts As a compatibility measure, the C<use utf8> pragma must be explicitly @@ -68,20 +75,33 @@ See L</"Byte and Character Semantics"> for more details. 
Beginning with version 5.6, Perl uses logically-wide characters to represent strings internally. -In future, Perl-level operations will be expected to work with -characters rather than bytes. - -However, as an interim compatibility measure, Perl aims to -provide a safe migration path from byte semantics to character -semantics for programs. For operations where Perl can unambiguously -decide that the input data are characters, Perl switches to -character semantics. For operations where this determination cannot -be made without additional information from the user, Perl decides in -favor of compatibility and chooses to use byte semantics. - -Under byte semantics, when C<use locale> is in effect, Perl uses the -semantics associated with the current locale. Absent a C<use locale>, and -absent a C<use feature 'unicode_strings'> pragma, Perl currently uses US-ASCII +Starting in Perl 5.14, Perl-level operations work with +characters rather than bytes within the scope of a +C<L<use feature 'unicode_strings'|feature>> (or equivalently +C<use 5.012> or higher). (This is not true if bytes have been +explicitly requested by C<L<use bytes|bytes>>, nor necessarily true +for interactions with the platform's operating system.) + +For earlier Perls, and when C<unicode_strings> is not in effect, Perl +provides a fairly safe environment that can handle both types of +semantics in programs. For operations where Perl can unambiguously +decide that the input data are characters, Perl switches to character +semantics. For operations where this determination cannot be made +without additional information from the user, Perl decides in favor of +compatibility and chooses to use byte semantics. + +When C<use locale> (but not C<use locale ':not_characters'>) is in +effect, Perl uses the semantics associated with the current locale. 
+(C<use locale> overrides C<use feature 'unicode_strings'> in the same scope; +while C<use locale ':not_characters'> effectively also selects +C<use feature 'unicode_strings'> in its scope; see L<perllocale>.) +Otherwise, Perl uses the platform's native +byte semantics for characters whose code points are less than 256, and +Unicode semantics for those greater than 255. On EBCDIC platforms, this +is almost seamless, as the EBCDIC code pages that Perl handles are +equivalent to Unicode's first 256 code points. (The exception is that +EBCDIC regular expression case-insensitive matching rules are not as +robust as Unicode's.) But on ASCII platforms, Perl uses US-ASCII (or Basic Latin in Unicode terminology) byte semantics, meaning that characters whose ordinal numbers are in the range 128 - 255 are undefined except for their ordinal numbers. This means that none have case (upper and lower), nor are any @@ -95,31 +115,12 @@ character data. Such data may come from filehandles, from calls to external programs, from information provided by the system (such as %ENV), or from literals and constants in the source text. -The C<bytes> pragma will always, regardless of platform, force byte -semantics in a particular lexical scope. See L<bytes>. - -The C<use feature 'unicode_strings'> pragma is intended to always, regardless -of platform, force Unicode semantics in a particular lexical scope. In -release 5.12, it is partially implemented, applying only to case changes. -See L</The "Unicode Bug"> below. - The C<utf8> pragma is primarily a compatibility device that enables recognition of UTF-(8|EBCDIC) in literals encountered by the parser. Note that this pragma is only required while Perl defaults to byte semantics; when character semantics become the default, this pragma may become a no-op. See L<utf8>. -Unless explicitly stated, Perl operators use character semantics -for Unicode data and byte semantics for non-Unicode data.
-The decision to use character semantics is made transparently. If -input data comes from a Unicode source--for example, if a character -encoding layer is added to a filehandle or a literal Unicode -string constant appears in a program--character semantics apply. -Otherwise, byte semantics are in effect. The C<bytes> pragma should -be used to force byte semantics on Unicode data, and the C<use feature -'unicode_strings'> pragma to force Unicode semantics on byte data (though in -5.12 it isn't fully implemented). - If strings operating under byte semantics and strings with Unicode character data are concatenated, the new string will have character semantics. This can cause surprises: See L</BUGS>, below. @@ -156,15 +157,17 @@ Alternatively, you can use the C<\x{...}> notation for characters 0x100 and above. For characters below 0x100 you may get byte semantics instead of character semantics; see L</The "Unicode Bug">. On EBCDIC machines there is the additional problem that the value for such characters gives the EBCDIC -character rather than the Unicode one. +character rather than the Unicode one, thus it is more portable to use +C<\N{U+...}> instead. -Additionally, if you +Additionally, you can use the C<\N{...}> notation and put the official +Unicode character name within the braces, such as +C<\N{WHITE SMILING FACE}>. This automatically loads the L<charnames> +module with the C<:full> and C<:short> options. If you prefer different +options for this module, you can instead, before the C<\N{...}>, +explicitly load it with your desired options; for example, - use charnames ':full'; - -you can use the C<\N{...}> notation and put the official Unicode -character name within the braces, such as C<\N{WHITE SMILING FACE}>. -See L<charnames>. + use charnames ':loose'; =item * @@ -180,15 +183,15 @@ a character instead of a byte. 
=item * -Character classes in regular expressions match characters instead of +Bracketed character classes in regular expressions match characters instead of bytes and match against the character properties specified in the Unicode properties database. C<\w> can be used to match a Japanese ideograph, for instance. =item * -Named Unicode properties, scripts, and block ranges may be used like -character classes via the C<\p{}> "matches property" construct and +Named Unicode properties, scripts, and block ranges may be used (like bracketed +character classes) by using the C<\p{}> "matches property" construct and the C<\P{}> negation, "doesn't match property". See L</"Unicode Character Properties"> for more details. @@ -261,9 +264,13 @@ complement B<and> the full character-wide bit complement. =item * -You can define your own mappings to be used in lc(), -lcfirst(), uc(), and ucfirst() (or their string-inlined versions). -See L</"User-Defined Case Mappings"> for more details. +There is a CPAN module, L<Unicode::Casing>, which allows you to define +your own mappings to be used in C<lc()>, C<lcfirst()>, C<uc()>, +C<ucfirst()>, and C<fc> (or their double-quoted string inlined +versions such as C<\U>). +(Prior to Perl 5.16, this functionality was partially provided +in the Perl core, but suffered from a number of insurmountable +drawbacks, so the CPAN module was written instead.) =back @@ -277,26 +284,32 @@ And finally, C<scalar reverse()> reverses by character rather than by byte. =head2 Unicode Character Properties -Most Unicode character properties are accessible by using regular expressions. -They are used like character classes via the C<\p{}> "matches property" -construct and the C<\P{}> negation, "doesn't match property". +(The only time that Perl considers a sequence of individual code +points as a single logical character is in the C<\X> construct, already +mentioned above. Therefore "character" in this discussion means a single +Unicode code point.) 
+ +Very nearly all Unicode character properties are accessible through +regular expressions by using the C<\p{}> "matches property" construct +and the C<\P{}> "doesn't match property" for its negation. -For instance, C<\p{Uppercase}> matches any character with the Unicode +For instance, C<\p{Uppercase}> matches any single character with the Unicode "Uppercase" property, while C<\p{L}> matches any character with a General_Category of "L" (letter) property. Brackets are not -required for single letter properties, so C<\p{L}> is equivalent to C<\pL>. +required for single letter property names, so C<\p{L}> is equivalent to C<\pL>. -More formally, C<\p{Uppercase}> matches any character whose Unicode Uppercase -property value is True, and C<\P{Uppercase}> matches any character whose -Uppercase property value is False, and they could have been written as -C<\p{Uppercase=True}> and C<\p{Uppercase=False}>, respectively +More formally, C<\p{Uppercase}> matches any single character whose Unicode +Uppercase property value is True, and C<\P{Uppercase}> matches any character +whose Uppercase property value is False, and they could have been written as +C<\p{Uppercase=True}> and C<\p{Uppercase=False}>, respectively. -This formality is needed when properties are not binary, that is if they can +This formality is needed when properties are not binary; that is, if they can take on more values than just True and False. For example, the Bidi_Class (see -L</"Bidirectional Character Types"> below), can take on a number of different +L</"Bidirectional Character Types"> below), can take on several different values, such as Left, Right, Whitespace, and others. To match these, one needs -to specify the property name (Bidi_Class), and the value being matched against -(Left, Right, I<etc.>). This is done, as in the examples above, by having the +to specify both the property name (Bidi_Class), AND the value being +matched against +(Left, Right, etc.). 
This is done, as in the examples above, by having the two components separated by an equal sign (or interchangeably, a colon), like C<\p{Bidi_Class: Left}>. @@ -308,9 +321,9 @@ below, in which you may omit the property name and the equals or colon separator. Most Unicode character properties have at least two synonyms (or aliases if you -prefer), a short one that is easier to type, and a longer one which is more -descriptive and hence it is easier to understand what it means. Thus the "L" -and "Letter" above are equivalent and can be used interchangeably. Likewise, +prefer): a short one that is easier to type and a longer one that is more +descriptive and hence easier to understand. Thus the "L" and "Letter" properties +above are equivalent and can be used interchangeably. Likewise, "Upper" is a synonym for "Uppercase", and we could have written C<\p{Uppercase}> equivalently as C<\p{Upper}>. Also, there are typically various synonyms for the values the property can be. For binary properties, @@ -321,24 +334,51 @@ General_Category property, "L" means "Letter", but for the Bidi_Class property, "L" means "Left". A complete list of properties and synonyms is in L<perluniprops>. -Upper/lower case differences in the property names and values are irrelevant, +Upper/lower case differences in property names and values are irrelevant; thus C<\p{Upper}> means the same thing as C<\p{upper}> or even C<\p{UpPeR}>. Similarly, you can add or subtract underscores anywhere in the middle of a word, so that these are also equivalent to C<\p{U_p_p_e_r}>. And white space is irrelevant adjacent to non-word characters, such as the braces and the equals -or colon separators so C<\p{ Upper }> and C<\p{ Upper_case : Y }> are -equivalent to these as well. In fact, in most cases, white space and even -hyphens can be added or deleted anywhere. So even C<\p{ Up-per case = Yes}> is +or colon separators, so C<\p{ Upper }> and C<\p{ Upper_case : Y }> are +equivalent to these as well. 
In fact, white space and even +hyphens can usually be added or deleted anywhere. So even C<\p{ Up-per case = Yes}> is equivalent. All this is called "loose-matching" by Unicode. The few places -where stricter matching is employed is in the middle of numbers, and the Perl +where stricter matching is used are in the middle of numbers, and in the Perl extension properties that begin or end with an underscore. Stricter matching -cares about white space (except adjacent to the non-word characters) and +cares about white space (except adjacent to non-word characters), hyphens, and non-interior underscores. You can also use negation in both C<\p{}> and C<\P{}> by introducing a caret (^) between the first brace and the property name: C<\p{^Tamil}> is equal to C<\P{Tamil}>. +Almost all properties are immune to case-insensitive matching. That is, +adding a C</i> regular expression modifier does not change what they +match. There are two sets that are affected. +The first set is +C<Uppercase_Letter>, +C<Lowercase_Letter>, +and C<Titlecase_Letter>, +all of which match C<Cased_Letter> under C</i> matching. +And the second set is +C<Uppercase>, +C<Lowercase>, +and C<Titlecase>, +all of which match C<Cased> under C</i> matching. +This set also includes its subsets C<PosixUpper> and C<PosixLower>, both +of which under C</i> matching match C<PosixAlpha>. +(The difference between these sets is that some things, such as Roman +numerals, come in both upper and lower case so they are C<Cased>, but aren't considered +letters, so they aren't C<Cased_Letter>s.) + +The result is undefined if you try to match a non-Unicode code point +(that is, one above 0x10FFFF) against a Unicode property. Currently, a +warning is raised, and the match will fail. In some cases, this is +counterintuitive, as both these fail: + + chr(0x110000) =~ /\p{ASCII_Hex_Digit=True}/ # Fails. + chr(0x110000) =~ /\p{ASCII_Hex_Digit=False}/ # Fails!
+ =head3 B<General_Category> Every Unicode character is assigned a general category, which is the "most @@ -395,26 +435,20 @@ Here are the short and long forms of the General Category properties: Zp Paragraph_Separator C Other - Cc Control (also Cntrl) + Cc Control (also Cntrl) Cf Format - Cs Surrogate (not usable) + Cs Surrogate Co Private_Use Cn Unassigned Single-letter properties match all characters in any of the two-letter sub-properties starting with the same letter. -C<LC> and C<L&> are special cases, which are aliases for the set of -C<Ll>, C<Lu>, and C<Lt>. - -Because Perl hides the need for the user to understand the internal -representation of Unicode characters, there is no need to implement -the somewhat messy concept of surrogates. C<Cs> is therefore not -supported. +C<LC> and C<L&> are special: both are aliases for the set consisting of everything matched by C<Ll>, C<Lu>, and C<Lt>. =head3 B<Bidirectional Character Types> -Because scripts differ in their directionality--Hebrew is -written right to left, for example--Unicode supplies these properties in +Because scripts differ in their directionality (Hebrew and Arabic are +written right to left, for example) Unicode supplies these properties in the Bidi_Class class: Property Meaning @@ -445,16 +479,68 @@ written right to left. =head3 B<Scripts> -The world's languages are written in a number of scripts. This sentence +The world's languages are written in many different scripts. This sentence (unless you're reading it in translation) is written in Latin, while Russian is -written in Cyrllic, and Greek is written in, well, Greek; Japanese mainly in +written in Cyrillic, and Greek is written in, well, Greek; Japanese mainly in Hiragana or Katakana. There are many more. -The Unicode Script property gives what script a given character is in, -and can be matched with the compound form like C<\p{Script=Hebrew}> (short: -C<\p{sc=hebr}>). Perl furnishes shortcuts for all script names. 
You can omit -everything up through the equals (or colon), and simply write C<\p{Latin}> or -C<\P{Cyrillic}>. +The Unicode Script and Script_Extensions properties give what script a +given character is in. Either property can be specified with the +compound form like +C<\p{Script=Hebrew}> (short: C<\p{sc=hebr}>), or +C<\p{Script_Extensions=Javanese}> (short: C<\p{scx=java}>). +In addition, Perl furnishes shortcuts for all +C<Script> property names. You can omit everything up through the equals +(or colon), and simply write C<\p{Latin}> or C<\P{Cyrillic}>. +(This is not true for C<Script_Extensions>, which is required to be +written in the compound form.) + +The difference between these two properties involves characters that are +used in multiple scripts. For example the digits '0' through '9' are +used in many parts of the world. These are placed in a script named +C<Common>. Other characters are used in just a few scripts. For +example, the "KATAKANA-HIRAGANA DOUBLE HYPHEN" is used in both Japanese +scripts, Katakana and Hiragana, but nowhere else. The C<Script> +property places all characters that are used in multiple scripts in the +C<Common> script, while the C<Script_Extensions> property places those +that are used in only a few scripts into each of those scripts; while +still using C<Common> for those used in many scripts. 
Thus both these +match: + + "0" =~ /\p{sc=Common}/ # Matches + "0" =~ /\p{scx=Common}/ # Matches + +and only the first of these matches: + + "\N{KATAKANA-HIRAGANA DOUBLE HYPHEN}" =~ /\p{sc=Common}/ # Matches + "\N{KATAKANA-HIRAGANA DOUBLE HYPHEN}" =~ /\p{scx=Common}/ # No match + +And only the last two of these match: + + "\N{KATAKANA-HIRAGANA DOUBLE HYPHEN}" =~ /\p{sc=Hiragana}/ # No match + "\N{KATAKANA-HIRAGANA DOUBLE HYPHEN}" =~ /\p{sc=Katakana}/ # No match + "\N{KATAKANA-HIRAGANA DOUBLE HYPHEN}" =~ /\p{scx=Hiragana}/ # Matches + "\N{KATAKANA-HIRAGANA DOUBLE HYPHEN}" =~ /\p{scx=Katakana}/ # Matches + +C<Script_Extensions> is thus an improved C<Script>, in which there are +fewer characters in the C<Common> script, and correspondingly more in +other scripts. It is new in Unicode version 6.0, and its data are likely +to change significantly in later releases, as things get sorted out. + +(Actually, besides C<Common>, the C<Inherited> script contains +characters that are used in multiple scripts. These are modifier +characters which modify other characters, and inherit the script value +of the controlling character. Some of these are used in many scripts, +and so go into C<Inherited> in both C<Script> and C<Script_Extensions>. +Others are used in just a few scripts, so are in C<Inherited> in +C<Script>, but not in C<Script_Extensions>.) + +It is worth stressing that there are several different sets of digits in +Unicode that are equivalent to 0-9 and are matchable by C<\d> in a +regular expression. If they are used in a single language only, they +are in that language's C<Script> and C<Script_Extensions>. If they are +used in more than one script, they will be in C<sc=Common>, but only +if they are used in many scripts should they be in C<scx=Common>. A complete list of scripts and their shortcuts is in L<perluniprops>. @@ -472,25 +558,24 @@ characters. 
The difference between scripts and blocks is that the concept of scripts is closer to natural languages, while the concept of blocks is more of an artificial grouping based on groups of Unicode characters with consecutive ordinal values. For example, the "Basic Latin" -block is all characters whose ordinals are between 0 and 127, inclusive, in +block is all characters whose ordinals are between 0 and 127, inclusive; in other words, the ASCII characters. The "Latin" script contains some letters -from this block as well as several more, like "Latin-1 Supplement", -"Latin Extended-A", I<etc.>, but it does not contain all the characters from -those blocks. It does not, for example, contain digits, because digits are -shared across many scripts. Digits and similar groups, like punctuation, are in -the script called C<Common>. There is also a script called C<Inherited> for -characters that modify other characters, and inherit the script value of the -controlling character. +from this as well as several other blocks, like "Latin-1 Supplement", +"Latin Extended-A", etc., but it does not contain all the characters from +those blocks. It does not, for example, contain the digits 0-9, because +those digits are shared across many scripts, and hence are in the +C<Common> script. For more about scripts versus blocks, see UAX#24 "Unicode Script Property": L<http://www.unicode.org/reports/tr24> -The Script property is likely to be the one you want to use when processing -natural language; the Block property may be useful in working with the nuts and -bolts of Unicode. +The C<Script> or C<Script_Extensions> properties are likely to be the +ones you want to use when processing +natural language; the Block property may occasionally be useful in working +with the nuts and bolts of Unicode. Block names are matched in the compound form, like C<\p{Block: Arrows}> or -C<\p{Blk=Hebrew}>. Unlike most other properties only a few block names have a +C<\p{Blk=Hebrew}>. 
Unlike most other properties, only a few block names have a Unicode-defined short name. But Perl does provide a (slight) shortcut: You can say, for example C<\p{In_Arrows}> or C<\p{In_Hebrew}>. For backwards compatibility, the C<In> prefix may be omitted if there is no naming conflict @@ -515,10 +600,10 @@ doesn't. =back -Some people just prefer to always use C<\p{Block: foo}> and C<\p{Script: bar}> -instead of the shortcuts, for clarity, and because they can't remember the -difference between 'In' and 'Is' anyway (or aren't confident that those who -eventually will read their code will know). +Some people prefer to always use C<\p{Block: foo}> and C<\p{Script: bar}> +instead of the shortcuts, whether for clarity, because they can't remember the +difference between 'In' and 'Is' anyway, or they aren't confident that those who +eventually will read their code will know that difference. A complete list of blocks and their shortcuts is in L<perluniprops>. @@ -528,13 +613,14 @@ There are many more properties than the very basic ones described here. A complete list is in L<perluniprops>. Unicode defines all its properties in the compound form, so all single-form -properties are Perl extensions. A number of these are just synonyms for the -Unicode ones, but some are genunine extensions, including a couple that are in +properties are Perl extensions. Most of these are just synonyms for the +Unicode ones, but some are genuine extensions, including several that are in the compound form. And quite a few of these are actually recommended by Unicode (in L<http://www.unicode.org/reports/tr18>). -This section gives some details on all the extensions that aren't synonyms for -compound-form Unicode properties (for those, you'll have to refer to the +This section gives some details on all extensions that aren't just +synonyms for compound-form Unicode properties +(for those properties, you'll have to refer to the L<Unicode Standard|http://www.unicode.org/reports/tr44>. 
=over @@ -553,6 +639,11 @@ This matches any C<\p{Alphabetic}> or C<\p{Decimal_Number}> character. This matches any of the 1_114_112 Unicode code points. It is a synonym for C<\p{All}>. +=item B<C<\p{ASCII}>> + +This matches any of the 128 characters in the US-ASCII character set, +which is a subset of Unicode. + =item B<C<\p{Assigned}>> This matches any assigned code point; that is, any code point whose general @@ -571,47 +662,47 @@ To understand the use of this rarely used property=value combination, it is necessary to know some basics about decomposition. Consider a character, say H. It could appear with various marks around it, such as an acute accent, or a circumflex, or various hooks, circles, arrows, -I<etc.>, above, below, to one side and/or the other, I<etc.> There are many +I<etc.>, above, below, to one side or the other, etc. There are many possibilities among the world's languages. The number of combinations is astronomical, and if there were a character for each combination, it would soon exhaust Unicode's more than a million possible characters. So Unicode took a different approach: there is a character for the base H, and a -character for each of the possible marks, and they can be combined variously +character for each of the possible marks, and these can be variously combined to get a final logical character. So a logical character--what appears to be a single character--can be a sequence of more than one individual characters. -This is called an "extended grapheme cluster". (Perl furnishes the C<\X> -construct to match such sequences.) +This is called an "extended grapheme cluster"; Perl furnishes the C<\X> +regular expression construct to match such sequences. But Unicode's intent is to unify the existing character set standards and -practices, and a number of pre-existing standards have single characters that +practices, and several pre-existing standards have single characters that mean the same thing as some of these combinations. 
An example is ISO-8859-1, which has quite a few of these in the Latin-1 range, an example being "LATIN CAPITAL LETTER E WITH ACUTE". Because this character was in this pre-existing standard, Unicode added it to its repertoire. But this character is considered -by Unicode to be equivalent to the sequence consisting of first the character -"LATIN CAPITAL LETTER E", then the character "COMBINING ACUTE ACCENT". +by Unicode to be equivalent to the sequence consisting of the character +"LATIN CAPITAL LETTER E" followed by the character "COMBINING ACUTE ACCENT". "LATIN CAPITAL LETTER E WITH ACUTE" is called a "pre-composed" character, and -the equivalence with the sequence is called canonical equivalence. All +its equivalence with the sequence is called canonical equivalence. All pre-composed characters are said to have a decomposition (into the equivalent -sequence) and the decomposition type is also called canonical. +sequence), and the decomposition type is also called canonical. However, many more characters have a different type of decomposition, a "compatible" or "non-canonical" decomposition. The sequences that form these decompositions are not considered canonically equivalent to the pre-composed character. An example, again in the Latin-1 range, is the "SUPERSCRIPT ONE". -It is kind of like a regular digit 1, but not exactly; its decomposition +It is somewhat like a regular digit 1, but not exactly; its decomposition into the digit 1 is called a "compatible" decomposition, specifically a "super" decomposition. There are several such compatibility decompositions (see L<http://www.unicode.org/reports/tr44>), including one -called "compat" which means some miscellaneous type of decomposition -that doesn't fit into the decomposition categories that Unicode has chosen. +called "compat", which means some miscellaneous type of decomposition +that doesn't fit into the decomposition categories that Unicode has chosen. 
Note that most Unicode characters don't have a decomposition, so their decomposition type is "None". -Perl has added the C<Non_Canonical> type, for your convenience, to mean any of -the compatibility decompositions. +For your convenience, Perl has added the C<Non_Canonical> decomposition +type to mean any of the several compatibility decompositions. =item B<C<\p{Graph}>> @@ -620,10 +711,10 @@ that on a printer would cause ink to be used. =item B<C<\p{HorizSpace}>> -This is the same as C<\h> and C<\p{Blank}>: A character that changes the +This is the same as C<\h> and C<\p{Blank}>: a character that changes the spacing horizontally. -=item B<C<\p{In=*}>> +=item B<C<\p{In=*}>> This is a synonym for C<\p{Present_In=*}> @@ -639,56 +730,11 @@ This is the same as C<\w>, restricted to ASCII, namely C<[A-Za-z0-9_]> Mnemonic: Perl's (original) word. -=item B<C<\p{PosixAlnum}>> - -This matches any alphanumeric character in the ASCII range, namely -C<[A-Za-z0-9]>. - -=item B<C<\p{PosixAlpha}>> - -This matches any alphabetic character in the ASCII range, namely C<[A-Za-z]>. - -=item B<C<\p{PosixBlank}>> - -This matches any blank character in the ASCII range, namely C<S<[ \t]>>. +=item B<C<\p{Posix...}>> -=item B<C<\p{PosixCntrl}>> - -This matches any control character in the ASCII range, namely C<[\x00-\x1F\x7F]> - -=item B<C<\p{PosixDigit}>> - -This matches any digit character in the ASCII range, namely C<[0-9]>. - -=item B<C<\p{PosixGraph}>> - -This matches any graphical character in the ASCII range, namely C<[\x21-\x7E]>. - -=item B<C<\p{PosixLower}>> - -This matches any lowercase character in the ASCII range, namely C<[a-z]>. - -=item B<C<\p{PosixPrint}>> - -This matches any printable character in the ASCII range, namely C<[\x20-\x7E]>. -These are the graphical characters plus SPACE. - -=item B<C<\p{PosixPunct}>> - -This matches any punctuation character in the ASCII range, namely -C<[\x21-\x2F\x3A-\x40\x5B-\x60\x7B-\x7E]>. 
These are the -graphical characters that aren't word characters. Note that the Posix standard -includes in its definition of punctuation, those characters that Unicode calls -"symbols." - -=item B<C<\p{PosixSpace}>> - -This matches any space character in the ASCII range, namely -C<S<[ \f\n\r\t\x0B]>> (the last being a vertical tab). - -=item B<C<\p{PosixUpper}>> - -This matches any uppercase character in the ASCII range, namely C<[A-Z]>. +There are several of these, which are equivalents using the C<\p> +notation for Posix classes and are described in +L<perlrecharclass/POSIX Character Classes>. =item B<C<\p{Present_In: *}>> (Short: C<\p{In=*}>) @@ -717,13 +763,12 @@ Some non-Perl implementations of the Age property may change its meaning to be the same as the Perl Present_In property; just be aware of that. Another confusion with both these properties is that the definition is not -that the code point has been assigned, but that the meaning of the code point -has been determined. This is because 66 code points will always be -unassigned, and, so the Age for them is the Unicode version the decision to -make them so was made in. For example, C<U+FDD0> is to be permanently +that the code point has been I<assigned>, but that the meaning of the code point +has been I<determined>. This is because 66 code points will always be +unassigned, and so the Age for them is the Unicode version in which the decision +to make them so was made. For example, C<U+FDD0> is to be permanently unassigned to a character, and the decision to do that was made in version 3.1, -so C<\p{Age=3.1}> matches this character and C<\p{Present_In: 3.1}> and up -matches as well. +so C<\p{Age=3.1}> matches this character, as also does C<\p{Present_In: 3.1}> and up. =item B<C<\p{Print}>> @@ -734,7 +779,14 @@ This matches any character that is graphical or blank, except controls. This is the same as C<\s>, including beyond ASCII. Mnemonic: Space, as modified by Perl. 
(It doesn't include the vertical tab -which both the Posix standard and Unicode consider to be space.) +which both the Posix standard and Unicode consider white space.) + +=item B<C<\p{Title}>> and B<C<\p{Titlecase}>> + +Under case-sensitive matching, these both match the same code points as +C<\p{General Category=Titlecase_Letter}> (C<\p{gc=lt}>). The difference +is that under C</i> caseless matching, these match the same as +C<\p{Cased}>, whereas C<\p{gc=lt}> matches C<\p{Cased_Letter}>). =item B<C<\p{VertSpace}>> @@ -742,7 +794,13 @@ This is the same as C<\v>: A character that changes the spacing vertically. =item B<C<\p{Word}>> -This is the same as C<\w>, including beyond ASCII. +This is the same as C<\w>, including over 100_000 characters beyond ASCII. + +=item B<C<\p{XPosix...}>> + +There are several of these, which are the standard Posix classes +extended to the full Unicode range. They are described in +L<perlrecharclass/POSIX Character Classes>. =back @@ -764,6 +822,16 @@ C<\p> or C<\P> construct. Note that the effect is compile-time and immutable once defined. +However, the subroutines are passed a single parameter, which is 0 if +case-sensitive matching is in effect and non-zero if caseless matching +is in effect. The subroutine may return different values depending on +the value of the flag, and one set of values will immutably be in effect +for all case-sensitive matches, and the other set for all case-insensitive +matches. + +Note that if the regular expression is tainted, then Perl will die rather +than calling the subroutine, where the name of the subroutine is +determined by the tainted data. The subroutines must return a specially-formatted string, with one or more newline-separated lines. Each line must be one of the following: @@ -782,28 +850,32 @@ tabular characters) denoting a range of Unicode code points to include. 
=item * Something to include, prefixed by "+": a built-in character -property (prefixed by "utf8::") or a user-defined character property, +property (prefixed by "utf8::") or a fully qualified (including package +name) user-defined character property, to represent all the characters in that property; two hexadecimal code points for a range; or a single hexadecimal code point. =item * Something to exclude, prefixed by "-": an existing character -property (prefixed by "utf8::") or a user-defined character property, +property (prefixed by "utf8::") or a fully qualified (including package +name) user-defined character property, to represent all the characters in that property; two hexadecimal code points for a range; or a single hexadecimal code point. =item * Something to negate, prefixed "!": an existing character -property (prefixed by "utf8::") or a user-defined character property, +property (prefixed by "utf8::") or a fully qualified (including package +name) user-defined character property, to represent all the characters in that property; two hexadecimal code points for a range; or a single hexadecimal code point. =item * Something to intersect with, prefixed by "&": an existing character -property (prefixed by "utf8::") or a user-defined character property, +property (prefixed by "utf8::") or a fully qualified (including package +name) user-defined character property, for all the characters except the characters in the property; two hexadecimal code points for a range; or a single hexadecimal code point. @@ -813,7 +885,7 @@ For example, to define a property that covers both the Japanese syllabaries (hiragana and katakana), you can define sub InKana { - return <<END; + return <<END; 3040\t309F 30A0\t30FF END @@ -825,7 +897,7 @@ Now you can use C<\p{InKana}> and C<\P{InKana}>. 
You could also have used the existing block property names: sub InKana { - return <<'END'; + return <<'END'; +utf8::InHiragana +utf8::InKatakana END @@ -836,7 +908,7 @@ not the raw block ranges: in other words, you want to remove the non-characters: sub InKana { - return <<'END'; + return <<'END'; +utf8::InHiragana +utf8::InKatakana -utf8::IsCn @@ -846,64 +918,45 @@ the non-characters: The negation is useful for defining (surprise!) negated classes. sub InNotKana { - return <<'END'; + return <<'END'; !utf8::InHiragana -utf8::InKatakana +utf8::IsCn END } -Intersection is useful for getting the common characters matched by -two (or more) classes. +This will match all non-Unicode code points, since every one of them is +not in Kana. You can use intersection to exclude these, if desired, as +this modified example shows: - sub InFooAndBar { + sub InNotKana { return <<'END'; - +main::Foo - &main::Bar + !utf8::InHiragana + -utf8::InKatakana + +utf8::IsCn + &utf8::Any END } -It's important to remember not to use "&" for the first set; that -would be intersecting with nothing (resulting in an empty set). - -=head2 User-Defined Case Mappings - -You can also define your own mappings to be used in the lc(), -lcfirst(), uc(), and ucfirst() (or their string-inlined versions). -The principle is similar to that of user-defined character -properties: to define subroutines -with names like C<ToLower> (for lc() and lcfirst()), C<ToTitle> (for -the first character in ucfirst()), and C<ToUpper> (for uc(), and the -rest of the characters in ucfirst()). - -The string returned by the subroutines needs to be two hexadecimal numbers -separated by two tabulators: the two numbers being, respectively, the source -code point and the destination code point. For example: - - sub ToUpper { - return <<END; - 0061\t\t0041 - END - } +C<&utf8::Any> must be the last line in the definition. 
-defines an uc() mapping that causes only the character "a" -to be mapped to "A"; all other characters will remain unchanged. +Intersection is used generally for getting the common characters matched +by two (or more) classes. It's important to remember not to use "&" for +the first set; that would be intersecting with nothing, resulting in an +empty set. -(For serious hackers only) The above means you have to furnish a complete -mapping; you can't just override a couple of characters and leave the rest -unchanged. You can find all the mappings in the directory -C<$Config{privlib}>/F<unicore/To/>. The mapping data is returned as the -here-document, and the C<utf8::ToSpecFoo> are special exception mappings -derived from <$Config{privlib}>/F<unicore/SpecialCasing.txt>. The "Digit" and -"Fold" mappings that one can see in the directory are not directly -user-accessible, one can use either the C<Unicode::UCD> module, or just match -case-insensitively (that's when the "Fold" mapping is used). +(Note that official Unicode properties differ from these in that they +automatically exclude non-Unicode code points and a warning is raised if +a match is attempted on one of those.) -The mappings will only take effect on scalars that have been marked as having -Unicode characters, for example by using C<utf8::upgrade()>. -Old byte-style strings are not affected. +=head2 User-Defined Case Mappings (for serious hackers only) -The mappings are in effect for the package they are defined in. +B<This feature has been removed as of Perl 5.16.> +The CPAN module L<Unicode::Casing> provides better functionality without +the drawbacks that this feature had. If you are using a Perl earlier +than 5.16, this feature was most fully documented in the 5.14 version of +this pod: +L<http://perldoc.perl.org/5.14.0/perlunicode.html#User-Defined-Case-Mappings-%28for-serious-hackers-only%29> =head2 Character Encodings for Input and Output @@ -911,10 +964,10 @@ See L<Encode>. 
=head2 Unicode Regular Expression Support Level -The following list of Unicode support for regular expressions describes -all the features currently supported. The references to "Level N" +The following list of Unicode supported features for regular expressions describes +all features currently directly supported by core Perl. The references to "Level N" and the section numbers refer to the Unicode Technical Standard #18, -"Unicode Regular Expressions", version 11, in May 2005. +"Unicode Regular Expressions", version 13, from August 2008. =over 4 @@ -922,36 +975,41 @@ and the section numbers refer to the Unicode Technical Standard #18, Level 1 - Basic Unicode Support - RL1.1 Hex Notation - done [1] - RL1.2 Properties - done [2][3] - RL1.2a Compatibility Properties - done [4] - RL1.3 Subtraction and Intersection - MISSING [5] - RL1.4 Simple Word Boundaries - done [6] - RL1.5 Simple Loose Matches - done [7] - RL1.6 Line Boundaries - MISSING [8] - RL1.7 Supplementary Code Points - done [9] - - [1] \x{...} - [2] \p{...} \P{...} - [3] supports not only minimal list, but all Unicode character - properties (see L</Unicode Character Properties>) - [4] \d \D \s \S \w \W \X [:prop:] [:^prop:] - [5] can use regular expression look-ahead [a] or - user-defined character properties [b] to emulate set operations - [6] \b \B - [7] note that Perl does Full case-folding in matching (but with bugs), - not Simple: for example U+1F88 is equivalent to U+1F00 U+03B9, - not with 1F80. This difference matters mainly for certain Greek - capital letters with certain modifiers: the Full case-folding - decomposes the letter, while the Simple case-folding would map - it to a single character. - [8] should do ^ and $ also on U+000B (\v in C), FF (\f), CR (\r), - CRLF (\r\n), NEL (U+0085), LS (U+2028), and PS (U+2029); - should also affect <>, $., and script line numbers; - should not split lines within CRLF [c] (i.e. 
there is no empty - line between \r and \n) - [9] UTF-8/UTF-EBDDIC used in perl allows not only U+10000 to U+10FFFF - but also beyond U+10FFFF [d] + RL1.1 Hex Notation - done [1] + RL1.2 Properties - done [2][3] + RL1.2a Compatibility Properties - done [4] + RL1.3 Subtraction and Intersection - MISSING [5] + RL1.4 Simple Word Boundaries - done [6] + RL1.5 Simple Loose Matches - done [7] + RL1.6 Line Boundaries - MISSING [8][9] + RL1.7 Supplementary Code Points - done [10] + + [1] \x{...} + [2] \p{...} \P{...} + [3] supports not only minimal list, but all Unicode character + properties (see Unicode Character Properties above) + [4] \d \D \s \S \w \W \X [:prop:] [:^prop:] + [5] can use regular expression look-ahead [a] or + user-defined character properties [b] to emulate set + operations + [6] \b \B + [7] note that Perl does Full case-folding in matching (but with + bugs), not Simple: for example U+1F88 is equivalent to + U+1F00 U+03B9, instead of just U+1F80. This difference + matters mainly for certain Greek capital letters with certain + modifiers: the Full case-folding decomposes the letter, + while the Simple case-folding would map it to a single + character. + [8] should do ^ and $ also on U+000B (\v in C), FF (\f), CR + (\r), CRLF (\r\n), NEL (U+0085), LS (U+2028), and PS + (U+2029); should also affect <>, $., and script line + numbers; should not split lines within CRLF [c] (i.e. there + is no empty line between \r and \n) + [9] Linebreaking conformant with UAX#14 "Unicode Line Breaking + Algorithm" is available through the Unicode::LineBreaking + module. + [10] UTF-8/UTF-EBDDIC used in Perl allows not only U+10000 to + U+10FFFF but also beyond U+10FFFF [a] You can mimic class subtraction using lookahead. For example, what UTS#18 might write as @@ -969,7 +1027,7 @@ But in this particular example, you probably really want which will match assigned characters known to be part of the Greek script. 
-Also see the Unicode::Regex::Set module, it does implement the full +Also see the L<Unicode::Regex::Set> module; it does implement the full UTS#18 grouping, intersection, union, and removal (subtraction) syntax. [b] '+' for union, '-' for removal (set-difference), '&' for intersection @@ -977,53 +1035,47 @@ UTS#18 grouping, intersection, union, and removal (subtraction) syntax. [c] Try the C<:crlf> layer (see L<PerlIO>). -[d] U+FFFF will currently generate a warning message if 'utf8' warnings are - enabled - =item * Level 2 - Extended Unicode Support - RL2.1 Canonical Equivalents - MISSING [10][11] - RL2.2 Default Grapheme Clusters - MISSING [12] - RL2.3 Default Word Boundaries - MISSING [14] - RL2.4 Default Loose Matches - MISSING [15] - RL2.5 Name Properties - MISSING [16] - RL2.6 Wildcard Properties - MISSING - - [10] see UAX#15 "Unicode Normalization Forms" - [11] have Unicode::Normalize but not integrated to regexes - [12] have \X but we don't have a "Grapheme Cluster Mode" - [14] see UAX#29, Word Boundaries - [15] see UAX#21 "Case Mappings" - [16] have \N{...} but neither compute names of CJK Ideographs - and Hangul Syllables nor use a loose match [e] + RL2.1 Canonical Equivalents - MISSING [10][11] + RL2.2 Default Grapheme Clusters - MISSING [12] + RL2.3 Default Word Boundaries - MISSING [14] + RL2.4 Default Loose Matches - MISSING [15] + RL2.5 Name Properties - DONE + RL2.6 Wildcard Properties - MISSING -[e] C<\N{...}> allows namespaces (see L<charnames>). 
+ [10] see UAX#15 "Unicode Normalization Forms" + [11] have Unicode::Normalize but not integrated to regexes + [12] have \X but we don't have a "Grapheme Cluster Mode" + [14] see UAX#29, Word Boundaries + [15] This is covered in Chapter 3.13 (in Unicode 6.0) =item * Level 3 - Tailored Support - RL3.1 Tailored Punctuation - MISSING - RL3.2 Tailored Grapheme Clusters - MISSING [17][18] - RL3.3 Tailored Word Boundaries - MISSING - RL3.4 Tailored Loose Matches - MISSING - RL3.5 Tailored Ranges - MISSING - RL3.6 Context Matching - MISSING [19] - RL3.7 Incremental Matches - MISSING + RL3.1 Tailored Punctuation - MISSING + RL3.2 Tailored Grapheme Clusters - MISSING [17][18] + RL3.3 Tailored Word Boundaries - MISSING + RL3.4 Tailored Loose Matches - MISSING + RL3.5 Tailored Ranges - MISSING + RL3.6 Context Matching - MISSING [19] + RL3.7 Incremental Matches - MISSING ( RL3.8 Unicode Set Sharing ) - RL3.9 Possible Match Sets - MISSING - RL3.10 Folded Matching - MISSING [20] - RL3.11 Submatchers - MISSING - - [17] see UAX#10 "Unicode Collation Algorithms" - [18] have Unicode::Collate but not integrated to regexes - [19] have (?<=x) and (?=x), but look-aheads or look-behinds should see - outside of the target substring - [20] need insensitive matching for linguistic features other than case; - for example, hiragana to katakana, wide and narrow, simplified Han - to traditional Han (see UTR#30 "Character Foldings") + RL3.9 Possible Match Sets - MISSING + RL3.10 Folded Matching - MISSING [20] + RL3.11 Submatchers - MISSING + + [17] see UAX#10 "Unicode Collation Algorithms" + [18] have Unicode::Collate but not integrated to regexes + [19] have (?<=x) and (?=x), but look-aheads or look-behinds + should see outside of the target substring + [20] need insensitive matching for linguistic features other + than case; for example, hiragana to katakana, wide and + narrow, simplified Han to traditional Han (see UTR#30 + "Character Foldings") =back @@ -1038,27 +1090,26 @@ numbers. 
To use these numbers, various encodings are needed. UTF-8 -UTF-8 is a variable-length (1 to 6 bytes, current character allocations -require 4 bytes), byte-order independent encoding. For ASCII (and we -really do mean 7-bit ASCII, not another 8-bit encoding), UTF-8 is -transparent. +UTF-8 is a variable-length (1 to 4 bytes), byte-order independent +encoding. For ASCII (and we really do mean 7-bit ASCII, not another +8-bit encoding), UTF-8 is transparent. The following table is from Unicode 3.2. - Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte + Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte - U+0000..U+007F 00..7F + U+0000..U+007F 00..7F U+0080..U+07FF * C2..DF 80..BF - U+0800..U+0FFF E0 * A0..BF 80..BF + U+0800..U+0FFF E0 * A0..BF 80..BF U+1000..U+CFFF E1..EC 80..BF 80..BF U+D000..U+D7FF ED 80..9F 80..BF - U+D800..U+DFFF +++++++ utf16 surrogates, not legal utf8 +++++++ + U+D800..U+DFFF +++++ utf16 surrogates, not legal utf8 +++++ U+E000..U+FFFF EE..EF 80..BF 80..BF - U+10000..U+3FFFF F0 * 90..BF 80..BF 80..BF - U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF - U+100000..U+10FFFF F4 80..8F 80..BF 80..BF + U+10000..U+3FFFF F0 * 90..BF 80..BF 80..BF + U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF + U+100000..U+10FFFF F4 80..8F 80..BF 80..BF -Note the gaps before several of the byte entries above marked by '*'. These are +Note the gaps marked by "*" before several of the byte entries above. 
These are caused by legal UTF-8 avoiding non-shortest encodings: it is technically possible to UTF-8-encode a single code point in different ways, but that is explicitly forbidden, and the shortest possible encoding should always be used @@ -1066,17 +1117,27 @@ explicitly forbidden, and the shortest possible encoding should always be used Another way to look at it is via bits: - Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte + Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte - 0aaaaaaa 0aaaaaaa - 00000bbbbbaaaaaa 110bbbbb 10aaaaaa - ccccbbbbbbaaaaaa 1110cccc 10bbbbbb 10aaaaaa - 00000dddccccccbbbbbbaaaaaa 11110ddd 10cccccc 10bbbbbb 10aaaaaa + 0aaaaaaa 0aaaaaaa + 00000bbbbbaaaaaa 110bbbbb 10aaaaaa + ccccbbbbbbaaaaaa 1110cccc 10bbbbbb 10aaaaaa + 00000dddccccccbbbbbbaaaaaa 11110ddd 10cccccc 10bbbbbb 10aaaaaa As you can see, the continuation bytes all begin with "10", and the leading bits of the start byte tell how many bytes there are in the encoded character. +The original UTF-8 specification allowed up to 6 bytes, to allow +encoding of numbers up to 0x7FFF_FFFF. Perl continues to allow those, +and has extended that up to 13 bytes to encode code points up to what +can fit in a 64-bit word. However, Perl will warn if you output any of +these as being non-portable; and under strict UTF-8 input protocols, +they are forbidden. + +The Unicode non-character code points are also disallowed in UTF-8 in +"open interchange". See L</Non-character code points>. + =item * UTF-EBCDIC @@ -1090,8 +1151,10 @@ UTF-16, UTF-16BE, UTF-16LE, Surrogates, and BOMs (Byte Order Marks) The following items are mostly for reference and general Unicode knowledge; Perl doesn't use these constructs internally. -UTF-16 is a 2 or 4 byte encoding. The Unicode code points -C<U+0000..U+FFFF> are stored in a single 16-bit unit, and the code +Like UTF-8, UTF-16 is a variable-width encoding, but where +UTF-8 uses 8-bit code units, UTF-16 uses 16-bit code units. 
+All code points occupy either 2 or 4 bytes in UTF-16: code points +C<U+0000..U+FFFF> are stored in a single 16-bit unit, and code points C<U+10000..U+10FFFF> in two 16-bit units. The latter case is using I<surrogates>, the first 16-bit unit being the I<high surrogate>, and the second being the I<low surrogate>. @@ -1101,16 +1164,12 @@ range of Unicode code points in pairs of 16-bit units. The I<high surrogates> are the range C<U+D800..U+DBFF> and the I<low surrogates> are the range C<U+DC00..U+DFFF>. The surrogate encoding is - $hi = ($uni - 0x10000) / 0x400 + 0xD800; - $lo = ($uni - 0x10000) % 0x400 + 0xDC00; + $hi = ($uni - 0x10000) / 0x400 + 0xD800; + $lo = ($uni - 0x10000) % 0x400 + 0xDC00; and the decoding is - $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00); - -If you try to generate surrogates (for example by using chr()), you -will get a warning, if warnings are turned on, because those code -points are not valid for a Unicode character. + $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00); Because of the 16-bitness, UTF-16 is byte-order dependent. UTF-16 itself can be used for in-memory computations, but if storage or @@ -1130,12 +1189,23 @@ you will read the bytes C<0xFF 0xFE>. (And if the originating platform was writing in UTF-8, you will read the bytes C<0xEF 0xBB 0xBF>.) The way this trick works is that the character with the code point -C<U+FFFE> is guaranteed not to be a valid Unicode character, so the +C<U+FFFE> is not supposed to be in input streams, so the sequence of bytes C<0xFF 0xFE> is unambiguously "BOM, represented in little-endian format" and cannot be C<U+FFFE>, represented in big-endian -format". (Actually, C<U+FFFE> is legal for use by your program, even for -input/output, but better not use it if you need a BOM. But it is "illegal for -interchange", so that an unsuspecting program won't get confused.) +format". + +Surrogates have no meaning in Unicode outside their use in pairs to +represent other code points. 
However, Perl allows them to be +represented individually internally, for example by saying +C<chr(0xD801)>, so that all code points, not just those valid for open +interchange, are +representable. Unicode does define semantics for them, such as their +General Category is "Cs". But because their use is somewhat dangerous, +Perl will warn (using the warning category "surrogate", which is a +sub-category of "utf8") if an attempt is made +to do things like take the lower case of one, or match +case-insensitively, or to output them. (But don't try this on Perls +before 5.14.) =item * @@ -1143,17 +1213,18 @@ UTF-32, UTF-32BE, UTF-32LE The UTF-32 family is pretty much like the UTF-16 family, except that the units are 32-bit, and therefore the surrogate scheme is not -needed. The BOM signatures will be C<0x00 0x00 0xFE 0xFF> for BE and -C<0xFF 0xFE 0x00 0x00> for LE. +needed. UTF-32 is a fixed-width encoding. The BOM signatures are +C<0x00 0x00 0xFE 0xFF> for BE and C<0xFF 0xFE 0x00 0x00> for LE. =item * UCS-2, UCS-4 -Encodings defined by the ISO 10646 standard. UCS-2 is a 16-bit +Legacy, fixed-width encodings defined by the ISO 10646 standard. UCS-2 is a 16-bit encoding. Unlike UTF-16, UCS-2 is not extensible beyond C<U+FFFF>, because it does not use surrogates. UCS-4 is a 32-bit encoding, -functionally identical to UTF-32. +functionally identical to UTF-32 (the difference being that +UCS-4 forbids neither surrogates nor code points larger than 0x10_FFFF). =item * @@ -1164,6 +1235,36 @@ transport or storage is not eight-bit safe. Defined by RFC 2152. =back +=head2 Non-character code points + +66 code points are set aside in Unicode as "non-character code points". +These all have the Unassigned (Cn) General Category, and they never will +be assigned. These are never supposed to be in legal Unicode input +streams, so that code can use them as sentinels that can be mixed in +with character data, and they always will be distinguishable from that data. 
+To keep them out of Perl input streams, strict UTF-8 should be +specified, such as by using the layer C<:encoding('UTF-8')>. The +non-character code points are the 32 between U+FDD0 and U+FDEF, and the +34 code points U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, ... U+10FFFE, U+10FFFF. +Some people are under the mistaken impression that these are "illegal", +but that is not true. An application or cooperating set of applications +can legally use them at will internally; but these code points are +"illegal for open interchange". Therefore, Perl will not accept these +from input streams unless lax rules are being used, and will warn +(using the warning category "nonchar", which is a sub-category of "utf8") if +an attempt is made to output them. + +=head2 Beyond Unicode code points + +The maximum Unicode code point is U+10FFFF. But Perl accepts code +points up to the maximum permissible unsigned number available on the +platform. However, Perl will not accept these from input streams unless +lax rules are being used, and will warn (using the warning category +"non_unicode", which is a sub-category of "utf8") if an attempt is made to +operate on or output them. For example, C<uc(0x11_0000)> will generate +this warning, returning the input parameter as its result, as the upper +case of every non-Unicode code point is the code point itself. + =head2 Security Implications of Unicode Read L<Unicode Security Considerations|http://www.unicode.org/reports/tr36>. @@ -1175,7 +1276,7 @@ Also, note the following: Malformed UTF-8 -Unfortunately, the specification of UTF-8 leaves some room for +Unfortunately, the original specification of UTF-8 leaves some room for interpretation of how many bytes of encoded output one should generate from one input Unicode character. Strictly speaking, the shortest possible sequence of UTF-8 bytes should be generated, @@ -1183,27 +1284,16 @@ because otherwise there is potential for an input buffer overflow at the receiving end of a UTF-8 connection. 
Perl always generates the shortest length UTF-8, and with warnings on, Perl will warn about non-shortest length UTF-8 along with other malformations, such as the -surrogates, which are not real Unicode code points. +surrogates, which are not Unicode code points valid for interchange. =item * -Regular expressions behave slightly differently between byte data and -character (Unicode) data. For example, the "word character" character -class C<\w> will work differently depending on if data is eight-bit bytes -or Unicode. +Regular expression pattern matching may surprise you if you're not +accustomed to Unicode. Starting in Perl 5.14, several pattern +modifiers are available to control this, called the character set +modifiers. Details are given in L<perlre/Character set modifiers>. -In the first case, the set of C<\w> characters is either small--the -default set of alphabetic characters, digits, and the "_"--or, if you -are using a locale (see L<perllocale>), the C<\w> might contain a few -more letters according to your language and country. - -In the second case, the C<\w> set of characters is much, much larger. -Most importantly, even in the set of the first 256 characters, it will -probably match different characters: unlike most locales, which are -specific to a language and country pair, Unicode classifies all the -characters that are letters I<somewhere> as C<\w>. For example, your -locale might not think that LATIN SMALL LETTER ETH is a letter (unless -you happen to speak Icelandic), but Unicode does. +=back As discussed elsewhere, Perl has one foot (two hooves?) planted in each of two worlds: the old world of bytes and the new world of @@ -1212,10 +1302,8 @@ If your legacy code does not explicitly use Unicode, no automatic switch-over to characters should happen. Characters shouldn't get downgraded to bytes, either. 
It is possible to accidentally mix bytes and characters, however (see L<perluniintro>), in which case C<\w> in -regular expressions might start behaving differently. Review your -code. Use warnings and the C<strict> pragma. - -=back +regular expressions might start behaving differently (unless the C</a> +modifier is in effect). Review your code. Use warnings and the C<strict> pragma. =head2 Unicode in Perl on EBCDIC @@ -1230,45 +1318,27 @@ for more discussion of the issues. =head2 Locales -Usually locale settings and Unicode do not affect each other, but -there are a couple of exceptions: - -=over 4 - -=item * - -You can enable automatic UTF-8-ification of your standard file -handles, default C<open()> layer, and C<@ARGV> by using either -the C<-C> command line switch or the C<PERL_UNICODE> environment -variable, see L<perlrun> for the documentation of the C<-C> switch. - -=item * - -Perl tries really hard to work both with Unicode and the old -byte-oriented world. Most often this is nice, but sometimes Perl's -straddling of the proverbial fence causes problems. - -=back +See L<perllocale/Unicode and UTF-8> =head2 When Unicode Does Not Happen While Perl does have extensive ways to input and output in Unicode, -and few other 'entry points' like the @ARGV which can be interpreted -as Unicode (UTF-8), there still are many places where Unicode (in some -encoding or another) could be given as arguments or received as +and a few other "entry points" like the @ARGV array (which can sometimes be +interpreted as UTF-8), there are still many places where Unicode +(in some encoding or another) could be given as arguments or received as results, or both, but it is not. The following are such interfaces. Also, see L</The "Unicode Bug">. For all of these interfaces Perl currently (as of 5.8.3) simply assumes byte strings both as arguments -and results, or UTF-8 strings if the C<encoding> pragma has been used. 
+and results, or UTF-8 strings if the (problematic) C<encoding> pragma has been used. -One reason why Perl does not attempt to resolve the role of Unicode in -these cases is that the answers are highly dependent on the operating +One reason that Perl does not attempt to resolve the role of Unicode in +these situations is that the answers are highly dependent on the operating system and the file system(s). For example, whether filenames can be -in Unicode, and in exactly what kind of encoding, is not exactly a -portable concept. Similarly for the qx and system: how well will the -'command line interface' (and which of them?) handle Unicode? +in Unicode and in exactly what kind of encoding, is not exactly a +portable concept. Similarly for C<qx> and C<system>: how well will the +"command-line interface" (and which of them?) handle Unicode? =over 4 @@ -1301,45 +1371,69 @@ readdir, readlink =head2 The "Unicode Bug" -The term, the "Unicode bug" has been applied to an inconsistency with the -Unicode characters whose ordinals are in the Latin-1 Supplement block, that +The term, "Unicode bug" has been applied to an inconsistency +on ASCII platforms with the +Unicode code points in the Latin-1 Supplement block, that is, between 128 and 255. Without a locale specified, unlike all other characters or code points, these characters have very different semantics in -byte semantics versus character semantics. +byte semantics versus character semantics, unless +C<use feature 'unicode_strings'> is specified, directly or indirectly. +(It is indirectly specified by a C<use v5.12> or higher.) -In character semantics they are interpreted as Unicode code points, which means +In character semantics these upper-Latin1 characters are interpreted as +Unicode code points, which means they have the same semantics as Latin-1 (ISO-8859-1). 
-In byte semantics, they are considered to be unassigned characters, meaning -that the only semantics they have is their ordinal numbers, and that they are +In byte semantics (without C<unicode_strings>), they are considered to +be unassigned characters, meaning that the only semantics they have is +their ordinal numbers, and that they are not members of various character classes. None are considered to match C<\w> -for example, but all match C<\W>. (On EBCDIC platforms, the behavior may -be different from this, depending on the underlying C language library -functions.) +for example, but all match C<\W>. -The behavior is known to have effects on these areas: +Perl 5.12.0 added C<unicode_strings> to force character semantics on +these code points in some circumstances, which fixed portions of the +bug; Perl 5.14.0 fixed almost all of it; and Perl 5.16.0 fixed the +remainder (so far as we know, anyway). The lesson here is to enable +C<unicode_strings> to avoid the headaches described below. + +The old, problematic behavior affects these areas: =over 4 =item * Changing the case of a scalar, that is, using C<uc()>, C<ucfirst()>, C<lc()>, -and C<lcfirst()>, or C<\L>, C<\U>, C<\u> and C<\l> in regular expression -substitutions. +and C<lcfirst()>, or C<\L>, C<\U>, C<\u> and C<\l> in double-quotish +contexts, such as regular expression substitutions. +Under C<unicode_strings> starting in Perl 5.12.0, character semantics are +generally used. See L<perlfunc/lc> for details on how this works +in combination with various other pragmas. =item * -Using caseless (C</i>) regular expression matching +Using caseless (C</i>) regular expression matching. +Starting in Perl 5.14.0, regular expressions compiled within +the scope of C<unicode_strings> use character semantics +even when executed or compiled into larger +regular expressions outside the scope. 
=item * -Matching a number of properties in regular expressions, such as C<\w> +Matching any of several properties in regular expressions, namely C<\b>, +C<\B>, C<\s>, C<\S>, C<\w>, C<\W>, and all the Posix character classes +I<except> C<[[:ascii:]]>. +Starting in Perl 5.14.0, regular expressions compiled within +the scope of C<unicode_strings> use character semantics +even when executed or compiled into larger +regular expressions outside the scope. =item * -User-defined case change mappings. You can create a C<ToUpper()> function, for -example, which overrides Perl's built-in case mappings. The scalar must be -encoded in utf8 for your function to actually be invoked. +In C<quotemeta> or its inline equivalent C<\Q>, no code points above 127 +are quoted in UTF-8 encoded strings, but in byte encoded strings, code +points between 128-255 are always quoted. +Starting in Perl 5.16.0, consistent quoting rules are used within the +scope of C<unicode_strings>, as described in L<perlfunc/quotemeta>. =back @@ -1349,6 +1443,7 @@ which changes the string's semantics from byte to character or vice versa. As an example, consider the following program and its output: $ perl -le' + no feature 'unicode_strings'; $s1 = "\xC2"; $s2 = "\x{2660}"; for ($s1, $s2, $s1.$s2) { @@ -1367,22 +1462,12 @@ ASCII range (except in a locale), along with Perl's desire to add Unicode support seamlessly. The result wasn't seamless: these characters were orphaned. -Work is being done to correct this, but only some of it was complete in time -for the 5.12 release. What has been finished is the important part of the case -changing component. Due to concerns, and some evidence, that older code might -have come to rely on the existing behavior, the new behavior must be explicitly -enabled by the feature C<unicode_strings> in the L<feature> pragma, even though -no new syntax is involved. - -See L<perlfunc/lc> for details on how this pragma works in combination with -various others for casing. 
Even though the pragma only affects casing -operations in the 5.12 release, it is planned to have it affect all the -problematic behaviors in later releases: you can't have one without them all. - -In the meantime, a workaround is to always call utf8::upgrade($string), or to -use the standard module L<Encode>. Also, a scalar that has any characters +For Perls earlier than those described above, or when a string is passed +to a function outside the subpragma's scope, a workaround is to always +call C<utf8::upgrade($string)>, +or to use the standard module L<Encode>. Also, a scalar that has any characters whose ordinal is above 0x100, or which were specified using either of the -C<\N{...}> notations will automatically have character semantics. +C<\N{...}> notations, will automatically have character semantics. =head2 Forcing Unicode in Perl (Or Unforcing Unicode in Perl) @@ -1429,7 +1514,8 @@ pointing after the UTF-8 bytes. It works appropriately on EBCDIC machines. =item * -C<utf8_to_uvchr(buf, lenp)> reads UTF-8 encoded bytes from a buffer and +C<utf8_to_uvchr_buf(buf, bufend, lenp)> reads UTF-8 encoded bytes from a +buffer and returns the Unicode character code point and, optionally, the length of the UTF-8 byte sequence. It works appropriately on EBCDIC machines. @@ -1453,13 +1539,14 @@ designed to be a one-way street). =item * -C<is_utf8_char(s)> returns true if the pointer points to a valid UTF-8 -character. +C<is_utf8_string(buf, len)> returns true if C<len> bytes of the buffer +are valid UTF-8. =item * -C<is_utf8_string(buf, len)> returns true if C<len> bytes of the buffer -are valid UTF-8. +C<is_utf8_char(s)> returns true if the pointer points to a valid UTF-8 +character. However, this function should not be used because of +security concerns. Instead, use C<is_utf8_string()>. =item * @@ -1495,9 +1582,10 @@ output more readable. 
=item * -C<ibcmp_utf8(s1, pe1, l1, u1, s2, pe2, l2, u2)> can be used to +C<foldEQ_utf8(s1, pe1, l1, u1, s2, pe2, l2, u2)> can be used to compare two strings case-insensitively in Unicode. For case-sensitive -comparisons you can just use C<memEQ()> and C<memNE()> as usual. +comparisons you can just use C<memEQ()> and C<memNE()> as usual, except +if one string is in utf8 and the other isn't. =back @@ -1509,47 +1597,27 @@ in the Perl source code distribution. Perl by default comes with the latest supported Unicode version built in, but you can change to use any earlier one. -Download the files in the version of Unicode that you want from the Unicode web +Download the files in the desired version of Unicode from the Unicode web site L<http://www.unicode.org>). These should replace the existing files in -C<\$Config{privlib}>/F<unicore>. (C<\%Config> is available from the Config -module.) Follow the instructions in F<README.perl> in that directory to change -some of their names, and then run F<make>. - -It is even possible to download them to a different directory, and then change -F<utf8_heavy.pl> in the directory C<\$Config{privlib}> to point to the new -directory, or maybe make a copy of that directory before making the change, and -using C<@INC> or the C<-I> run-time flag to switch between versions at will -(but because of caching, not in the middle of a process), but all this is -beyond the scope of these instructions. +F<lib/unicore> in the Perl source tree. Follow the instructions in +F<README.perl> in that directory to change some of their names, and then build +perl (see L<INSTALL>). =head1 BUGS =head2 Interaction with Locales -Use of locales with Unicode data may lead to odd results. Currently, -Perl attempts to attach 8-bit locale info to characters in the range -0..255, but this technique is demonstrably incorrect for locales that -use characters above that range when mapped into Unicode. Perl's -Unicode support will also tend to run slower. 
Use of locales with -Unicode is discouraged. +See L<perllocale/Unicode and UTF-8> =head2 Problems with characters in the Latin-1 Supplement range See L</The "Unicode Bug"> -=head2 Problems with case-insensitive regular expression matching - -There are problems with case-insensitive matches, including those involving -character classes (enclosed in [square brackets]), characters whose fold -is to multiple characters (such as the single character LATIN SMALL LIGATURE -FFL matches case-insensitively with the 3-character string C<ffl>), and -characters in the Latin-1 Supplement. - =head2 Interaction with Extensions When Perl exchanges data with an extension, the extension should be able to understand the UTF8 flag and act accordingly. If the -extension doesn't know about the flag, it's likely that the extension +extension doesn't recognize that flag, it's likely that the extension will return incorrectly-flagged data. So if you're working with Unicode data, consult the documentation of @@ -1573,13 +1641,14 @@ would convert the argument to raw UTF-8 and convert the result back to Perl's internal representation like so: sub my_escape_html ($) { - my($what) = shift; - return unless defined $what; - Encode::decode_utf8(Foo::Bar::escape_html(Encode::encode_utf8($what))); + my($what) = shift; + return unless defined $what; + Encode::decode_utf8(Foo::Bar::escape_html( + Encode::encode_utf8($what))); } Sometimes, when the extension does not convert data but just stores -and retrieves them, you will be in a position to use the otherwise +and retrieves them, you will be able to use the otherwise dangerous Encode::_utf8_on() function. Let's say the popular C<Foo::Bar> extension, written in C, provides a C<param> method that lets you store and retrieve data according to these prototypes: @@ -1622,12 +1691,12 @@ somewhat less spectacular, at least for some operations. In general, operations with UTF-8 encoded strings are still slower. 
As an example, the Unicode properties (character classes) like C<\p{Nd}> are known to be quite a bit slower (5-20 times) than their simpler counterparts -like C<\d> (then again, there 268 Unicode characters matching C<Nd> +like C<\d> (then again, there are hundreds of Unicode characters matching C<Nd> compared with the 10 ASCII characters matching C<d>). =head2 Problems on EBCDIC platforms -There are a number of known problems with Perl on EBCDIC platforms. If you +There are several known problems with Perl on EBCDIC platforms. If you want to use Perl there, send email to perlbug@perl.org. In earlier versions, when byte and character data were concatenated, @@ -1647,7 +1716,7 @@ working with 5.6, you will need some of the following adjustments to your code. The examples are written such that the code will continue to work under 5.6, so you should be safe to try them out. -=over 4 +=over 3 =item * @@ -1706,7 +1775,8 @@ to deal with UTF-8 data. Please check the documentation to verify if that is still true. sub fetchrow { - my($self, $sth, $what) = @_; # $what is one of fetchrow_{array,hashref} + # $what is one of fetchrow_{array,hashref} + my($self, $sth, $what) = @_; if ($] < 5.007) { return $sth->$what; } else { @@ -1721,7 +1791,9 @@ that is still true. my $ret = $sth->$what; if (ref $ret) { for my $k (keys %$ret) { - defined && /[^\000-\177]/ && Encode::_utf8_on($_) for $ret->{$k}; + defined + && /[^\000-\177]/ + && Encode::_utf8_on($_) for $ret->{$k}; } return $ret; } else { diff --git a/gnu/usr.bin/perl/pod/perlunifaq.pod b/gnu/usr.bin/perl/pod/perlunifaq.pod index ab42ff194a0..9bd103c9ac2 100644 --- a/gnu/usr.bin/perl/pod/perlunifaq.pod +++ b/gnu/usr.bin/perl/pod/perlunifaq.pod @@ -84,12 +84,12 @@ or encode anymore, on things that use the layered handle. 
You can provide this layer when C<open>ing the file: - open my $fh, '>:encoding(UTF-8)', $filename; # auto encoding on write - open my $fh, '<:encoding(UTF-8)', $filename; # auto decoding on read + open my $fh, '>:encoding(UTF-8)', $filename; # auto encoding on write + open my $fh, '<:encoding(UTF-8)', $filename; # auto decoding on read Or if you already have an open filehandle: - binmode $fh, ':encoding(UTF-8)'; + binmode $fh, ':encoding(UTF-8)'; Some database drivers for DBI can also automatically encode and decode, but that is sometimes limited to the UTF-8 encoding. @@ -138,28 +138,27 @@ concern, and you can just C<eval> dumped data as always. =head2 Why do some characters not uppercase or lowercase correctly? -It seemed like a good idea at the time, to keep the semantics the same for -standard strings, when Perl got Unicode support. The plan is to fix this -in the future, and the casing component has in fact mostly been fixed, but we -have to deal with the fact that Perl treats equal strings differently, -depending on the internal state. - -First the casing. Just put a C<use feature 'unicode_strings'> near the -beginning of your program. Within its lexical scope, C<uc>, C<lc>, C<ucfirst>, -C<lcfirst>, and the regular expression escapes C<\U>, C<\L>, C<\u>, C<\l> use -Unicode semantics for changing case regardless of whether the UTF8 flag is on -or not. However, if you pass strings to subroutines in modules outside the -pragma's scope, they currently likely won't behave this way, and you have to -try one of the solutions below. There is another exception as well: if you -have furnished your own casing functions to override the default, these will -not be called unless the UTF8 flag is on) - -This remains a problem for the regular expression constructs -C<\d>, C<\s>, C<\w>, C<\D>, C<\S>, C<\W>, C</.../i>, C<(?i:...)>, -and C</[[:posix:]]/>. - -To force Unicode semantics, you can upgrade the internal representation to -by doing C<utf8::upgrade($string)>. 
This can be used +Starting in Perl 5.14 (and partially in Perl 5.12), just put a +C<use feature 'unicode_strings'> near the beginning of your program. +Within its lexical scope you shouldn't have this problem. It also is +automatically enabled under C<use feature ':5.12'> or using C<-E> on the +command line for Perl 5.12 or higher. + +The rationale for requiring this is to not break older programs that +rely on the way things worked before Unicode came along. Those older +programs knew only about the ASCII character set, and so may not work +properly for additional characters. When a string is encoded in UTF-8, +Perl assumes that the program is prepared to deal with Unicode, but when +the string isn't, Perl assumes that only ASCII (unless it is an EBCDIC +platform) is wanted, and so those characters that are not ASCII +characters aren't recognized as to what they would be in Unicode. +C<use feature 'unicode_strings'> tells Perl to treat all characters as +Unicode, whether the string is encoded in UTF-8 or not, thus avoiding +the problem. + +However, on earlier Perls, or if you pass strings to subroutines outside +the feature's scope, you can force Unicode semantics by changing the +encoding to UTF-8 by doing C<utf8::upgrade($string)>. This can be used safely on any string, as it checks and does not change strings that have already been upgraded. @@ -275,7 +274,8 @@ Instead of C<decode> and C<encode>, you could use C<_utf8_on> and C<_utf8_off>, but this is considered bad style. Especially C<_utf8_on> can be dangerous, for the same reason that C<:utf8> can. -There are some shortcuts for oneliners; see C<-C> in L<perlrun>. +There are some shortcuts for oneliners; +see L<-C|perlrun/-C [numberE<sol>list]> in L<perlrun>. =head2 What's the difference between C<UTF-8> and C<utf8>? 
diff --git a/gnu/usr.bin/perl/pod/perluniintro.pod b/gnu/usr.bin/perl/pod/perluniintro.pod index 6c82efde159..8ce4b7b4464 100644 --- a/gnu/usr.bin/perl/pod/perluniintro.pod +++ b/gnu/usr.bin/perl/pod/perluniintro.pod @@ -5,21 +5,22 @@ perluniintro - Perl Unicode introduction =head1 DESCRIPTION This document gives a general idea of Unicode and how to use Unicode -in Perl. +in Perl. See L</Further Resources> for references to more in-depth +treatments of Unicode. =head2 Unicode Unicode is a character set standard which plans to codify all of the writing systems of the world, plus many other symbols. -Unicode and ISO/IEC 10646 are coordinated standards that provide code -points for characters in almost all modern character set standards, -covering more than 30 writing systems and hundreds of languages, +Unicode and ISO/IEC 10646 are coordinated standards that unify +almost all other modern character set standards, +covering more than 80 writing systems and hundreds of languages, including all commercially-important modern languages. All characters in the largest Chinese, Japanese, and Korean dictionaries are also encoded. The standards will eventually cover almost all characters in more than 250 writing systems and thousands of languages. -Unicode 1.0 was released in October 1991, and 4.0 in April 2003. +Unicode 1.0 was released in October 1991, and 6.0 in October 2010. A Unicode I<character> is an abstract entity. It is not bound to any particular integer width, especially not to the C language C<char>. @@ -31,7 +32,9 @@ those characters. Unicode defines characters like C<LATIN CAPITAL LETTER A> or C<GREEK SMALL LETTER ALPHA> and unique numbers for the characters, in this case 0x0041 and 0x03B1, respectively. These unique numbers are called -I<code points>. +I<code points>. A code point is essentially the position of the +character within the set of all possible Unicode characters, and thus in +Perl, the term I<ordinal> is often used interchangeably with it. 
The Unicode standard prefers using hexadecimal notation for the code points. If numbers like C<0x0041> are unfamiliar to you, take a peek @@ -51,62 +54,81 @@ modelled by a I<base character> (like C<LATIN CAPITAL LETTER A>) followed by one or more I<modifiers> (like C<COMBINING ACUTE ACCENT>). This sequence of base character and modifiers is called a I<combining character sequence>. Some non-western languages require more complicated -models, so Unicode created the I<grapheme cluster> concept, and then the -I<extended grapheme cluster>. For example, a Korean Hangul syllable is -considered a single logical character, but most often consists of three actual +models, so Unicode created the I<grapheme cluster> concept, which was +later further refined into the I<extended grapheme cluster>. For +example, a Korean Hangul syllable is considered a single logical +character, but most often consists of three actual Unicode characters: a leading consonant followed by an interior vowel followed by a trailing consonant. Whether to call these extended grapheme clusters "characters" depends on your point of view. If you are a programmer, you probably would tend towards seeing -each element in the sequences as one unit, or "character". The whole sequence -could be seen as one "character", however, from the user's point of view, since -that's probably what it looks like in the context of the user's language. - -With this "whole sequence" view of characters, the total number of -characters is open-ended. But in the programmer's "one unit is one -character" point of view, the concept of "characters" is more -deterministic. In this document, we take that second point of view: -one "character" is one Unicode code point. - -For some combinations, there are I<precomposed> characters. -C<LATIN CAPITAL LETTER A WITH ACUTE>, for example, is defined as -a single code point. 
These precomposed characters are, however, -only available for some combinations, and are mainly -meant to support round-trip conversions between Unicode and legacy -standards (like the ISO 8859). In the general case, the composing -method is more extensible. To support conversion between -different compositions of the characters, various I<normalization -forms> to standardize representations are also defined. +each element in the sequences as one unit, or "character". However from +the user's point of view, the whole sequence could be seen as one +"character" since that's probably what it looks like in the context of the +user's language. In this document, we take the programmer's point of +view: one "character" is one Unicode code point. + +For some combinations of base character and modifiers, there are +I<precomposed> characters. There is a single character equivalent, for +example, to the sequence C<LATIN CAPITAL LETTER A> followed by +C<COMBINING ACUTE ACCENT>. It is called C<LATIN CAPITAL LETTER A WITH +ACUTE>. These precomposed characters are, however, only available for +some combinations, and are mainly meant to support round-trip +conversions between Unicode and legacy standards (like ISO 8859). Using +sequences, as Unicode does, allows for needing fewer basic building blocks +(code points) to express many more potential grapheme clusters. To +support conversion between equivalent forms, various I<normalization +forms> are also defined. Thus, C<LATIN CAPITAL LETTER A WITH ACUTE> is +in I<Normalization Form Composed>, (abbreviated NFC), and the sequence +C<LATIN CAPITAL LETTER A> followed by C<COMBINING ACUTE ACCENT> +represents the same character in I<Normalization Form Decomposed> (NFD). Because of backward compatibility with legacy encodings, the "a unique number for every character" idea breaks down a bit: instead, there is "at least one number for every character". The same character could be represented differently in several legacy encodings. 
The -converse is also not true: some code points do not have an assigned +converse is not also true: some code points do not have an assigned character. Firstly, there are unallocated code points within otherwise used blocks. Secondly, there are special Unicode control characters that do not represent true characters. -A common myth about Unicode is that it is "16-bit", that is, -Unicode is only represented as C<0x10000> (or 65536) characters from -C<0x0000> to C<0xFFFF>. B<This is untrue.> Since Unicode 2.0 (July +When Unicode was first conceived, it was thought that all the world's +characters could be represented using a 16-bit word; that is a maximum of +C<0x10000> (or 65536) characters from C<0x0000> to C<0xFFFF> would be +needed. This soon proved to be false, and since Unicode 2.0 (July 1996), Unicode has been defined all the way up to 21 bits (C<0x10FFFF>), -and since Unicode 3.1 (March 2001), characters have been defined -beyond C<0xFFFF>. The first C<0x10000> characters are called the -I<Plane 0>, or the I<Basic Multilingual Plane> (BMP). With Unicode -3.1, 17 (yes, seventeen) planes in all were defined--but they are -nowhere near full of defined characters, yet. - -Another myth is about Unicode blocks--that they have something to -do with languages--that each block would define the characters used -by a language or a set of languages. B<This is also untrue.> +and Unicode 3.1 (March 2001) defined the first characters above C<0xFFFF>. +The first C<0x10000> characters are called the I<Plane 0>, or the +I<Basic Multilingual Plane> (BMP). With Unicode 3.1, 17 (yes, +seventeen) planes in all were defined--but they are nowhere near full of +defined characters, yet. + +When a new language is being encoded, Unicode generally will choose a +C<block> of consecutive unallocated code points for its characters. So +far, the number of code points in these blocks has always been evenly +divisible by 16. 
Extras in a block, not currently needed, are left +unallocated, for future growth. But there have been occasions when +a later release needed more code points than the available extras, and a +new block had to be allocated somewhere else, not contiguous to the initial +one, to handle the overflow. Thus, it became apparent early on that +"block" wasn't an adequate organizing principle, and so the C<Script> +property was created. (Later an improved script property was added as +well, the C<Script_Extensions> property.) Those code points that are in +overflow blocks can still +have the same script as the original ones. The script concept fits more +closely with natural language: there is C<Latin> script, C<Greek> +script, and so on; and there are several artificial scripts, like +C<Common> for characters that are used in multiple scripts, such as +mathematical symbols. Scripts usually span varied parts of several +blocks. For more information about scripts, see L<perlunicode/Scripts>. The division into blocks exists, but it is almost completely -accidental--an artifact of how the characters have been and -still are allocated. Instead, there is a concept called I<scripts>, which is -more useful: there is C<Latin> script, C<Greek> script, and so on. Scripts -usually span varied parts of several blocks. For more information about -scripts, see L<perlunicode/Scripts>. +accidental--an artifact of how the characters have been and still are +allocated. (Note that this paragraph has oversimplified things for the +sake of this being an introduction. Unicode doesn't really encode +languages, but the writing systems for them--their scripts; and one +script can be used by many languages. Unicode also encodes things that +aren't really about languages, such as symbols like C<BAGGAGE CLAIM>.) The Unicode code points are just abstract numbers. To input and output these abstract numbers, the numbers must be I<encoded> or @@ -128,24 +150,37 @@ natively.
Perl 5.8.0, however, is the first recommended release for serious Unicode work. The maintenance release 5.6.1 fixed many of the problems of the initial Unicode implementation, but for example regular expressions still do not work with Unicode in 5.6.1. - -B<Starting from Perl 5.8.0, the use of C<use utf8> is needed only in much more restricted circumstances.> In earlier releases the C<utf8> pragma was used to declare +Perl 5.14.0 is the first release where Unicode support is +(almost) seamlessly integrable without some gotchas (the exception being +some differences in L<quotemeta|perlfunc/quotemeta>, which is fixed +starting in Perl 5.16.0). To enable this +seamless support, you should C<use feature 'unicode_strings'> (which is +automatically selected if you C<use 5.012> or higher). See L<feature>. +(5.14 also fixes a number of bugs and departures from the Unicode +standard.) + +Before Perl 5.8.0, the use of C<use utf8> was used to declare that operations in the current block or file would be Unicode-aware. This model was found to be wrong, or at least clumsy: the "Unicodeness" is now carried with the data, instead of being attached to the -operations. Only one case remains where an explicit C<use utf8> is -needed: if your Perl script itself is encoded in UTF-8, you can use -UTF-8 in your identifier names, and in string and regular expression +operations. +Starting with Perl 5.8.0, only one case remains where an explicit C<use +utf8> is needed: if your Perl script itself is encoded in UTF-8, you can +use UTF-8 in your identifier names, and in string and regular expression literals, by saying C<use utf8>. This is not the default because scripts with legacy 8-bit data in them would break. See L<utf8>. =head2 Perl's Unicode Model Perl supports both pre-5.6 strings of eight-bit native bytes, and -strings of Unicode characters. 
The principle is that Perl tries to -keep its data as eight-bit bytes for as long as possible, but as soon -as Unicodeness cannot be avoided, the data is (mostly) transparently upgraded -to Unicode. There are some problems--see L<perlunicode/The "Unicode Bug">. +strings of Unicode characters. The general principle is that Perl tries +to keep its data as eight-bit bytes for as long as possible, but as soon +as Unicodeness cannot be avoided, the data is transparently upgraded +to Unicode. Prior to Perl 5.14, the upgrade was not completely +transparent (see L<perlunicode/The "Unicode Bug">), and for backwards +compatibility, full transparency is not gained unless C<use feature +'unicode_strings'> (see L<feature>) or C<use 5.012> (or higher) is +selected. Internally, Perl currently uses either whatever the native eight-bit character set of the platform (for example Latin-1) is, defaulting to @@ -182,10 +217,11 @@ handles, default C<open()> layer, and C<@ARGV> by using either the C<-C> command line switch or the C<PERL_UNICODE> environment variable, see L<perlrun> for the documentation of the C<-C> switch. -Note that this means that Perl expects other software to work, too: +Note that this means that Perl expects other software to work the same +way: if Perl has been led to believe that STDIN should be UTF-8, but then -STDIN coming in from another command is not UTF-8, Perl will complain -about the malformed UTF-8. +STDIN coming in from another command is not UTF-8, Perl will likely +complain about the malformed UTF-8. All features that combine Unicode and I/O also require using the new PerlIO feature. Almost all Perl 5.8 platforms do use PerlIO, though: @@ -235,10 +271,9 @@ always produced. If you want to force the production of Unicode characters regardless of the numeric value, use C<pack("U", ...)> instead of C<\x..>, C<\x{...}>, or C<chr()>. 
-You can also use the C<charnames> pragma to invoke characters +You can invoke characters by name in double-quoted strings: - use charnames ':full'; my $arabic_alef = "\N{ARABIC LETTER ALEF}"; And, as mentioned above, you can also C<pack()> numbers into Unicode @@ -248,7 +283,7 @@ characters: Note that both C<\x{...}> and C<\N{...}> are compile-time string constants: you cannot use variables in them. If you want similar -run-time functionality, use C<chr()> and C<charnames::vianame()>. +run-time functionality, use C<chr()> and C<charnames::string_vianame()>. If you want to force the result to Unicode characters, use the special C<"U0"> prefix. It consumes no arguments but causes the following bytes @@ -269,20 +304,22 @@ will work on the Unicode characters (see L<perlunicode> and L<perlretut>). Note that Perl considers grapheme clusters to be separate characters, so for example - use charnames ':full'; - print length("\N{LATIN CAPITAL LETTER A}\N{COMBINING ACUTE ACCENT}"), "\n"; + print length("\N{LATIN CAPITAL LETTER A}\N{COMBINING ACUTE ACCENT}"), + "\n"; will print 2, not 1. The only exception is that regular expressions -have C<\X> for matching an extended grapheme cluster. +have C<\X> for matching an extended grapheme cluster. (Thus C<\X> in a +regular expression would match the entire sequence of both the example +characters.) Life is not quite so transparent, however, when working with legacy encodings, I/O, and certain special cases: =head2 Legacy Encodings -When you combine legacy data and Unicode the legacy data needs -to be upgraded to Unicode. Normally ISO 8859-1 (or EBCDIC, if -applicable) is assumed. +When you combine legacy data and Unicode, the legacy data needs +to be upgraded to Unicode. Normally the legacy data is assumed to be +ISO 8859-1 (or EBCDIC, if applicable).
The C<Encode> module knows about many encodings and has interfaces for doing conversions between those encodings: @@ -321,9 +358,10 @@ and on already open streams, use C<binmode()>: The matching of encoding names is loose: case does not matter, and many encodings have several aliases. Note that the C<:utf8> layer must always be specified exactly like that; it is I<not> subject to -the loose matching of encoding names. Also note that C<:utf8> is unsafe for +the loose matching of encoding names. Also note that currently C<:utf8> is unsafe for input, because it accepts the data without validating that it is indeed valid -UTF8. +UTF-8; you should instead use C<:encoding(utf-8)> (with or without a +hyphen). See L<PerlIO> for the C<:utf8> layer, L<PerlIO::encoding> and L<Encode::PerlIO> for the C<:encoding()> layer, and @@ -344,7 +382,8 @@ layer when opening files The I/O layers can also be specified more flexibly with the C<open> pragma. See L<open>, or look at the following example. - use open ':encoding(utf8)'; # input/output default encoding will be UTF-8 + use open ':encoding(utf8)'; # input/output default encoding will be + # UTF-8 open X, ">file"; print X chr(0x100), "\n"; close X; @@ -355,7 +394,8 @@ the C<open> pragma. See L<open>, or look at the following example. With the C<open> pragma you can use the C<:locale> layer BEGIN { $ENV{LC_ALL} = $ENV{LANG} = 'ru_RU.KOI8-R' } - # the :locale will probe the locale environment variables like LC_ALL + # the :locale will probe the locale environment variables like + # LC_ALL use open OUT => ':locale'; # russki parusski open(O, ">koi8"); print O chr(0x430); # Unicode CYRILLIC SMALL LETTER A = KOI8-R 0xc1 @@ -432,13 +472,13 @@ its argument so that Unicode characters with code points greater than 255 are displayed as C<\x{...}>, control characters (like C<\n>) are displayed as C<\x..>, and the rest of the characters as themselves: - sub nice_string { - join("", - map { $_ > 255 ? # if wide character... 
- sprintf("\\x{%04X}", $_) : # \x{...} - chr($_) =~ /[[:cntrl:]]/ ? # else if control character ... - sprintf("\\x%02X", $_) : # \x.. - quotemeta(chr($_)) # else quoted or as themselves + sub nice_string { + join("", + map { $_ > 255 ? # if wide character... + sprintf("\\x{%04X}", $_) : # \x{...} + chr($_) =~ /[[:cntrl:]]/ ? # else if control character... + sprintf("\\x%02X", $_) : # \x.. + quotemeta(chr($_)) # else quoted or as themselves } unpack("W*", $_[0])); # unpack Unicode characters } @@ -513,7 +553,7 @@ C<LATIN CAPITAL LETTER A>?) The short answer is that by default Perl compares equivalence (C<eq>, C<ne>) based only on code points of the characters. In the above case, the answer is no (because 0x00C1 != 0x0041). But sometimes, any -CAPITAL LETTER As should be considered equal, or even As of any case. +CAPITAL LETTER A's should be considered equal, or even A's of any case. The long answer is that you need to consider character normalization and casing issues: see L<Unicode::Normalize>, Unicode Technical Report #15, @@ -521,7 +561,8 @@ L<Unicode Normalization Forms|http://www.unicode.org/unicode/reports/tr15> and sections on case mapping in the L<Unicode Standard|http://www.unicode.org>. As of Perl 5.8.0, the "Full" case-folding of I<Case -Mappings/SpecialCasing> is implemented, but bugs remain in C<qr//i> with them. +Mappings/SpecialCasing> is implemented, but bugs remain in C<qr//i> with them, +mostly fixed by 5.14. =item * @@ -553,19 +594,19 @@ L<http://www.unicode.org/unicode/reports/tr10/> Character Ranges and Classes -Character ranges in regular expression character classes (C</[a-z]/>) -and in the C<tr///> (also known as C<y///>) operator are not magically -Unicode-aware. What this means is that C<[A-Za-z]> will not magically start -to mean "all alphabetic letters"; not that it does mean that even for -8-bit characters, you should be using C</[[:alpha:]]/> in that case. 
+Character ranges in regular expression bracketed character classes ( e.g., +C</[a-z]/>) and in the C<tr///> (also known as C<y///>) operator are not +magically Unicode-aware. What this means is that C<[A-Za-z]> will not +magically start to mean "all alphabetic letters" (not that it does mean that +even for 8-bit characters; for those, if you are using locales (L<perllocale>), +use C</[[:alpha:]]/>; and if not, use the 8-bit-aware property C<\p{alpha}>). + +All the properties that begin with C<\p> (and its inverse C<\P>) are actually +character classes that are Unicode-aware. There are dozens of them, see +L<perluniprops>. -For specifying character classes like that in regular expressions, -you can use the various Unicode properties--C<\pL>, or perhaps -C<\p{Alphabetic}>, in this particular case. You can use Unicode -code points as the end points of character ranges, but there is no -magic associated with specifying a certain range. For further -information--there are dozens of Unicode character classes--see -L<perlunicode>. +You can use Unicode code points as the end points of character ranges, and the +range will include all Unicode code points that lie between those end points. =item * @@ -575,6 +616,8 @@ Unicode does define several other decimal--and numeric--characters besides the familiar 0 to 9, such as the Arabic and Indic digits. Perl does not support string-to-number conversion for digits other than ASCII 0 to 9 (and ASCII a to f for hexadecimal). +To get safe conversions from any Unicode string, use +L<Unicode::UCD/num()>. =back @@ -601,13 +644,18 @@ How Do I Make My Scripts Work With Unicode? Very little work should be needed since nothing changes until you generate Unicode data. The most important thing is getting input as Unicode; for that, see the earlier I/O discussion. +To get full seamless Unicode support, add +C<use feature 'unicode_strings'> (or C<use 5.012> or higher) to your +script. =item * How Do I Know Whether My String Is In Unicode? 
-You shouldn't have to care. But you may, because currently the semantics of the -characters whose ordinals are in the range 128 to 255 is different depending on +You shouldn't have to care. But you may if your Perl is before 5.14.0 +or you haven't specified C<use feature 'unicode_strings'> or C<use +5.012> (or higher) because otherwise the semantics of the code points +in the range 128 to 255 are different depending on whether the string they are contained within is in Unicode or not. (See L<perlunicode/When Unicode Does Not Happen>.) @@ -622,10 +670,10 @@ string has any characters at all. All the C<is_utf8()> does is to return the value of the internal "utf8ness" flag attached to the C<$string>. If the flag is off, the bytes in the scalar are interpreted as a single byte encoding. If the flag is on, the bytes in the scalar -are interpreted as the (multi-byte, variable-length) UTF-8 encoded code -points of the characters. Bytes added to a UTF-8 encoded string are +are interpreted as the (variable-length, potentially multi-byte) UTF-8 encoded +code points of the characters. Bytes added to a UTF-8 encoded string are automatically upgraded to UTF-8. If mixed non-UTF-8 and UTF-8 scalars -are merged (double-quoted interpolation, explicit concatenation, and +are merged (double-quoted interpolation, explicit concatenation, or printf/sprintf parameter substitution), the result will be UTF-8 encoded as if copies of the byte strings were upgraded to UTF-8: for example, @@ -638,16 +686,23 @@ C<$a> will stay byte-encoded. Sometimes you might really need to know the byte length of a string instead of the character length. 
For that use either the -C<Encode::encode_utf8()> function or the C<bytes> pragma and -the C<length()> function: +C<Encode::encode_utf8()> function or the C<bytes> pragma +and the C<length()> function: my $unicode = chr(0x100); print length($unicode), "\n"; # will print 1 require Encode; - print length(Encode::encode_utf8($unicode)), "\n"; # will print 2 + print length(Encode::encode_utf8($unicode)),"\n"; # will print 2 use bytes; print length($unicode), "\n"; # will also print 2 # (the 0xC4 0x80 of the UTF-8) + no bytes; + +=item * + +How Do I Find Out What Encoding a File Has? + +You might try L<Encode::Guess>, but it has a number of limitations. =item * @@ -730,11 +785,11 @@ or: You can find the bytes that make up a UTF-8 sequence with - @bytes = unpack("C*", $Unicode_string) + @bytes = unpack("C*", $Unicode_string) and you can create well-formed Unicode with - $Unicode_string = pack("U*", 0xff, ...) + $Unicode_string = pack("U*", 0xff, ...) =item * @@ -747,11 +802,19 @@ L<http://www.cl.cam.ac.uk/~mgk25/unicode.html> How Does Unicode Work With Traditional Locales? -In Perl, not very well. Avoid using locales through the C<locale> -pragma. Use only one or the other. But see L<perlrun> for the -description of the C<-C> switch and its environment counterpart, -C<$ENV{PERL_UNICODE}> to see how to enable various Unicode features, -for example by using locale settings. +Starting in Perl 5.16, you can specify + + use locale ':not_characters'; + +to get Perl to work well with traditional locales. The catch is that you +have to translate from the locale character set to/from Unicode +yourself. See L</Unicode IE<sol>O> above for how to + + use open ':locale'; + +to accomplish this, but full details are in L<perllocale/Unicode and +UTF-8>, including gotchas that happen if you don't specify +C<:not_characters>.
=back @@ -809,6 +872,14 @@ L<http://www.unicode.org/glossary/> =item * +Unicode Recommended Reading List + +The Unicode Consortium has a list of articles and books, some of which +give a much more in depth treatment of Unicode: +L<http://unicode.org/resources/readinglist.html> + +=item * + Unicode Useful Resources L<http://www.unicode.org/unicode/onlinedat/resources.html> @@ -834,22 +905,6 @@ L<http://www.eki.ee/letter/> =item * -The Unicode support files live within the Perl installation in the -directory - - $Config{installprivlib}/unicore - -in Perl 5.8.0 or newer, and - - $Config{installprivlib}/unicode - -in the Perl 5.6 series. (The renaming to F<lib/unicore> was done to -avoid naming conflicts with lib/Unicode in case-insensitive filesystems.) -The main Unicode data file is F<UnicodeData.txt> (or F<Unicode.301> in -Perl 5.6.1.) You can find the C<$Config{installprivlib}> by - - perl "-V:installprivlib" - You can explore various information from the Unicode data files using the C<Unicode::UCD> module. @@ -886,6 +941,6 @@ mailing lists for their valuable feedback. =head1 AUTHOR, COPYRIGHT, AND LICENSE -Copyright 2001-2002 Jarkko Hietaniemi E<lt>jhi@iki.fiE<gt> +Copyright 2001-2011 Jarkko Hietaniemi E<lt>jhi@iki.fiE<gt> This document may be distributed under the same terms as Perl itself. diff --git a/gnu/usr.bin/perl/pod/perlunitut.pod b/gnu/usr.bin/perl/pod/perlunitut.pod index fc352d5aad1..9e5af04ec79 100644 --- a/gnu/usr.bin/perl/pod/perlunitut.pod +++ b/gnu/usr.bin/perl/pod/perlunitut.pod @@ -42,8 +42,8 @@ distinction between code point and character is blurred, so the terms often are used interchangeably.) There are many, many code points, but computers work with bytes, and a byte has -room for only 256 values. Unicode has many more characters, so you need a -method to make these accessible. +room for only 256 values. Unicode has many more characters than that, +so you need a method to make these accessible. 
Unicode is encoded using several competing encodings, of which UTF-8 is the most used. In a Unicode encoding, multiple subsequent bytes can be used to diff --git a/gnu/usr.bin/perl/pod/perlutil.pod b/gnu/usr.bin/perl/pod/perlutil.pod index 453248d2497..040f51d5f65 100644 --- a/gnu/usr.bin/perl/pod/perlutil.pod +++ b/gnu/usr.bin/perl/pod/perlutil.pod @@ -66,7 +66,7 @@ utility will look for errors in your markup. F<splain> is an interface to L<perldiag> - paste in your error message to it, and it'll explain it for you. -=item L<roffitall|roffitall> +=item C<roffitall> The C<roffitall> utility is not installed on your system but lives in the F<pod/> directory of your Perl source kit; it converts all the @@ -75,7 +75,7 @@ typeset PostScript or text file of the whole lot. =back -=head2 Convertors +=head2 Converters To help you convert legacy programs to Perl, we've included three conversion filters: @@ -164,7 +164,7 @@ itself or any of the standard library modules back to the developers; please read through the documentation for F<perlbug> thoroughly before using it to submit a bug report. -=item L<perlthanks|perlthanks> +=item L<perlthanks|perlbug> This program provides an easy way to send a thank-you message back to the authors and maintainers of perl. It's just F<perlbug> installed under @@ -209,13 +209,6 @@ F<xsubpp> will compile XS code into C code by embedding the constructs necessary to let C functions manipulate Perl values and creates the glue necessary to let Perl access those functions. -=item L<dprofpp|dprofpp> - -Perl comes with a profiler, the F<Devel::DProf> module. The -F<dprofpp> utility analyzes the output of this profiler and tells you -which subroutines are taking up the most run time. See L<Devel::DProf> -for more information. - =item L<prove> F<prove> is a command-line interface to the test-running functionality @@ -253,11 +246,21 @@ archive and an unextracted one. 
(Note that this utility requires the C<Text::Diff> module to function properly; this module isn't distributed with perl, but is available from the CPAN.) +=item L<ptargrep> + +F<ptargrep> is a utility to apply pattern matching to the contents of files +in a tar archive. + =item L<shasum> This utility, that comes with the C<Digest::SHA> module, is used to print or verify SHA checksums. +=item L<zipdetails> + +F<zipdetails> displays information about the internal record structure of the zip file. +It is not concerned with displaying any details of the compressed data stored in the zip file. + =back =head2 Installation @@ -299,11 +302,10 @@ validate your packlists and even create a tarball from an installed module. L<perldoc|perldoc>, L<pod2man|pod2man>, L<perlpod>, L<pod2html|pod2html>, L<pod2usage|pod2usage>, L<podselect|podselect>, L<podchecker|podchecker>, L<splain|splain>, L<perldiag>, -L<roffitall|roffitall>, L<a2p|a2p>, L<s2p|s2p>, L<find2perl|find2perl>, +C<roffitall>, L<a2p|a2p>, L<s2p|s2p>, L<find2perl|find2perl>, L<File::Find|File::Find>, L<pl2pm|pl2pm>, L<perlbug|perlbug>, -L<h2ph|h2ph>, L<c2ph|c2ph>, L<h2xs|h2xs>, L<dprofpp|dprofpp>, -L<Devel::DProf>, L<enc2xs>, L<xsubpp>, L<cpan>, L<cpanp>, L<cpan2dist>, -L<instmodsh>, L<piconv>, L<prove>, L<corelist>, L<ptar>, L<ptardiff>, -L<shasum> +L<h2ph|h2ph>, L<c2ph|c2ph>, L<h2xs|h2xs>, L<enc2xs>, L<xsubpp>, +L<cpan>, L<cpanp>, L<cpan2dist>, L<instmodsh>, L<piconv>, L<prove>, +L<corelist>, L<ptar>, L<ptardiff>, L<shasum>, L<zipdetails> =cut diff --git a/gnu/usr.bin/perl/pod/perlvms.pod b/gnu/usr.bin/perl/pod/perlvms.pod index 17175db3bd9..d88e6b14be7 100644 --- a/gnu/usr.bin/perl/pod/perlvms.pod +++ b/gnu/usr.bin/perl/pod/perlvms.pod @@ -265,14 +265,14 @@ created by an older version of an archive utility or a build utility such as MMK or MMS may generate a filename in all upper case even on an ODS-5 volume.
If this filename is later retrieved by a Perl script or module in a case preserving environment, that upper case name may not -match the mixed-case or lower-case expections of the Perl code. Your +match the mixed-case or lower-case expectations of the Perl code. Your best bet is to follow an all-or-nothing approach to case preservation: either don't use it at all, or make sure your entire toolchain and application environment support and use it. OpenVMS Alpha v7.3-1 and later and all versions of OpenVMS I64 support case sensitivity as a process setting (see C<SET PROCESS -/CASE_LOOKUP=SENSITIVE>). Perl does not currently suppport case +/CASE_LOOKUP=SENSITIVE>). Perl does not currently support case sensitivity on VMS, but it may in the future, so Perl programs should use the C<< File::Spec->case_tolerant >> method to determine the state, and not the C<$^O> variable. @@ -409,7 +409,7 @@ internal Perl problems that would cause such a condition. This allows the programmer to look at the execution stack and variables to find out the cause of the exception. As the debugger is being invoked as the Perl interpreter is about to do a fatal exit, continuing the execution -in debug mode is usally not practical. +in debug mode is usually not practical. Starting Perl in the VMS debugger may change the program execution profile in a way that such problems are not reproduced. @@ -704,7 +704,6 @@ results of testing and further review. See L</"$?"> for a description of the encoding of the Unix value to produce a native VMS status containing it. - =item dump Rather than causing Perl to abort and dump core, the C<dump> @@ -848,10 +847,10 @@ Therefore, the "system time" elements will always be 0, since there is no difference between "user time" and "system" time under VMS, and the time accumulated by a subprocess may or may not appear separately in the "child time" field, depending on -whether L<times> keeps track of subprocesses separately.
Note +whether C<times()> keeps track of subprocesses separately. Note especially that the VAXCRTL (at least) keeps track only of -subprocesses spawned using L<fork> and L<exec>; it will not -accumulate the times of subprocesses spawned via pipes, L<system>, +subprocesses spawned using C<fork()> and C<exec()>; it will not +accumulate the times of subprocesses spawned via pipes, C<system()>, or backticks. =item unlink LIST @@ -1188,7 +1187,7 @@ consequence of ignoring this advice will be undefined to allow future improvements in the POSIX exit handling. In general, with C<PERL_VMS_POSIX_EXIT> enabled, more detailed information -will be availble in the exit status for DCL scripts or other native VMS tools, +will be available in the exit status for DCL scripts or other native VMS tools, and will give the expected information for Posix programs. It has not been made the default in order to preserve backward compatibility. |