diff options
author | Todd C. Miller <millert@cvs.openbsd.org> | 2001-05-24 18:26:20 +0000 |
---|---|---|
committer | Todd C. Miller <millert@cvs.openbsd.org> | 2001-05-24 18:26:20 +0000 |
commit | 483d4e680bd2a6db14835b1b4d65be33488d532b (patch) | |
tree | 129a4c95425cb37ed928ef53a27eb7dce5de3345 /gnu/usr.bin/perl/pod | |
parent | 8757fe6728b9db37919ad703b336ebbbc84413aa (diff) |
stock perl 5.6.1
Diffstat (limited to 'gnu/usr.bin/perl/pod')
28 files changed, 9547 insertions, 287 deletions
diff --git a/gnu/usr.bin/perl/pod/Makefile.SH b/gnu/usr.bin/perl/pod/Makefile.SH new file mode 100644 index 00000000000..b8c8c8f24c7 --- /dev/null +++ b/gnu/usr.bin/perl/pod/Makefile.SH @@ -0,0 +1,167 @@ +case $CONFIG in +'') + if test -f config.sh; then TOP=.; + elif test -f ../config.sh; then TOP=..; + elif test -f ../../config.sh; then TOP=../..; + elif test -f ../../../config.sh; then TOP=../../..; + elif test -f ../../../../config.sh; then TOP=../../../..; + else + echo "Can't find config.sh."; exit 1 + fi + . $TOP/config.sh + ;; +esac +: This forces SH files to create target in same directory as SH file. +: This is so that make depend always knows where to find SH derivatives. +case "$0" in +*/*) cd `expr X$0 : 'X\(.*\)/'` ;; +esac + +if test -d pod; then + cd pod || exit 1 +fi +POD=`echo *.pod` +MAN=`echo $POD|sed 's/\.pod/\.man/g'` +HTML=`echo $POD|sed 's/perltoc.pod//'|sed 's/\.pod/\.html/g'` +TEX=`echo $POD|sed 's/\.pod/\.tex/g'` + +echo "Extracting pod/Makefile (with variable substitutions)" +: This section of the file will have variable substitutions done on it. +: Move anything that needs config subs from !NO!SUBS! section to !GROK!THIS!. +: Protect any dollar signs and backticks that you do not want interpreted +: by putting a backslash in front. You may delete these comments. + +$spitshell >Makefile <<!GROK!THIS! +# pod/Makefile +# This file is derived from pod/Makefile.SH. Any changes made here will +# be lost the next time you run Configure. + +POD = $POD + +MAN = $MAN + +# no perltoc.html +HTML = $HTML + +TEX = $TEX + +!GROK!THIS! + +## In the following dollars and backticks do not need the extra backslash. +$spitshell >>Makefile <<'!NO!SUBS!' + +CONVERTERS = pod2html pod2latex pod2man pod2text checkpods \ + pod2usage podchecker podselect + +HTMLROOT = / # Change this to fix cross-references in HTML +POD2HTML = pod2html \ + --htmlroot=$(HTMLROOT) \ + --podroot=.. 
--podpath=pod:lib:ext:vms \ + --libpods=perlfunc:perlguts:perlvar:perlrun:perlop + +PERL = ../miniperl +PERLILIB = $(PERL) -I../lib +REALPERL = ../perl + +all: $(CONVERTERS) man + +converters: $(CONVERTERS) + +regen_pods: perlmodlib.pod toc + +buildtoc: buildtoc.PL perl.pod ../MANIFEST + $(PERLILIB) buildtoc.PL + +perltoc.pod: buildtoc + +man: pod2man $(MAN) + +html: pod2html $(HTML) + +tex: pod2latex $(TEX) + +toc: buildtoc + $(PERLILIB) buildtoc + +.SUFFIXES: .pm .pod + +.SUFFIXES: .man + +.pm.man: pod2man + $(PERL) -I../lib pod2man $*.pm >$*.man + +.pod.man: pod2man + $(PERL) -I../lib pod2man $*.pod >$*.man + +.SUFFIXES: .html + +.pm.html: pod2html + $(PERL) -I../lib $(POD2HTML) --infile=$*.pm --outfile=$*.html + +.pod.html: pod2html + $(PERL) -I../lib $(POD2HTML) --infile=$*.pod --outfile=$*.html + +.SUFFIXES: .tex + +.pm.tex: pod2latex + $(PERL) -I../lib pod2latex $*.pm + +.pod.tex: pod2latex + $(PERL) -I../lib pod2latex $*.pod + +clean: + rm -f $(MAN) + rm -f $(HTML) + rm -f $(TEX) + rm -f pod2html-*cache + rm -f *.aux *.log *.exe + +realclean: clean + rm -f $(CONVERTERS) + +distclean: realclean + +veryclean: distclean + -rm -f *~ *.orig + +check: checkpods + @echo "checking..."; \ + $(PERL) -I../lib checkpods $(POD) + +# Dependencies. 
+pod2latex: pod2latex.PL ../lib/Config.pm + $(PERL) -I../lib pod2latex.PL + +pod2html: pod2html.PL ../lib/Config.pm + $(PERL) -I ../lib pod2html.PL + +pod2man: pod2man.PL ../lib/Config.pm + $(PERL) -I ../lib pod2man.PL + +pod2text: pod2text.PL ../lib/Config.pm + $(PERL) -I ../lib pod2text.PL + +checkpods: checkpods.PL ../lib/Config.pm + $(PERL) -I ../lib checkpods.PL + +pod2usage: pod2usage.PL ../lib/Config.pm + $(PERL) -I ../lib pod2usage.PL + +podchecker: podchecker.PL ../lib/Config.pm + $(PERL) -I ../lib podchecker.PL + +podselect: podselect.PL ../lib/Config.pm + $(PERL) -I ../lib podselect.PL + +perlmodlib.pod: $(PERL) perlmodlib.PL ../mv-if-diff + rm -f perlmodlib.tmp + $(PERL) -I ../lib perlmodlib.PL + sh ../mv-if-diff perlmodlib.tmp perlmodlib.pod + +compile: all + $(REALPERL) -I../lib ../utils/perlcc -o pod2latex.exe pod2latex -log ../compilelog + $(REALPERL) -I../lib ../utils/perlcc -o pod2man.exe pod2man -log ../compilelog + $(REALPERL) -I../lib ../utils/perlcc -o pod2text.exe pod2text -log ../compilelog + $(REALPERL) -I../lib ../utils/perlcc -o checkpods.exe checkpods -log ../compilelog + +!NO!SUBS! diff --git a/gnu/usr.bin/perl/pod/buildtoc.PL b/gnu/usr.bin/perl/pod/buildtoc.PL new file mode 100644 index 00000000000..7c5a45018e8 --- /dev/null +++ b/gnu/usr.bin/perl/pod/buildtoc.PL @@ -0,0 +1,492 @@ +#!/usr/local/bin/perl + +use Config; +use File::Basename qw(&basename &dirname); +use Cwd; + +# List explicitly here the variables you want Configure to +# generate. Metaconfig only looks for shell variables, so you +# have to mention them as if they were shell variables, not +# %Config entries. Thus you write +# $startperl +# to ensure Configure will look for $Config{startperl}. + +# This forces PL files to create target in same directory as PL file. +# This is so that make depend always knows where to find PL derivatives. 
+$origdir = cwd; +chdir(dirname($0)); +($file = basename($0)) =~ s/\.PL$//; +$file =~ s/\.pl$// if ($^O eq 'os2' or $^O eq 'dos'); # "case-forgiving" +$file =~ s/\.pl$/.com/ if ($^O eq 'VMS'); # "case-forgiving" + +open OUT,">$file" or die "Can't create $file: $!"; + +print "Extracting $file (with variable substitutions)\n"; + +# In this section, perl variables will be expanded during extraction. +# You can use $Config{...} to use Configure variables. + +print OUT <<"!GROK!THIS!"; +$Config{'startperl'} + eval 'exec perl -S \$0 "\$@"' + if 0; +!GROK!THIS! + +# In the following, perl variables are not expanded during extraction. + +print OUT <<'!NO!SUBS!'; + +# +# buildtoc +# +# !!!!!!! DO NOT EDIT THIS FILE !!!!!!! +# This file is autogenerated by buildtoc.PL. +# Edit that file and run it to effect changes. +# +# Builds perltoc.pod and sanity checks the list of pods against all +# of the MANIFEST, perl.pod, and ourselves. +# + +use File::Find; +use Cwd; +use Text::Wrap; + +@PODS = glob("*.pod"); + +sub output ($); + +if (-d "pod") { + die "$0: failed to chdir('pod'): $!\n" unless chdir("pod"); +} + +@pods = qw( + perl + perlfaq + perltoc + perlbook + + perlsyn + perldata + perlop + perlsub + perlfunc + perlreftut + perldsc + perlrequick + perlpod + perlstyle + perltrap + + perlrun + perldiag + perllexwarn + perldebtut + perldebug + + perlvar + perllol + perlopentut + perlretut + + perlre + perlref + + perlform + + perlboot + perltoot + perltootc + perlobj + perlbot + perltie + + perlipc + perlfork + perlnumber + perlthrtut + + perlport + perllocale + perlunicode + perlebcdic + + perlsec + + perlmod + perlmodlib + perlmodinstall + perlnewmod + + perlfaq1 + perlfaq2 + perlfaq3 + perlfaq4 + perlfaq5 + perlfaq6 + perlfaq7 + perlfaq8 + perlfaq9 + + perlcompile + + perlembed + perldebguts + perlxstut + perlxs + perlclib + perlguts + perlcall + perlutil + perlfilter + perldbmfilter + perlapi + perlintern + perlapio + perltodo + perlhack + + perlhist + perldelta + 
perl5005delta + perl5004delta + + perlaix + perlamiga + perlbs2000 + perlcygwin + perldos + perlepoc + perlhpux + perlmachten + perlmacos + perlmpeix + perlos2 + perlos390 + perlsolaris + perlvmesa + perlvms + perlvos + perlwin32 + ); + +@ARCHPODS = qw( + perlaix + perlamiga + perlbs2000 + perlcygwin + perldos + perlepoc + perlhpux + perlmachten + perlmacos + perlmpeix + perlos2 + perlos390 + perlsolaris + perlvmesa + perlvms + perlvos + perlwin32 + ); +for (@ARCHPODS) { s/$/.pod/ } +@ARCHPODS{@ARCHPODS} = (); + +for (@pods) { s/$/.pod/ } +@pods{@pods} = (); +@PODS{@PODS} = (); + +open(MANI, "../MANIFEST") || die "$0: opening ../MANIFEST failed: $!"; +while (<MANI>) { + if (m!^pod/([^.]+\.pod)\s+!i) { + push @MANIPODS, $1; + } +} +close(MANI); +@MANIPODS{@MANIPODS} = (); + +open(PERLPOD, "perl.pod") || die "$0: opening perl.pod failed: $!\n"; +while (<PERLPOD>) { + if (/^For ease of access, /../^\(If you're intending /) { + if (/^\s+(perl\S*)\s+\w/) { + push @PERLPODS, "$1.pod"; + } + } +} +close(PERLPOD); +die "$0: could not find the pod listing of perl.pod\n" + unless @PERLPODS; +@PERLPODS{@PERLPODS} = (); + +# Cross-check against ourselves +# Cross-check against the MANIFEST +# Cross-check against the perl.pod + +foreach my $i (sort keys %PODS) { + warn "$0: $i exists but is unknown by buildtoc\n" + unless exists $pods{$i}; + warn "$0: $i exists but is unknown by ../MANIFEST\n" + if !exists $MANIPODS{$i} && !exists $ARCHPODS{$i}; + warn "$0: $i exists but is unknown by perl.pod\n" + unless exists $PERLPODS{$i}; +} +foreach my $i (sort keys %pods) { + warn "$0: $i is known by buildtoc but does not exist\n" + unless exists $PODS{$i}; +} +foreach my $i (sort keys %MANIPODS) { + warn "$0: $i is known by ../MANIFEST but does not exist\n" + unless exists $PODS{$i}; +} +foreach my $i (sort keys %PERLPODS) { + warn "$0: $i is known by perl.pod but does not exist\n" + unless exists $PODS{$i}; +} + +# We are ready to rock. 
+open(OUT, ">perltoc.pod") || die "$0: creating perltoc.pod failed: $!"; + +$/ = ''; +@ARGV = @pods; + +($_= <<EOPOD2B) =~ s/^\t//gm && output($_); + + =head1 NAME + + perltoc - perl documentation table of contents + + =head1 DESCRIPTION + + This page provides a brief table of contents for the rest of the Perl + documentation set. It is meant to be scanned quickly or grepped + through to locate the proper section you're looking for. + + =head1 BASIC DOCUMENTATION + +EOPOD2B +#' make emacs happy + +podset(@pods); + +find \&getpods => qw(../lib ../ext); + +sub getpods { + if (/\.p(od|m)$/) { + # Skip .pm files that have corresponding .pod files, and Functions.pm. + return if /(.*)\.pm$/ && -f "$1.pod"; + my $file = $File::Find::name; + return if $file eq '../lib/Pod/Functions.pm'; # Used only by pod itself + + die "tut $name" if $file =~ /TUT/; + unless (open (F, "< $_\0")) { + warn "bogus <$file>: $!"; + system "ls", "-l", $file; + } + else { + my $line; + while ($line = <F>) { + if ($line =~ /^=head1\s+NAME\b/) { + push @modpods, $file; + #warn "GOOD $file\n"; + return; + } + } + warn "$0: $file: cannot find =head1 NAME\n"; + } + } +} + +die "no pods" unless @modpods; + +for (@modpods) { + #($name) = /(\w+)\.p(m|od)$/; + $name = path2modname($_); + if ($name =~ /^[a-z]/) { + push @pragmata, $_; + } else { + if ($done{$name}++) { + # warn "already did $_\n"; + next; + } + push @modules, $_; + push @modname, $name; + } +} + +($_= <<EOPOD2B) =~ s/^\t//gm && output($_); + + + + =head1 PRAGMA DOCUMENTATION + +EOPOD2B + +podset(sort @pragmata); + +($_= <<EOPOD2B) =~ s/^\t//gm && output($_); + + + + =head1 MODULE DOCUMENTATION + +EOPOD2B + +podset( @modules[ sort { $modname[$a] cmp $modname[$b] } 0 .. 
$#modules ] ); + +($_= <<EOPOD2B) =~ s/^\t//gm; + + + =head1 AUXILIARY DOCUMENTATION + + Here should be listed all the extra programs' documentation, but they + don't all have manual pages yet: + + =over 4 + + =item a2p + + =item s2p + + =item find2perl + + =item h2ph + + =item c2ph + + =item h2xs + + =item xsubpp + + =item pod2man + + =item wrapsuid + + =back + + =head1 AUTHOR + + Larry Wall <F<larry\@wall.org>>, with the help of oodles + of other folks. + + +EOPOD2B +output $_; +output "\n"; # flush $LINE +exit; + +sub podset { + local @ARGV = @_; + + while(<>) { + if (s/^=head1 (NAME)\s*/=head2 /) { + $pod = path2modname($ARGV); + unhead1(); + output "\n \n\n=head2 "; + $_ = <>; + if ( /^\s*$pod\b/ ) { + s/$pod\.pm/$pod/; # '.pm' in NAME !? + output $_; + } else { + s/^/$pod, /; + output $_; + } + next; + } + if (s/^=head1 (.*)/=item $1/) { + unhead2(); + output "=over 4\n\n" unless $inhead1; + $inhead1 = 1; + output $_; nl(); next; + } + if (s/^=head2 (.*)/=item $1/) { + unitem(); + output "=over 4\n\n" unless $inhead2; + $inhead2 = 1; + output $_; nl(); next; + } + if (s/^=item ([^=].*)/$1/) { + next if $pod eq 'perldiag'; + s/^\s*\*\s*$// && next; + s/^\s*\*\s*//; + s/\n/ /g; + s/\s+$//; + next if /^[\d.]+$/; + next if $pod eq 'perlmodlib' && /^ftp:/; + ##print "=over 4\n\n" unless $initem; + output ", " if $initem; + $initem = 1; + s/\.$//; + s/^-X\b/-I<X>/; + output $_; next; + } + if (s/^=cut\s*\n//) { + unhead1(); + next; + } + } +} + +sub path2modname { + local $_ = shift; + s/\.p(m|od)$//; + s-.*?/(lib|ext)/--; + s-/-::-g; + s/(\w+)::\1/$1/; + return $_; +} + +sub unhead1 { + unhead2(); + if ($inhead1) { + output "\n\n=back\n\n"; + } + $inhead1 = 0; +} + +sub unhead2 { + unitem(); + if ($inhead2) { + output "\n\n=back\n\n"; + } + $inhead2 = 0; +} + +sub unitem { + if ($initem) { + output "\n\n"; + ##print "\n\n=back\n\n"; + } + $initem = 0; +} + +sub nl { + output "\n"; +} + +my $NEWLINE; # how many newlines have we seen recently +my $LINE; # what 
remains to be printed + +sub output ($) { + for (split /(\n)/, shift) { + if ($_ eq "\n") { + if ($LINE) { + print OUT wrap('', '', $LINE); + $LINE = ''; + } + if ($NEWLINE < 2) { + print OUT; + $NEWLINE++; + } + } + elsif (/\S/ && length) { + $LINE .= $_; + $NEWLINE = 0; + } + } +} + +!NO!SUBS! + +close OUT or die "Can't close $file: $!"; +chmod 0755, $file or die "Can't reset permissions for $file: $!\n"; +exec("$Config{'eunicefix'} $file") if $Config{'eunicefix'} ne ':'; +chdir $origdir; diff --git a/gnu/usr.bin/perl/pod/perl5004delta.pod b/gnu/usr.bin/perl/pod/perl5004delta.pod index 85a8f96161b..429cba93ced 100644 --- a/gnu/usr.bin/perl/pod/perl5004delta.pod +++ b/gnu/usr.bin/perl/pod/perl5004delta.pod @@ -24,7 +24,10 @@ problems. See the F<Changes> file in the distribution for details. C<%ENV = ()> and C<%ENV = @list> now work as expected (except on VMS where it generates a fatal error). -=head2 "Can't locate Foo.pm in @INC" error now lists @INC +=head2 Change to "Can't locate Foo.pm in @INC" error + +The error "Can't locate Foo.pm in @INC" now lists the contents of @INC +for easier debugging. =head2 Compilation option: Binary compatibility with 5.003 @@ -198,7 +201,7 @@ hole was just plugged. The new restrictions when tainting include: -=over +=over 4 =item No glob() or <*> @@ -258,7 +261,7 @@ the F<INSTALL> file for how to use it. =head2 New and changed syntax -=over +=over 4 =item $coderef->(PARAMS) @@ -276,7 +279,7 @@ S<C<< $table->{FOO}->($bar) >>>. =head2 New and changed builtin constants -=over +=over 4 =item __PACKAGE__ @@ -289,7 +292,7 @@ into strings. =head2 New and changed builtin variables -=over +=over 4 =item $^E @@ -322,7 +325,7 @@ there is no C<use English> long name for this variable. 
=head2 New and changed builtin functions -=over +=over 4 =item delete on slices @@ -544,7 +547,7 @@ subroutine: The C<UNIVERSAL> package automatically contains the following methods that are inherited by all other classes: -=over +=over 4 =item isa(CLASS) @@ -593,7 +596,7 @@ have C<isa> available as a plain subroutine in the current package. See L<perltie> for other kinds of tie()s. -=over +=over 4 =item TIEHANDLE classname, LIST @@ -687,7 +690,7 @@ install the optional module Devel::Peek.) Three new compilation flags are recognized by malloc.c. (They have no effect if perl is compiled with system malloc().) -=over +=over 4 =item -DPERL_EMERGENCY_SBRK @@ -779,7 +782,7 @@ See F<README.amigaos> in the perl distribution. Six new pragmatic modules exist: -=over +=over 4 =item use autouse MODULE => qw(sub1 sub2 sub3) @@ -810,7 +813,7 @@ builtin operations. When C<use locale> is in effect, the current LC_CTYPE locale is used for regular expressions and case mapping; LC_COLLATE for string -ordering; and LC_NUMERIC for numeric formating in printf and sprintf +ordering; and LC_NUMERIC for numeric formatting in printf and sprintf (but B<not> in print). LC_NUMERIC is always used in write, since lexical scoping of formats is problematic at best. @@ -979,7 +982,7 @@ those who need trigonometric functions only for real numbers. There have been quite a few changes made to DB_File. Here are a few of the highlights: -=over +=over 4 =item * @@ -1045,7 +1048,7 @@ For example, you can now say =head2 pod2html -=over +=over 4 =item Sends converted HTML to standard output @@ -1058,7 +1061,7 @@ Use the B<--outfile=FILENAME> option to write to a file. =head2 xsubpp -=over +=over 4 =item C<void> XSUBs now default to returning nothing @@ -1083,7 +1086,7 @@ XSUB's return type is really C<SV *>. =head1 C Language API Changes -=over +=over 4 =item C<gv_fetchmethod> and C<perl_call_sv> @@ -1124,7 +1127,7 @@ which can be more efficient. See L<perlguts> for details. 
Many of the base and library pods were updated. These new pods are included in section 1: -=over +=over 4 =item L<perldelta> @@ -1177,7 +1180,7 @@ increasing order of desperation): (X) A very fatal error (nontrappable). (A) An alien error message (not generated by Perl). -=over +=over 4 =item "my" variable %s masks earlier declaration in same scope @@ -1429,7 +1432,7 @@ assigning to it and when evaluating its argument, while C<@foo{&bar}> behaves like a list when you assign to it, and provides a list context to its subscript, which can do weird things if you're expecting only one subscript. -=item Stub found while resolving method `%s' overloading `%s' in package `%s' +=item Stub found while resolving method `%s' overloading `%s' in %s (P) Overloading resolution over @ISA tree may be broken by importing stubs. Stubs should never be implicitly created, but explicit calls to C<can> diff --git a/gnu/usr.bin/perl/pod/perl5005delta.pod b/gnu/usr.bin/perl/pod/perl5005delta.pod index b133c0dd813..78bf90f616b 100644 --- a/gnu/usr.bin/perl/pod/perl5005delta.pod +++ b/gnu/usr.bin/perl/pod/perl5005delta.pod @@ -63,11 +63,15 @@ the new features in this release. =over 4 -=item Core sources now require ANSI C compiler +=item * + +Core sources now require ANSI C compiler An ANSI C compiler is now B<required> to build perl. See F<INSTALL>. -=item All Perl global variables must now be referenced with an explicit prefix +=item * + +All Perl global variables must now be referenced with an explicit prefix All Perl global variables that are visible for use by extensions now have a C<PL_> prefix. New extensions should C<not> refer to perl globals @@ -87,7 +91,9 @@ support may cease in a future release. See L<perlguts/"API LISTING">. -=item Enabling threads has source compatibility issues +=item * + +Enabling threads has source compatibility issues Perl built with threading enabled requires extensions to use the new C<dTHR> macro to initialize the handle to access per-thread data. 
@@ -525,7 +531,7 @@ The hints files for most Unix platforms have seen incremental improvements. =head2 New Modules -=over +=over 4 =item B @@ -596,13 +602,15 @@ Various pragmata to control behavior of regular expressions. =head2 Changes in existing modules -=over +=over 4 =item Benchmark You can now run tests for I<x> seconds instead of guessing the right number of tests to run. +Keeps better time. + =item Carp Carp has a new function cluck(). cluck() warns, like carp(), but also adds @@ -660,10 +668,6 @@ See <perlmodinstall> and L<CPAN>. Cwd::cwd is faster on most platforms. -=item Benchmark - -Keeps better time. - =back =head1 Utility Changes @@ -702,7 +706,7 @@ L<perlthrtut> gives a tutorial on threads. =head1 New Diagnostics -=over +=over 4 =item Ambiguous call resolved as CORE::%s(), qualify as such or use & @@ -859,7 +863,7 @@ are outside the range which can be represented by integers internally. One possible workaround is to force Perl to use magical string increment by prepending "0" to your numbers. -=item Recursive inheritance detected while looking for method '%s' in package '%s' +=item Recursive inheritance detected while looking for method '%s' %s (F) More than 100 levels of inheritance were encountered while invoking a method. Probably indicates an unintended loop in your inheritance hierarchy. @@ -916,7 +920,7 @@ fix the problem can be found in L<perllocale/"LOCALE PROBLEMS">. =head1 Obsolete Diagnostics -=over +=over 4 =item Can't mktemp() diff --git a/gnu/usr.bin/perl/pod/perlboot.pod b/gnu/usr.bin/perl/pod/perlboot.pod index b549f45e490..3c18246f0ca 100644 --- a/gnu/usr.bin/perl/pod/perlboot.pod +++ b/gnu/usr.bin/perl/pod/perlboot.pod @@ -790,9 +790,13 @@ Hopefully, this gets you started, though. 
For more information, see L<perlobj> (for all the gritty details about Perl objects, now that you've seen the basics), L<perltoot> (the -tutorial for those who already know objects), L<perlbot> (for some -more tricks), and books such as Damian Conway's excellent I<Object -Oriented Perl>. +tutorial for those who already know objects), L<perltootc> (dealing +with class data), L<perlbot> (for some more tricks), and books such as +Damian Conway's excellent I<Object Oriented Perl>. + +Some modules which might prove interesting are Class::Accessor, +Class::Class, Class::Contract, Class::Data::Inheritable, +Class::MethodMaker and Tie::SecureHash =head1 COPYRIGHT diff --git a/gnu/usr.bin/perl/pod/perlclib.pod b/gnu/usr.bin/perl/pod/perlclib.pod new file mode 100644 index 00000000000..a0f4a80eecd --- /dev/null +++ b/gnu/usr.bin/perl/pod/perlclib.pod @@ -0,0 +1,197 @@ +=head1 NAME + +perlclib - Internal replacements for standard C library functions + +=head1 DESCRIPTION + +One thing Perl porters should note is that F<perl> doesn't tend to use that +much of the C standard library internally; you'll see very little use of, +for example, the F<ctype.h> functions in there. This is because Perl +tends to reimplement or abstract standard library functions, so that we +know exactly how they're going to operate. + +This is a reference card for people who are familiar with the C library +and who want to do things the Perl way; to tell them which functions +they ought to use instead of the more normal C functions. + +=head2 Conventions + +In the following tables: + +=over 3 + +=item C<t> + +is a type. + +=item C<p> + +is a pointer. + +=item C<n> + +is a number. + +=item C<s> + +is a string. + +=back + +C<sv>, C<av>, C<hv>, etc. represent variables of their respective types. + +=head2 File Operations + +Instead of the F<stdio.h> functions, you should use the Perl abstraction +layer. Instead of C<FILE*> types, you need to be handling C<PerlIO*> +types. 
Don't forget that with the new PerlIO layered I/O abstraction +C<FILE*> types may not even be available. See also the C<perlapio> +documentation for more information about the following functions: + + Instead Of: Use: + + stdin PerlIO_stdin() + stdout PerlIO_stdout() + stderr PerlIO_stderr() + + fopen(fn, mode) PerlIO_open(fn, mode) + freopen(fn, mode, stream) PerlIO_reopen(fn, mode, perlio) (Deprecated) + fflush(stream) PerlIO_flush(perlio) + fclose(stream) PerlIO_close(perlio) + +=head2 File Input and Output + + Instead Of: Use: + + fprintf(stream, fmt, ...) PerlIO_printf(perlio, fmt, ...) + + [f]getc(stream) PerlIO_getc(perlio) + [f]putc(stream, n) PerlIO_putc(perlio, n) + ungetc(n, stream) PerlIO_ungetc(perlio, n) + +Note that the PerlIO equivalents of C<fread> and C<fwrite> are slightly +different from their C library counterparts: + + fread(p, size, n, stream) PerlIO_read(perlio, buf, numbytes) + fwrite(p, size, n, stream) PerlIO_write(perlio, buf, numbytes) + + fputs(s, stream) PerlIO_puts(perlio, s) + +There is no equivalent to C<fgets>; one should use C<sv_gets> instead: + + fgets(s, n, stream) sv_gets(sv, perlio, append) + +=head2 File Positioning + + Instead Of: Use: + + feof(stream) PerlIO_eof(perlio) + fseek(stream, n, whence) PerlIO_seek(perlio, n, whence) + rewind(stream) PerlIO_rewind(perlio) + + fgetpos(stream, p) PerlIO_getpos(perlio, sv) + fsetpos(stream, p) PerlIO_setpos(perlio, sv) + + ferror(stream) PerlIO_error(perlio) + clearerr(stream) PerlIO_clearerr(perlio) + +=head2 Memory Management and String Handling + + Instead Of: Use: + + t* p = malloc(n) New(id, p, n, t) + t* p = calloc(n, s) Newz(id, p, n, t) + p = realloc(p, n) Renew(p, n, t) + memcpy(dst, src, n) Copy(src, dst, n, t) + memmove(dst, src, n) Move(src, dst, n, t) + memcpy/*(struct foo *) StructCopy(src, dst, t) + free(p) Safefree(p) + + strdup(p) savepv(p) + strndup(p, n) savepvn(p, n) (Hey, strndup doesn't exist!) 
+ + strstr(big, little) instr(big, little) + strcmp(s1, s2) strLE(s1, s2) / strEQ(s1, s2) / strGT(s1,s2) + strncmp(s1, s2, n) strnNE(s1, s2, n) / strnEQ(s1, s2, n) + +Notice the different order of arguments to C<Copy> and C<Move> than used +in C<memcpy> and C<memmove>. + +Most of the time, though, you'll want to be dealing with SVs internally +instead of raw C<char *> strings: + + strlen(s) sv_len(sv) + strcpy(dt, src) sv_setpv(sv, s) + strncpy(dt, src, n) sv_setpvn(sv, s, n) + strcat(dt, src) sv_catpv(sv, s) + strncat(dt, src) sv_catpvn(sv, s) + sprintf(s, fmt, ...) sv_setpvf(sv, fmt, ...) + +Note also the existence of C<sv_catpvf> and C<sv_catpvfn>, combining +concatenation with formatting. + +=head2 Character Class Tests + +There are two types of character class tests that Perl implements: one +type deals in C<char>s and are thus B<not> Unicode aware (and hence +deprecated unless you B<know> you should use them) and the other type +deal in C<UV>s and know about Unicode properties. In the following +table, C<c> is a C<char>, and C<u> is a Unicode codepoint. + + Instead Of: Use: But better use: + + isalnum(c) isALNUM(c) isALNUM_uni(u) + isalpha(c) isALPHA(c) isALPHA_uni(u) + iscntrl(c) isCNTRL(c) isCNTRL_uni(u) + isdigit(c) isDIGIT(c) isDIGIT_uni(u) + isgraph(c) isGRAPH(c) isGRAPH_uni(u) + islower(c) isLOWER(c) isLOWER_uni(u) + isprint(c) isPRINT(c) isPRINT_uni(u) + ispunct(c) isPUNCT(c) isPUNCT_uni(u) + isspace(c) isSPACE(c) isSPACE_uni(u) + isupper(c) isUPPER(c) isUPPER_uni(u) + isxdigit(c) isXDIGIT(c) isXDIGIT_uni(u) + + tolower(c) toLOWER(c) toLOWER_uni(u) + toupper(c) toUPPER(c) toUPPER_uni(u) + +=head2 F<stdlib.h> functions + + Instead Of: Use: + + atof(s) Atof(s) + atol(s) Atol(s) + strtod(s, *p) Nothing. Just don't use it. 
+ strtol(s, *p, n) Strtol(s, *p, n) + strtoul(s, *p, n) Strtoul(s, *p, n) + +Notice also the C<scan_bin>, C<scan_hex>, and C<scan_oct> functions in +F<util.c> for converting strings representing numbers in the respective +bases into C<NV>s. + +In theory C<Strtol> and C<Strtoul> may not be defined if the machine perl is +built on doesn't actually have strtol and strtoul. But as those 2 +functions are part of the 1989 ANSI C spec we suspect you'll find them +everywhere by now. + + int rand() double Drand01() + srand(n) { seedDrand01((Rand_seed_t)n); + PL_srand_called = TRUE; } + + exit(n) my_exit(n) + system(s) Don't. Look at pp_system or use my_popen + + getenv(s) PerlEnv_getenv(s) + setenv(s, val) my_putenv(s, val) + +=head2 Miscellaneous functions + +You should not even B<want> to use F<setjmp.h> functions, but if you +think you do, use the C<JMPENV> stack in F<scope.h> instead. + +For C<signal>/C<sigaction>, use C<rsignal(signo, handler)>. + +=head1 SEE ALSO + +C<perlapi>, C<perlapio>, C<perlguts> + diff --git a/gnu/usr.bin/perl/pod/perlcompile.pod b/gnu/usr.bin/perl/pod/perlcompile.pod index 697cb80d409..282592e9fb1 100644 --- a/gnu/usr.bin/perl/pod/perlcompile.pod +++ b/gnu/usr.bin/perl/pod/perlcompile.pod @@ -183,9 +183,6 @@ one-liners: rename $was, $_ unless $was eq $_; } -(this is the I<rename> program that comes in the I<eg/> directory -of the Perl source distribution). - The decompiler has several options for the code it generates. For instance, you can set the size of each indent from 4 (as above) to 2 with: @@ -308,7 +305,7 @@ I<assemble> program that produces bytecode. This module is used by the B::CC back end. It walks "basic blocks". A basic block is a series of operations which is known to execute from -start to finish, with no possiblity of branching or halting. +start to finish, with no possibility of branching or halting. =item B::Bytecode @@ -369,12 +366,12 @@ can identify. See L</"The Lint Back End"> for details about usage. 
=item B::Showlex This module prints out the my() variables used in a function or a -file. To gt a list of the my() variables used in the subroutine +file. To get a list of the my() variables used in the subroutine mysub() defined in the file myperlprogram: $ perl -MO=Showlex,mysub myperlprogram -To gt a list of the my() variables used in the file myperlprogram: +To get a list of the my() variables used in the file myperlprogram: $ perl -MO=Showlex myperlprogram @@ -419,7 +416,7 @@ names. The optimized C backend outputs code for more modules than it should (e.g., DirHandle). It also has little hope of properly handling -C<goto LABEL> outside the running subroutine (C<goto &sub> is ok). +C<goto LABEL> outside the running subroutine (C<goto &sub> is okay). C<goto LABEL> currently does not work at all in this backend. It also creates a huge initialization function that gives C compilers headaches. Splitting the initialization function gives diff --git a/gnu/usr.bin/perl/pod/perldbmfilter.pod b/gnu/usr.bin/perl/pod/perldbmfilter.pod index 3350596aab8..8384999e6a7 100644 --- a/gnu/usr.bin/perl/pod/perldbmfilter.pod +++ b/gnu/usr.bin/perl/pod/perldbmfilter.pod @@ -124,7 +124,7 @@ Here is another real-life example. By default, whenever Perl writes to a DBM database it always writes the key and value as strings. So when you use this: - $hash{12345} = "soemthing" ; + $hash{12345} = "something" ; the key 12345 will get stored in the DBM database as the 5 byte string "12345". If you actually want the key to be stored in the DBM database diff --git a/gnu/usr.bin/perl/pod/perldebguts.pod b/gnu/usr.bin/perl/pod/perldebguts.pod index b74f3efb6ba..20cc5460fd4 100644 --- a/gnu/usr.bin/perl/pod/perldebguts.pod +++ b/gnu/usr.bin/perl/pod/perldebguts.pod @@ -13,17 +13,17 @@ intimate with Perl's guts to understand. Caveat lector. Perl has special debugging hooks at compile-time and run-time used to create debugging environments. 
These hooks are not to be confused -with the I<perl -Dxxx> command described in L<perlrun>, which are -usable only if a special Perl built per the instructions the +with the I<perl -Dxxx> command described in L<perlrun>, which is +usable only if a special Perl is built per the instructions in the F<INSTALL> podpage in the Perl source tree. For example, whenever you call Perl's built-in C<caller> function from the package DB, the arguments that the corresponding stack -frame was called with are copied to the the @DB::args array. The +frame was called with are copied to the @DB::args array. The general mechanisms is enabled by calling Perl with the B<-d> switch, the following additional features are enabled (cf. L<perlvar/$^P>): -=over +=over 4 =item * @@ -32,20 +32,22 @@ Perl inserts the contents of C<$ENV{PERL5DB}> (or C<BEGIN {require =item * -The array C<@{"_<$filename"}> holds the lines of $filename for all -files compiled by Perl. The same for C<eval>ed strings that contain +Each array C<@{"_<$filename"}> holds the lines of $filename for a +file compiled by Perl. The same for C<eval>ed strings that contain subroutines, or which are currently being executed. The $filename for C<eval>ed strings looks like C<(eval 34)>. Code assertions -in regexes look like C<(re_eval 19)>. +in regexes look like C<(re_eval 19)>. + +Values in this array are magical in numeric context: they compare +equal to zero only if the line is not breakable. =item * -The hash C<%{"_<$filename"}> contains breakpoints and actions keyed +Each hash C<%{"_<$filename"}> contains breakpoints and actions keyed by line number. Individual entries (as opposed to the whole hash) are settable. Perl only cares about Boolean true here, although the values used by F<perl5db.pl> have the form -C<"$break_condition\0$action">. Values in this hash are magical -in numeric context: they are zeros if the line is not breakable. +C<"$break_condition\0$action">. 
The same holds for evaluated strings that contain subroutines, or which are currently being executed. The $filename for C<eval>ed strings @@ -53,7 +55,7 @@ looks like C<(eval 34)> or C<(re_eval 19)>. =item * -The scalar C<${"_<$filename"}> contains C<"_<$filename">. This is +Each scalar C<${"_<$filename"}> contains C<"_<$filename">. This is also the case for evaluated strings that contain subroutines, or which are currently being executed. The $filename for C<eval>ed strings looks like C<(eval 34)> or C<(re_eval 19)>. @@ -154,7 +156,7 @@ L<perldebug/"Options"> for description of options parsed by C<DB::parse_options(string)>. The function C<DB::dump_trace(skip[, count])> skips the specified number of frames and returns a list containing information about the calling frames (all of them, if -C<count> is missing). Each entry is reference to a a hash with +C<count> is missing). Each entry is reference to a hash with keys C<context> (either C<.>, C<$>, or C<@>), C<sub> (subroutine name, or info about C<eval>), C<args> (C<undef> or a reference to an array), C<file>, and C<line>. @@ -400,7 +402,7 @@ shorter than 7 chars. The fields of interest which may appear in the last line are -=over +=over 4 =item C<anchored> I<STRING> C<at> I<POS> @@ -630,7 +632,7 @@ Perl is a profligate wastrel when it comes to memory use. There is a saying that to estimate memory usage of Perl, assume a reasonable algorithm for memory allocation, multiply that estimate by 10, and while you still may miss the mark, at least you won't be quite so -astonished. This is not absolutely true, but may prvide a good +astonished. This is not absolutely true, but may provide a good grasp of what happens. Assume that an integer cannot take less than 20 bytes of memory, a @@ -639,7 +641,7 @@ than 32 bytes (all these examples assume 32-bit architectures, the result are quite a bit worse on 64-bit architectures). 
If a variable is accessed in two of three different ways (which require an integer, a float, or a string), the memory footprint may increase yet another -20 bytes. A sloppy malloc(3) implementation can make inflate these +20 bytes. A sloppy malloc(3) implementation can inflate these numbers dramatically. On the opposite end of the scale, a declaration like @@ -666,7 +668,7 @@ the top level of the Perl source tree. If your perl is using Perl's malloc() and was compiled with the necessary switches (this is the default), then it will print memory -usage statistics after compiling your code hwen C<< $ENV{PERL_DEBUG_MSTATS} +usage statistics after compiling your code when C<< $ENV{PERL_DEBUG_MSTATS} > 1 >>, and before termination of the program when C<< $ENV{PERL_DEBUG_MSTATS} >= 1 >>. The report format is similar to the following example: @@ -686,12 +688,12 @@ the following example: Total sbrk(): 215040/47:145. Odd ends: pad+heads+chain+tail: 0+2192+0+6144. It is possible to ask for such a statistic at arbitrary points in -your execution using the mstats() function out of the standard +your execution using the mstat() function out of the standard Devel::Peek module. Here is some explanation of that format: -=over +=over 4 =item C<buckets SMALLEST(APPROX)..GREATEST(APPROX)> @@ -720,7 +722,7 @@ of two--or possibly one page greater. In the second row, if present, the memory footprints of the buckets are between the memory footprints of two buckets "above". -For example, suppose under the pervious example, the memory footprints +For example, suppose under the previous example, the memory footprints were free: 8 16 32 64 128 256 512 1024 2048 4096 8192 @@ -804,7 +806,7 @@ To see this list, insert two C<warn('!...')> statements around the call: do 'lib/auto/POSIX/autosplit.ix'; warn('!!! "after"'); -and run it with PErl's B<-DL> option. The first warn() will print +and run it with Perl's B<-DL> option. 
The first warn() will print memory allocation info before parsing the file and will memorize the statistics at this point (we ignore what it prints). The second warn() prints increments with respect to these memorized data. This @@ -838,11 +840,11 @@ per glob - for glob name, and glob stringification magic. Here are explanations for other I<Id>s above: -=over +=over 4 =item C<717> -CReates bigger C<XPV*> structures. In the case above, it +Creates bigger C<XPV*> structures. In the case above, it creates 3 C<AV>s per subroutine, one for a list of lexical variable names, one for a scratchpad (which contains lexical variables and C<targets>), and one for the array of scratchpads needed for @@ -892,7 +894,7 @@ these categories. If warn() string starts with -=over +=over 4 =item C<!!!> diff --git a/gnu/usr.bin/perl/pod/perldebtut.pod b/gnu/usr.bin/perl/pod/perldebtut.pod new file mode 100644 index 00000000000..e11102e5676 --- /dev/null +++ b/gnu/usr.bin/perl/pod/perldebtut.pod @@ -0,0 +1,721 @@ +=head1 NAME + +perldebtut - Perl debugging tutorial + +=head1 DESCRIPTION + +A (very) lightweight introduction in the use of the perl debugger, and a +pointer to existing, deeper sources of information on the subject of debugging +perl programs. + +There's an extraordinary number of people out there who don't appear to know +anything about using the perl debugger, though they use the language every +day. +This is for them. + + +=head1 use strict + +First of all, there's a few things you can do to make your life a lot more +straightforward when it comes to debugging perl programs, without using the +debugger at all. 
To demonstrate, here's a simple script with a problem: + + #!/usr/bin/perl + + $var1 = 'Hello World'; # always wanted to do that :-) + $var2 = "$varl\n"; + + print $var2; + exit; + +While this compiles and runs happily, it probably won't do what's expected, +namely it doesn't print "Hello World\n" at all; it will on the other hand do +exactly what it was told to do, computers being a bit that way inclined. That +is, it will print out a newline character, and you'll get what looks like a +blank line. It looks like there's 2 variables when (because of the typo) +there's really 3: + + $var1 = 'Hello World' + $varl = undef + $var2 = "\n" + +To catch this kind of problem, we can force each variable to be declared +before use by pulling in the strict module, by putting 'use strict;' after the +first line of the script. + +Now when you run it, perl complains about the 3 undeclared variables and we +get four error messages because one variable is referenced twice: + + Global symbol "$var1" requires explicit package name at ./t1 line 4. + Global symbol "$var2" requires explicit package name at ./t1 line 5. + Global symbol "$varl" requires explicit package name at ./t1 line 5. + Global symbol "$var2" requires explicit package name at ./t1 line 7. + Execution of ./hello aborted due to compilation errors. + +Luvverly! and to fix this we declare all variables explicitly and now our +script looks like this: + + #!/usr/bin/perl + use strict; + + my $var1 = 'Hello World'; + my $varl = ''; + my $var2 = "$varl\n"; + + print $var2; + exit; + +We then do (always a good idea) a syntax check before we try to run it again: + + > perl -c hello + hello syntax OK + +And now when we run it, we get "\n" still, but at least we know why. Just +getting this script to compile has exposed the '$varl' (with the letter 'l') +variable, and simply changing $varl to $var1 solves the problem.
+ + +=head1 Looking at data and -w and w + +Ok, but how about when you want to really see your data, what's in that +dynamic variable, just before using it? + + #!/usr/bin/perl + use strict; + + my $key = 'welcome'; + my %data = ( + 'this' => qw(that), + 'tom' => qw(and jerry), + 'welcome' => q(Hello World), + 'zip' => q(welcome), + ); + my @data = keys %data; + + print "$data{$key}\n"; + exit; + +Looks OK, after it's been through the syntax check (perl -c scriptname), we +run it and all we get is a blank line again! Hmmmm. + +One common debugging approach here, would be to liberally sprinkle a few print +statements, to add a check just before we print out our data, and another just +after: + + print "All OK\n" if grep($key, keys %data); + print "$data{$key}\n"; + print "done: '$data{$key}'\n"; + +And try again: + + > perl data + All OK + + done: '' + +After much staring at the same piece of code and not seeing the wood for the +trees for some time, we get a cup of coffee and try another approach. That +is, we bring in the cavalry by giving perl the 'B<-d>' switch on the command +line: + + > perl -d data + Default die handler restored. + + Loading DB routines from perl5db.pl version 1.07 + Editor support available. + + Enter h or `h h' for help, or `man perldebug' for more help. + + main::(./data:4): my $key = 'welcome'; + +Now, what we've done here is to launch the built-in perl debugger on our +script. It's stopped at the first line of executable code and is waiting for +input. + +Before we go any further, you'll want to know how to quit the debugger: use +just the letter 'B<q>', not the words 'quit' or 'exit': + + DB<1> q + > + +That's it, you're back on home turf again. + + +=head1 help + +Fire the debugger up again on your script and we'll look at the help menu. 
+There's a couple of ways of calling help: a simple 'B<h>' will get you a long +scrolled list of help, 'B<|h>' (pipe-h) will pipe the help through your pager +('more' or 'less' probably), and finally, 'B<h h>' (h-space-h) will give you a +helpful mini-screen snapshot: + + DB<1> h h + List/search source lines: Control script execution: + l [ln|sub] List source code T Stack trace + - or . List previous/current line s [expr] Single step [in expr] + w [line] List around line n [expr] Next, steps over subs + f filename View source in file <CR/Enter> Repeat last n or s + /pattern/ ?patt? Search forw/backw r Return from subroutine + v Show versions of modules c [ln|sub] Continue until position + Debugger controls: L List +break/watch/actions + O [...] Set debugger options t [expr] Toggle trace [trace expr] + <[<]|{[{]|>[>] [cmd] Do pre/post-prompt b [ln|event|sub] [cnd] Set breakpoint + ! [N|pat] Redo a previous command d [ln] or D Delete a/all breakpoints + H [-num] Display last num commands a [ln] cmd Do cmd before line + = [a val] Define/list an alias W expr Add a watch expression + h [db_cmd] Get help on command A or W Delete all actions/watch + |[|]db_cmd Send output to pager ![!] syscmd Run cmd in a subprocess + q or ^D Quit R Attempt a restart + Data Examination: expr Execute perl code, also see: s,n,t expr + x|m expr Evals expr in list context, dumps the result or lists methods. + p expr Print expression (uses script's current package). + S [[!]pat] List subroutine names [not] matching pattern + V [Pk [Vars]] List Variables in Package. Vars can be ~pattern or !pattern. + X [Vars] Same as "V current_package [Vars]". + For more help, type h cmd_letter, or run man perldebug for all docs. + +More confusing options than you can shake a big stick at! It's not as bad as +it looks and it's very useful to know more about all of it, and fun too! + +There's a couple of useful ones to know about straight away. 
You wouldn't +think we're using any libraries at all at the moment, but 'B<v>' will show +which modules are currently loaded, by the debugger as well as your script. +'B<V>' and 'B<X>' show variables in the program by package scope and can be +constrained by pattern. 'B<m>' shows methods and 'B<S>' shows all subroutines +(by pattern): + + DB<2>S str + dumpvar::stringify + strict::bits + strict::import + strict::unimport + +Using 'X' and cousins requires you not to use the type identifiers ($@%), just +the 'name': + + DB<3>X ~err + FileHandle(stderr) => fileno(2) + +Remember we're in our tiny program with a problem, we should have a look at +where we are, and what our data looks like. First of all let's have a window +on our present position (the first line of code in this case), via the letter +'B<w>': + + DB<4> w + 1 #!/usr/bin/perl + 2: use strict; + 3 + 4==> my $key = 'welcome'; + 5: my %data = ( + 6 'this' => qw(that), + 7 'tom' => qw(and jerry), + 8 'welcome' => q(Hello World), + 9 'zip' => q(welcome), + 10 ); + +At line number 4 is a helpful pointer, that tells you where you are now. To +see more code, type 'w' again: + + DB<4> w + 8 'welcome' => q(Hello World), + 9 'zip' => q(welcome), + 10 ); + 11: my @data = keys %data; + 12: print "All OK\n" if grep($key, keys %data); + 13: print "$data{$key}\n"; + 14: print "done: '$data{$key}'\n"; + 15: exit; + +And if you wanted to list line 5 again, type 'l 5', (note the space): + + DB<4> l 5 + 5: my %data = ( + +In this case, there's not much to see, but of course normally there's pages of +stuff to wade through, and 'l' can be very useful. To reset your view to the +line we're about to execute, type a lone period '.': + + DB<5> . + main::(./data_a:4): my $key = 'welcome'; + +The line shown is the one that is about to be executed B<next>, it hasn't +happened yet. So while we can print a variable with the letter 'B<p>', at +this point all we'd get is an empty (undefined) value back.
What we need to +do is to step through the next executable statement with an 'B<s>': + + DB<6> s + main::(./data_a:5): my %data = ( + main::(./data_a:6): 'this' => qw(that), + main::(./data_a:7): 'tom' => qw(and jerry), + main::(./data_a:8): 'welcome' => q(Hello World), + main::(./data_a:9): 'zip' => q(welcome), + main::(./data_a:10): ); + +Now we can have a look at that first ($key) variable: + + DB<7> p $key + welcome + +line 13 is where the action is, so let's continue down to there via the letter +'B<c>', which by the way, inserts a 'one-time-only' breakpoint at the given +line or sub routine: + + DB<8> c 13 + All OK + main::(./data_a:13): print "$data{$key}\n"; + +We've gone past our check (where 'All OK' was printed) and have stopped just +before the meat of our task. We could try to print out a couple of variables +to see what is happening: + + DB<9> p $data{$key} + +Not much in there, lets have a look at our hash: + + DB<10> p %data + Hello Worldziptomandwelcomejerrywelcomethisthat + + DB<11> p keys %data + Hello Worldtomwelcomejerrythis + +Well, this isn't very easy to read, and using the helpful manual (B<h h>), the +'B<x>' command looks promising: + + DB<12> x %data + 0 'Hello World' + 1 'zip' + 2 'tom' + 3 'and' + 4 'welcome' + 5 undef + 6 'jerry' + 7 'welcome' + 8 'this' + 9 'that' + +That's not much help, a couple of welcomes in there, but no indication of +which are keys, and which are values, it's just a listed array dump and, in +this case, not particularly helpful. The trick here, is to use a B<reference> +to the data structure: + + DB<13> x \%data + 0 HASH(0x8194bc4) + 'Hello World' => 'zip' + 'jerry' => 'welcome' + 'this' => 'that' + 'tom' => 'and' + 'welcome' => undef + +The reference is truly dumped and we can finally see what we're dealing with. +Our quoting was perfectly valid but wrong for our purposes, with 'and jerry' +being treated as 2 separate words rather than a phrase, thus throwing the +evenly paired hash structure out of alignment. 
+ +The 'B<-w>' switch would have told us about this, had we used it at the start, +and saved us a lot of trouble: + + > perl -w data + Odd number of elements in hash assignment at ./data line 5. + +We fix our quoting: 'tom' => q(and jerry), and run it again, this time we get +our expected output: + + > perl -w data + Hello World + + +While we're here, take a closer look at the 'B<x>' command, it's really useful +and will merrily dump out nested references, complete objects, partial objects +- just about whatever you throw at it: + +Let's make a quick object and x-plode it, first we'll start the debugger: +it wants some form of input from STDIN, so we give it something non-committal, +a zero: + + > perl -de 0 + Default die handler restored. + + Loading DB routines from perl5db.pl version 1.07 + Editor support available. + + Enter h or `h h' for help, or `man perldebug' for more help. + + main::(-e:1): 0 + +Now build an on-the-fly object over a couple of lines (note the backslash): + + DB<1> $obj = bless({'unique_id'=>'123', 'attr'=> \ + cont: {'col' => 'black', 'things' => [qw(this that etc)]}}, 'MY_class') + +And let's have a look at it: + + DB<2> x $obj + 0 MY_class=HASH(0x828ad98) + 'attr' => HASH(0x828ad68) + 'col' => 'black' + 'things' => ARRAY(0x828abb8) + 0 'this' + 1 'that' + 2 'etc' + 'unique_id' => 123 + DB<3> + +Useful, huh?
You can eval nearly anything in there, and experiment with bits +of code or regexes until the cows come home: + + DB<3> @data = qw(this that the other atheism leather theory scythe) + + DB<4> p 'saw -> '.($cnt += map { print "\t:\t$_\n" } grep(/the/, sort @data)) + atheism + leather + other + scythe + the + theory + saw -> 6 + +If you want to see the command History, type an 'B<H>': + + DB<5> H + 4: p 'saw -> '.($cnt += map { print "\t:\t$_\n" } grep(/the/, sort @data)) + 3: @data = qw(this that the other atheism leather theory scythe) + 2: x $obj + 1: $obj = bless({'unique_id'=>'123', 'attr'=> + {'col' => 'black', 'things' => [qw(this that etc)]}}, 'MY_class') + DB<5> + +And if you want to repeat any previous command, use the exclamation: 'B<!>': + + DB<5> !4 + p 'saw -> '.($cnt += map { print "$_\n" } grep(/the/, sort @data)) + atheism + leather + other + scythe + the + theory + saw -> 12 + +For more on references see L<perlref> and L<perlreftut> + + +=head1 Stepping through code + +Here's a simple program which converts between Celsius and Fahrenheit, it too +has a problem: + + #!/usr/bin/perl -w + use strict; + + my $arg = $ARGV[0] || '-c20'; + + if ($arg =~ /^\-(c|f)((\-|\+)*\d+(\.\d+)*)$/) { + my ($deg, $num) = ($1, $2); + my ($in, $out) = ($num, $num); + if ($deg eq 'c') { + $deg = 'f'; + $out = &c2f($num); + } else { + $deg = 'c'; + $out = &f2c($num); + } + $out = sprintf('%0.2f', $out); + $out =~ s/^((\-|\+)*\d+)\.0+$/$1/; + print "$out $deg\n"; + } else { + print "Usage: $0 -[c|f] num\n"; + } + exit; + + sub f2c { + my $f = shift; + my $c = 5 * $f - 32 / 9; + return $c; + } + + sub c2f { + my $c = shift; + my $f = 9 * $c / 5 + 32; + return $f; + } + + +For some reason, the Fahrenheit to Celsius conversion fails to return the +expected output. This is what it does: + + > temp -c0.72 + 33.30 f + + > temp -f33.3 + 162.94 c + +Not very consistent! We'll set a breakpoint in the code manually and run it +under the debugger to see what's going on. 
A breakpoint is a flag, to which +the debugger will run without interruption, when it reaches the breakpoint, it +will stop execution and offer a prompt for further interaction. In normal +use, these debugger commands are completely ignored, and they are safe - if a +little messy, to leave in production code. + + my ($in, $out) = ($num, $num); + $DB::single=2; # insert at line 9! + if ($deg eq 'c') + ... + + > perl -d temp -f33.3 + Default die handler restored. + + Loading DB routines from perl5db.pl version 1.07 + Editor support available. + + Enter h or `h h' for help, or `man perldebug' for more help. + + main::(temp:4): my $arg = $ARGV[0] || '-c100'; + +We'll simply continue down to our pre-set breakpoint with a 'B<c>': + + DB<1> c + main::(temp:10): if ($deg eq 'c') { + +Followed by a window command to see where we are: + + DB<1> w + 7: my ($deg, $num) = ($1, $2); + 8: my ($in, $out) = ($num, $num); + 9: $DB::single=2; + 10==> if ($deg eq 'c') { + 11: $deg = 'f'; + 12: $out = &c2f($num); + 13 } else { + 14: $deg = 'c'; + 15: $out = &f2c($num); + 16 } + +And a print to show what values we're currently using: + + DB<1> p $deg, $num + f33.3 + +We can put another break point on any line beginning with a colon, we'll use +line 17 as that's just as we come out of the subroutine, and we'd like to +pause there later on: + + DB<2> b 17 + +There's no feedback from this, but you can see what breakpoints are set by +using the list 'L' command: + + DB<3> L + temp: + 17: print "$out $deg\n"; + break if (1) + +Note that to delete a breakpoint you use 'd' or 'D'. 
+ +Now we'll continue down into our subroutine, this time rather than by line +number, we'll use the subroutine name, followed by the now familiar 'w': + + DB<3> c f2c + main::f2c(temp:30): my $f = shift; + + DB<4> w + 24: exit; + 25 + 26 sub f2c { + 27==> my $f = shift; + 28: my $c = 5 * $f - 32 / 9; + 29: return $c; + 30 } + 31 + 32 sub c2f { + 33: my $c = shift; + + +Note that if there was a subroutine call between us and line 29, and we wanted +to B<single-step> through it, we could use the 'B<s>' command, and to step +over it we would use 'B<n>' which would execute the sub, but not descend into +it for inspection. In this case though, we simply continue down to line 29: + + DB<4> c 29 + main::f2c(temp:29): return $c; + +And have a look at the return value: + + DB<5> p $c + 162.944444444444 + +This is not the right answer at all, but the sum looks correct. I wonder if +it's anything to do with operator precedence? We'll try a couple of other +possibilities with our sum: + + DB<6> p (5 * $f - 32 / 9) + 162.944444444444 + + DB<7> p 5 * $f - (32 / 9) + 162.944444444444 + + DB<8> p (5 * $f) - 32 / 9 + 162.944444444444 + + DB<9> p 5 * ($f - 32) / 9 + 0.722222222222221 + +:-) that's more like it! Ok, now we can set our return variable and we'll +return out of the sub with an 'r': + + DB<10> $c = 5 * ($f - 32) / 9 + + DB<11> r + scalar context return from main::f2c: 0.722222222222221 + +Looks good, let's just continue off the end of the script: + + DB<12> c + 0.72 c + Debugged program terminated. Use q to quit or R to restart, + use O inhibit_exit to avoid stopping after program termination, + h q, h R or h O to get additional info. + +A quick fix to the offending line (insert the missing parentheses) in the +actual program and we're finished. + + +=head1 Placeholder for a, w, t, T + +Actions, watch variables, stack traces etc.: on the TODO list. + + a + + W + + t + + T + + +=head1 REGULAR EXPRESSIONS + +Ever wanted to know what a regex looked like? 
You'll need perl compiled with +the DEBUGGING flag for this one: + + > perl -Dr -e '/^pe(a)*rl$/i' + Compiling REx `^pe(a)*rl$' + size 17 first at 2 + rarest char + at 0 + 1: BOL(2) + 2: EXACTF <pe>(4) + 4: CURLYN[1] {0,32767}(14) + 6: NOTHING(8) + 8: EXACTF <a>(0) + 12: WHILEM(0) + 13: NOTHING(14) + 14: EXACTF <rl>(16) + 16: EOL(17) + 17: END(0) + floating `'$ at 4..2147483647 (checking floating) stclass `EXACTF <pe>' +anchored(BOL) minlen 4 + Omitting $` $& $' support. + + EXECUTING... + + Freeing REx: `^pe(a)*rl$' + +Did you really want to know? :-) +For more gory details on getting regular expressions to work, have a look at +L<perlre>, L<perlretut>, and to decode the mysterious labels (BOL and CURLYN, +etc. above), see L<perldebguts>. + + +=head1 OUTPUT TIPS + +To get all the output from your error log, and not miss any messages via +helpful operating system buffering, insert a line like this, at the start of +your script: + + $|=1; + +To watch the tail of a dynamically growing logfile, (from the command line): + + tail -f $error_log + +Wrapping all die calls in a handler routine can be useful to see how, and from +where, they're being called, L<perlvar> has more information: + + BEGIN { $SIG{__DIE__} = sub { require Carp; Carp::confess(@_) } } + +Various useful techniques for the redirection of STDOUT and STDERR filehandles +are explained in L<perlopentut> and L<perlfaq8>. + + +=head1 CGI + +Just a quick hint here for all those CGI programmers who can't figure out how +on earth to get past that 'waiting for input' prompt, when running their CGI +script from the command-line, try something like this: + + > perl -d my_cgi.pl -nodebug + +Of course L<CGI> and L<perlfaq9> will tell you more. + + +=head1 GUIs + +The command line interface is tightly integrated with an B<emacs> extension +and there's a B<vi> interface too. + +You don't have to do this all on the command line, though, there are a few GUI +options out there. 
The nice thing about these is you can wave a mouse over a +variable and a dump of its data will appear in an appropriate window, or in a +popup balloon, no more tiresome typing of 'x $varname' :-) + +In particular have a hunt around for the following: + +B<ptkdb> perlTK based wrapper for the built-in debugger + +B<ddd> data display debugger + +B<PerlDevKit> and B<PerlBuilder> are NT specific + +NB. (more info on these and others would be appreciated). + + +=head1 SUMMARY + +We've seen how to encourage good coding practices with B<use strict> and +B<-w>. We can run the perl debugger B<perl -d scriptname> to inspect your +data from within the perl debugger with the B<p> and B<x> commands. You can +walk through your code, set breakpoints with B<b> and step through that code +with B<s> or B<n>, continue with B<c> and return from a sub with B<r>. Fairly +intuitive stuff when you get down to it. + +There is of course lots more to find out about, this has just scratched the +surface. The best way to learn more is to use perldoc to find out more about +the language, to read the on-line help (L<perldebug> is probably the next +place to go), and of course, experiment. + + +=head1 SEE ALSO + +L<perldebug>, +L<perldebguts>, +L<perldiag>, +L<dprofpp>, +L<perlrun> + + +=head1 AUTHOR + +Richard Foley <richard@rfi.net> Copyright (c) 2000 + + +=head1 CONTRIBUTORS + +Various people have made helpful suggestions and contributions, in particular: + +Ronald J Kimball <rjk@linguist.dartmouth.edu> + +Hugo van der Sanden <hv@crypt0.demon.co.uk> + +Peter Scott <Peter@PSDT.com> + diff --git a/gnu/usr.bin/perl/pod/perlebcdic.pod b/gnu/usr.bin/perl/pod/perlebcdic.pod new file mode 100644 index 00000000000..12ea2f3ef4b --- /dev/null +++ b/gnu/usr.bin/perl/pod/perlebcdic.pod @@ -0,0 +1,1235 @@ +=head1 NAME + +perlebcdic - Considerations for running Perl on EBCDIC platforms + +=head1 DESCRIPTION + +An exploration of some of the issues facing Perl programmers +on EBCDIC based computers.
We do not cover localization, +internationalization, or multi byte character set issues (yet). + +Portions that are still incomplete are marked with XXX. + +=head1 COMMON CHARACTER CODE SETS + +=head2 ASCII + +The American Standard Code for Information Interchange is a set of +integers running from 0 to 127 (decimal) that imply character +interpretation by the display and other system(s) of computers. +The range 0..127 can be covered by setting the bits in a 7-bit binary +digit, hence the set is sometimes referred to as a "7-bit ASCII". +ASCII was described by the American National Standards Institute +document ANSI X3.4-1986. It was also described by ISO 646:1991 +(with localization for currency symbols). The full ASCII set is +given in the table below as the first 128 elements. Languages that +can be written adequately with the characters in ASCII include +English, Hawaiian, Indonesian, Swahili and some Native American +languages. + +There are many character sets that extend the range of integers +from 0..2**7-1 up to 2**8-1, or 8 bit bytes (octets if you prefer). +One common one is the ISO 8859-1 character set. + +=head2 ISO 8859 + +The ISO 8859-$n are a collection of character code sets from the +International Organization for Standardization (ISO) each of which +adds characters to the ASCII set that are typically found in European +languages many of which are based on the Roman, or Latin, alphabet. + +=head2 Latin 1 (ISO 8859-1) + +A particular 8-bit extension to ASCII that includes grave and acute +accented Latin characters. Languages that can employ ISO 8859-1 +include all the languages covered by ASCII as well as Afrikaans, +Albanian, Basque, Catalan, Danish, Faroese, Finnish, Norwegian, +Portuguese, Spanish, and Swedish. Dutch is covered albeit without +the ij ligature. French is covered too but without the oe ligature. +German can use ISO 8859-1 but must do so without German-style +quotation marks.
This set is based on Western European extensions +to ASCII and is commonly encountered in world wide web work. +In IBM character code set identification terminology ISO 8859-1 is +also known as CCSID 819 (or sometimes 0819 or even 00819). + +=head2 EBCDIC + +The Extended Binary Coded Decimal Interchange Code refers to a +large collection of slightly different single and multi byte +coded character sets that are different from ASCII or ISO 8859-1 +and typically run on host computers. The EBCDIC encodings derive +from 8 bit byte extensions of Hollerith punched card encodings. +The layout on the cards was such that high bits were set for the +upper and lower case alphabet characters [a-z] and [A-Z], but there +were gaps within each latin alphabet range. + +Some IBM EBCDIC character sets may be known by character code set +identification numbers (CCSID numbers) or code page numbers. Leading +zero digits in CCSID numbers within this document are insignificant. +E.g. CCSID 0037 may be referred to as 37 in places. + +=head2 13 variant characters + +Among IBM EBCDIC character code sets there are 13 characters that +are often mapped to different integer values. Those characters +are known as the 13 "variant" characters and are: + + \ [ ] { } ^ ~ ! # | $ @ ` + +=head2 0037 + +Character code set ID 0037 is a mapping of the ASCII plus Latin-1 +characters (i.e. ISO 8859-1) to an EBCDIC set. 0037 is used +in North American English locales on the OS/400 operating system +that runs on AS/400 computers. CCSID 37 differs from ISO 8859-1 +in 237 places, in other words they agree on only 19 code point values. + +=head2 1047 + +Character code set ID 1047 is also a mapping of the ASCII plus +Latin-1 characters (i.e. ISO 8859-1) to an EBCDIC set. 1047 is +used under Unix System Services for OS/390, and OpenEdition for VM/ESA. +CCSID 1047 differs from CCSID 0037 in eight places. + +=head2 POSIX-BC + +The EBCDIC code page in use on Siemens' BS2000 system is distinct from +1047 and 0037. 
It is identified below as the POSIX-BC set. + +=head1 SINGLE OCTET TABLES + +The following tables list the ASCII and Latin 1 ordered sets including +the subsets: C0 controls (0..31), ASCII graphics (32..7e), delete (7f), +C1 controls (80..9f), and Latin-1 (a.k.a. ISO 8859-1) (a0..ff). In the +table non-printing control character names as well as the Latin 1 +extensions to ASCII have been labelled with character names roughly +corresponding to I<The Unicode Standard, Version 2.0> albeit with +substitutions such as s/LATIN// and s/VULGAR// in all cases, +s/CAPITAL LETTER// in some cases, and s/SMALL LETTER ([A-Z])/\l$1/ +in some other cases (the C<charnames> pragma names unfortunately do +not list explicit names for the C0 or C1 control characters). The +"names" of the C1 control set (128..159 in ISO 8859-1) listed here are +somewhat arbitrary. The differences between the 0037 and 1047 sets are +flagged with ***. The differences between the 1047 and POSIX-BC sets +are flagged with ###. All ord() numbers listed are decimal. 
If you +would rather see this table listing octal values then run the table +(that is, the pod version of this document since this recipe may not +work with a pod2_other_format translation) through: + +=over 4 + +=item recipe 0 + +=back + + perl -ne 'if(/(.{33})(\d+)\s+(\d+)\s+(\d+)\s+(\d+)/)' \ + -e '{printf("%s%-9o%-9o%-9o%-9o\n",$1,$2,$3,$4,$5)}' perlebcdic.pod + +If you would rather see this table listing hexadecimal values then +run the table through: + +=over 4 + +=item recipe 1 + +=back + + perl -ne 'if(/(.{33})(\d+)\s+(\d+)\s+(\d+)\s+(\d+)/)' \ + -e '{printf("%s%-9X%-9X%-9X%-9X\n",$1,$2,$3,$4,$5)}' perlebcdic.pod + + + 8859-1 + chr 0819 0037 1047 POSIX-BC + ---------------------------------------------------------------- + <NULL> 0 0 0 0 + <START OF HEADING> 1 1 1 1 + <START OF TEXT> 2 2 2 2 + <END OF TEXT> 3 3 3 3 + <END OF TRANSMISSION> 4 55 55 55 + <ENQUIRY> 5 45 45 45 + <ACKNOWLEDGE> 6 46 46 46 + <BELL> 7 47 47 47 + <BACKSPACE> 8 22 22 22 + <HORIZONTAL TABULATION> 9 5 5 5 + <LINE FEED> 10 37 21 21 *** + <VERTICAL TABULATION> 11 11 11 11 + <FORM FEED> 12 12 12 12 + <CARRIAGE RETURN> 13 13 13 13 + <SHIFT OUT> 14 14 14 14 + <SHIFT IN> 15 15 15 15 + <DATA LINK ESCAPE> 16 16 16 16 + <DEVICE CONTROL ONE> 17 17 17 17 + <DEVICE CONTROL TWO> 18 18 18 18 + <DEVICE CONTROL THREE> 19 19 19 19 + <DEVICE CONTROL FOUR> 20 60 60 60 + <NEGATIVE ACKNOWLEDGE> 21 61 61 61 + <SYNCHRONOUS IDLE> 22 50 50 50 + <END OF TRANSMISSION BLOCK> 23 38 38 38 + <CANCEL> 24 24 24 24 + <END OF MEDIUM> 25 25 25 25 + <SUBSTITUTE> 26 63 63 63 + <ESCAPE> 27 39 39 39 + <FILE SEPARATOR> 28 28 28 28 + <GROUP SEPARATOR> 29 29 29 29 + <RECORD SEPARATOR> 30 30 30 30 + <UNIT SEPARATOR> 31 31 31 31 + <SPACE> 32 64 64 64 + ! 33 90 90 90 + " 34 127 127 127 + # 35 123 123 123 + $ 36 91 91 91 + % 37 108 108 108 + & 38 80 80 80 + ' 39 125 125 125 + ( 40 77 77 77 + ) 41 93 93 93 + * 42 92 92 92 + + 43 78 78 78 + , 44 107 107 107 + - 45 96 96 96 + . 
46 75 75 75 + / 47 97 97 97 + 0 48 240 240 240 + 1 49 241 241 241 + 2 50 242 242 242 + 3 51 243 243 243 + 4 52 244 244 244 + 5 53 245 245 245 + 6 54 246 246 246 + 7 55 247 247 247 + 8 56 248 248 248 + 9 57 249 249 249 + : 58 122 122 122 + ; 59 94 94 94 + < 60 76 76 76 + = 61 126 126 126 + > 62 110 110 110 + ? 63 111 111 111 + @ 64 124 124 124 + A 65 193 193 193 + B 66 194 194 194 + C 67 195 195 195 + D 68 196 196 196 + E 69 197 197 197 + F 70 198 198 198 + G 71 199 199 199 + H 72 200 200 200 + I 73 201 201 201 + J 74 209 209 209 + K 75 210 210 210 + L 76 211 211 211 + M 77 212 212 212 + N 78 213 213 213 + O 79 214 214 214 + P 80 215 215 215 + Q 81 216 216 216 + R 82 217 217 217 + S 83 226 226 226 + T 84 227 227 227 + U 85 228 228 228 + V 86 229 229 229 + W 87 230 230 230 + X 88 231 231 231 + Y 89 232 232 232 + Z 90 233 233 233 + [ 91 186 173 187 *** ### + \ 92 224 224 188 ### + ] 93 187 189 189 *** + ^ 94 176 95 106 *** ### + _ 95 109 109 109 + ` 96 121 121 74 ### + a 97 129 129 129 + b 98 130 130 130 + c 99 131 131 131 + d 100 132 132 132 + e 101 133 133 133 + f 102 134 134 134 + g 103 135 135 135 + h 104 136 136 136 + i 105 137 137 137 + j 106 145 145 145 + k 107 146 146 146 + l 108 147 147 147 + m 109 148 148 148 + n 110 149 149 149 + o 111 150 150 150 + p 112 151 151 151 + q 113 152 152 152 + r 114 153 153 153 + s 115 162 162 162 + t 116 163 163 163 + u 117 164 164 164 + v 118 165 165 165 + w 119 166 166 166 + x 120 167 167 167 + y 121 168 168 168 + z 122 169 169 169 + { 123 192 192 251 ### + | 124 79 79 79 + } 125 208 208 253 ### + ~ 126 161 161 255 ### + <DELETE> 127 7 7 7 + <C1 0> 128 32 32 32 + <C1 1> 129 33 33 33 + <C1 2> 130 34 34 34 + <C1 3> 131 35 35 35 + <C1 4> 132 36 36 36 + <C1 5> 133 21 37 37 *** + <C1 6> 134 6 6 6 + <C1 7> 135 23 23 23 + <C1 8> 136 40 40 40 + <C1 9> 137 41 41 41 + <C1 10> 138 42 42 42 + <C1 11> 139 43 43 43 + <C1 12> 140 44 44 44 + <C1 13> 141 9 9 9 + <C1 14> 142 10 10 10 + <C1 15> 143 27 27 27 + <C1 16> 144 48 48 48 + <C1 17> 145 
49 49 49 + <C1 18> 146 26 26 26 + <C1 19> 147 51 51 51 + <C1 20> 148 52 52 52 + <C1 21> 149 53 53 53 + <C1 22> 150 54 54 54 + <C1 23> 151 8 8 8 + <C1 24> 152 56 56 56 + <C1 25> 153 57 57 57 + <C1 26> 154 58 58 58 + <C1 27> 155 59 59 59 + <C1 28> 156 4 4 4 + <C1 29> 157 20 20 20 + <C1 30> 158 62 62 62 + <C1 31> 159 255 255 95 ### + <NON-BREAKING SPACE> 160 65 65 65 + <INVERTED EXCLAMATION MARK> 161 170 170 170 + <CENT SIGN> 162 74 74 176 ### + <POUND SIGN> 163 177 177 177 + <CURRENCY SIGN> 164 159 159 159 + <YEN SIGN> 165 178 178 178 + <BROKEN BAR> 166 106 106 208 ### + <SECTION SIGN> 167 181 181 181 + <DIAERESIS> 168 189 187 121 *** ### + <COPYRIGHT SIGN> 169 180 180 180 + <FEMININE ORDINAL INDICATOR> 170 154 154 154 + <LEFT POINTING GUILLEMET> 171 138 138 138 + <NOT SIGN> 172 95 176 186 *** ### + <SOFT HYPHEN> 173 202 202 202 + <REGISTERED TRADE MARK SIGN> 174 175 175 175 + <MACRON> 175 188 188 161 ### + <DEGREE SIGN> 176 144 144 144 + <PLUS-OR-MINUS SIGN> 177 143 143 143 + <SUPERSCRIPT TWO> 178 234 234 234 + <SUPERSCRIPT THREE> 179 250 250 250 + <ACUTE ACCENT> 180 190 190 190 + <MICRO SIGN> 181 160 160 160 + <PARAGRAPH SIGN> 182 182 182 182 + <MIDDLE DOT> 183 179 179 179 + <CEDILLA> 184 157 157 157 + <SUPERSCRIPT ONE> 185 218 218 218 + <MASC. 
ORDINAL INDICATOR> 186 155 155 155 + <RIGHT POINTING GUILLEMET> 187 139 139 139 + <FRACTION ONE QUARTER> 188 183 183 183 + <FRACTION ONE HALF> 189 184 184 184 + <FRACTION THREE QUARTERS> 190 185 185 185 + <INVERTED QUESTION MARK> 191 171 171 171 + <A WITH GRAVE> 192 100 100 100 + <A WITH ACUTE> 193 101 101 101 + <A WITH CIRCUMFLEX> 194 98 98 98 + <A WITH TILDE> 195 102 102 102 + <A WITH DIAERESIS> 196 99 99 99 + <A WITH RING ABOVE> 197 103 103 103 + <CAPITAL LIGATURE AE> 198 158 158 158 + <C WITH CEDILLA> 199 104 104 104 + <E WITH GRAVE> 200 116 116 116 + <E WITH ACUTE> 201 113 113 113 + <E WITH CIRCUMFLEX> 202 114 114 114 + <E WITH DIAERESIS> 203 115 115 115 + <I WITH GRAVE> 204 120 120 120 + <I WITH ACUTE> 205 117 117 117 + <I WITH CIRCUMFLEX> 206 118 118 118 + <I WITH DIAERESIS> 207 119 119 119 + <CAPITAL LETTER ETH> 208 172 172 172 + <N WITH TILDE> 209 105 105 105 + <O WITH GRAVE> 210 237 237 237 + <O WITH ACUTE> 211 238 238 238 + <O WITH CIRCUMFLEX> 212 235 235 235 + <O WITH TILDE> 213 239 239 239 + <O WITH DIAERESIS> 214 236 236 236 + <MULTIPLICATION SIGN> 215 191 191 191 + <O WITH STROKE> 216 128 128 128 + <U WITH GRAVE> 217 253 253 224 ### + <U WITH ACUTE> 218 254 254 254 + <U WITH CIRCUMFLEX> 219 251 251 221 ### + <U WITH DIAERESIS> 220 252 252 252 + <Y WITH ACUTE> 221 173 186 173 *** ### + <CAPITAL LETTER THORN> 222 174 174 174 + <SMALL LETTER SHARP S> 223 89 89 89 + <a WITH GRAVE> 224 68 68 68 + <a WITH ACUTE> 225 69 69 69 + <a WITH CIRCUMFLEX> 226 66 66 66 + <a WITH TILDE> 227 70 70 70 + <a WITH DIAERESIS> 228 67 67 67 + <a WITH RING ABOVE> 229 71 71 71 + <SMALL LIGATURE ae> 230 156 156 156 + <c WITH CEDILLA> 231 72 72 72 + <e WITH GRAVE> 232 84 84 84 + <e WITH ACUTE> 233 81 81 81 + <e WITH CIRCUMFLEX> 234 82 82 82 + <e WITH DIAERESIS> 235 83 83 83 + <i WITH GRAVE> 236 88 88 88 + <i WITH ACUTE> 237 85 85 85 + <i WITH CIRCUMFLEX> 238 86 86 86 + <i WITH DIAERESIS> 239 87 87 87 + <SMALL LETTER eth> 240 140 140 140 + <n WITH TILDE> 241 73 73 73 + <o WITH 
GRAVE> 242 205 205 205 + <o WITH ACUTE> 243 206 206 206 + <o WITH CIRCUMFLEX> 244 203 203 203 + <o WITH TILDE> 245 207 207 207 + <o WITH DIAERESIS> 246 204 204 204 + <DIVISION SIGN> 247 225 225 225 + <o WITH STROKE> 248 112 112 112 + <u WITH GRAVE> 249 221 221 192 ### + <u WITH ACUTE> 250 222 222 222 + <u WITH CIRCUMFLEX> 251 219 219 219 + <u WITH DIAERESIS> 252 220 220 220 + <y WITH ACUTE> 253 141 141 141 + <SMALL LETTER thorn> 254 142 142 142 + <y WITH DIAERESIS> 255 223 223 223 + +If you would rather see the above table in CCSID 0037 order rather than +ASCII + Latin-1 order then run the table through: + +=over 4 + +=item recipe 2 + +=back + + perl -ne 'if(/.{33}\d{1,3}\s{6,8}\d{1,3}\s{6,8}\d{1,3}\s{6,8}\d{1,3}/)'\ + -e '{push(@l,$_)}' \ + -e 'END{print map{$_->[0]}' \ + -e ' sort{$a->[1] <=> $b->[1]}' \ + -e ' map{[$_,substr($_,42,3)]}@l;}' perlebcdic.pod + +If you would rather see it in CCSID 1047 order then change the digit +42 in the last line to 51, like this: + +=over 4 + +=item recipe 3 + +=back + + perl -ne 'if(/.{33}\d{1,3}\s{6,8}\d{1,3}\s{6,8}\d{1,3}\s{6,8}\d{1,3}/)'\ + -e '{push(@l,$_)}' \ + -e 'END{print map{$_->[0]}' \ + -e ' sort{$a->[1] <=> $b->[1]}' \ + -e ' map{[$_,substr($_,51,3)]}@l;}' perlebcdic.pod + +If you would rather see it in POSIX-BC order then change the digit +51 in the last line to 60, like this: + +=over 4 + +=item recipe 4 + +=back + + perl -ne 'if(/.{33}\d{1,3}\s{6,8}\d{1,3}\s{6,8}\d{1,3}\s{6,8}\d{1,3}/)'\ + -e '{push(@l,$_)}' \ + -e 'END{print map{$_->[0]}' \ + -e ' sort{$a->[1] <=> $b->[1]}' \ + -e ' map{[$_,substr($_,60,3)]}@l;}' perlebcdic.pod + + +=head1 IDENTIFYING CHARACTER CODE SETS + +To determine the character set you are running under from perl one +could use the return value of ord() or chr() to test one or more +character values. 
For example: + + $is_ascii = "A" eq chr(65); + $is_ebcdic = "A" eq chr(193); + +Also, "\t" is a C<HORIZONTAL TABULATION> character so that: + + $is_ascii = ord("\t") == 9; + $is_ebcdic = ord("\t") == 5; + +To distinguish EBCDIC code pages try looking at one or more of +the characters that differ between them. For example: + + $is_ebcdic_37 = "\n" eq chr(37); + $is_ebcdic_1047 = "\n" eq chr(21); + +Or better still choose a character that is uniquely encoded in any +of the code sets, e.g.: + + $is_ascii = ord('[') == 91; + $is_ebcdic_37 = ord('[') == 186; + $is_ebcdic_1047 = ord('[') == 173; + $is_ebcdic_POSIX_BC = ord('[') == 187; + +However, it would be unwise to write tests such as: + + $is_ascii = "\r" ne chr(13); # WRONG + $is_ascii = "\n" ne chr(10); # ILL ADVISED + +Obviously the first of these will fail to distinguish most ASCII machines +from either a CCSID 0037, a 1047, or a POSIX-BC EBCDIC machine since "\r" eq +chr(13) under all of those coded character sets. But note too that +because "\n" is chr(13) and "\r" is chr(10) on the Macintosh (which is an +ASCII machine) the second C<$is_ascii> test will lead to trouble there. + +To determine whether or not perl was built under an EBCDIC +code page you can use the Config module like so: + + use Config; + $is_ebcdic = $Config{'ebcdic'} eq 'define'; + +=head1 CONVERSIONS + +=head2 tr/// + +In order to convert a string of characters from one character set to +another a simple list of numbers, such as in the right columns in the +above table, along with perl's tr/// operator is all that is needed. +The data in the table are in ASCII order hence the EBCDIC columns +provide easy to use ASCII to EBCDIC operations that are also easily +reversed. + +For example, to convert ASCII to code page 037 take the output of the second +column from the output of recipe 0 (modified to add \\ characters) and use +it in tr/// like so: + + $cp_037 = + '\000\001\002\003\234\011\206\177\227\215\216\013\014\015\016\017' .
+ '\020\021\022\023\235\205\010\207\030\031\222\217\034\035\036\037' . + '\200\201\202\203\204\012\027\033\210\211\212\213\214\005\006\007' . + '\220\221\026\223\224\225\226\004\230\231\232\233\024\025\236\032' . + '\040\240\342\344\340\341\343\345\347\361\242\056\074\050\053\174' . + '\046\351\352\353\350\355\356\357\354\337\041\044\052\051\073\254' . + '\055\057\302\304\300\301\303\305\307\321\246\054\045\137\076\077' . + '\370\311\312\313\310\315\316\317\314\140\072\043\100\047\075\042' . + '\330\141\142\143\144\145\146\147\150\151\253\273\360\375\376\261' . + '\260\152\153\154\155\156\157\160\161\162\252\272\346\270\306\244' . + '\265\176\163\164\165\166\167\170\171\172\241\277\320\335\336\256' . + '\136\243\245\267\251\247\266\274\275\276\133\135\257\250\264\327' . + '\173\101\102\103\104\105\106\107\110\111\255\364\366\362\363\365' . + '\175\112\113\114\115\116\117\120\121\122\271\373\374\371\372\377' . + '\134\367\123\124\125\126\127\130\131\132\262\324\326\322\323\325' . + '\060\061\062\063\064\065\066\067\070\071\263\333\334\331\332\237' ; + + my $ebcdic_string = $ascii_string; + eval '$ebcdic_string =~ tr/\000-\377/' . $cp_037 . '/'; + +To convert from EBCDIC 037 to ASCII just reverse the order of the tr/// +arguments like so: + + my $ascii_string = $ebcdic_string; + eval '$ascii_string =~ tr/' . $cp_037 . '/\000-\377/'; + +Similarly one could take the output of the third column from recipe 0 to +obtain a C<$cp_1047> table. The fourth column of the output from recipe +0 could provide a C<$cp_posix_bc> table suitable for transcoding as well. + +=head2 iconv + +XPG operability often implies the presence of an I<iconv> utility +available from the shell or from the C library. Consult your system's +documentation for information on iconv. + +On OS/390 see the iconv(1) man page.
One way to invoke the iconv +shell utility from within perl would be to: + + # OS/390 example + $ascii_data = `echo '$ebcdic_data'| iconv -f IBM-1047 -t ISO8859-1` + +or the inverse map: + + # OS/390 example + $ebcdic_data = `echo '$ascii_data'| iconv -f ISO8859-1 -t IBM-1047` + +For other perl based conversion options see the Convert::* modules on CPAN. + +=head2 C RTL + +The OS/390 C run time library provides _atoe() and _etoa() functions. + +=head1 OPERATOR DIFFERENCES + +The C<..> range operator treats certain character ranges with +care on EBCDIC machines. For example the following array +will have twenty six elements on either an EBCDIC machine +or an ASCII machine: + + @alphabet = ('A'..'Z'); # $#alphabet == 25 + +The bitwise operators such as & ^ | may return different results +when operating on string or character data in a perl program running +on an EBCDIC machine than when run on an ASCII machine. Here is +an example adapted from the one in L<perlop>: + + # EBCDIC-based examples + print "j p \n" ^ " a h"; # prints "JAPH\n" + print "JA" | " ph\n"; # prints "japh\n" + print "JAPH\nJunk" & "\277\277\277\277\277"; # prints "japh\n"; + print 'p N$' ^ " E<H\n"; # prints "Perl\n"; + +An interesting property of the 32 C0 control characters +in the ASCII table is that they can "literally" be constructed +as control characters in perl, e.g. C<(chr(0) eq "\c@")> +C<(chr(1) eq "\cA")>, and so on. Perl on EBCDIC machines has been +ported to take "\c@" to chr(0) and "\cA" to chr(1) as well, but the +thirty three characters that result depend on which code page you are +using. The table below uses the character names from the previous table +but with substitutions such as s/START OF/S.O./; s/END OF /E.O./; +s/TRANSMISSION/TRANS./; s/TABULATION/TAB./; s/VERTICAL/VERT./; +s/HORIZONTAL/HORIZ./; s/DEVICE CONTROL/D.C./; s/SEPARATOR/SEP./; +s/NEGATIVE ACKNOWLEDGE/NEG. ACK./;. 
The POSIX-BC and 1047 sets are +identical throughout this range and differ from the 0037 set at only +one spot (21 decimal). Note that the C<LINE FEED> character +may be generated by "\cJ" on ASCII machines but by "\cU" on 1047 or POSIX-BC +machines and cannot be generated as a C<"\c.letter."> control character on +0037 machines. Note also that "\c\\" maps to two characters +not one. + + chr ord 8859-1 0037 1047 && POSIX-BC + ------------------------------------------------------------------------ + "\c?" 127 <DELETE> " " ***>< + "\c@" 0 <NULL> <NULL> <NULL> ***>< + "\cA" 1 <S.O. HEADING> <S.O. HEADING> <S.O. HEADING> + "\cB" 2 <S.O. TEXT> <S.O. TEXT> <S.O. TEXT> + "\cC" 3 <E.O. TEXT> <E.O. TEXT> <E.O. TEXT> + "\cD" 4 <E.O. TRANS.> <C1 28> <C1 28> + "\cE" 5 <ENQUIRY> <HORIZ. TAB.> <HORIZ. TAB.> + "\cF" 6 <ACKNOWLEDGE> <C1 6> <C1 6> + "\cG" 7 <BELL> <DELETE> <DELETE> + "\cH" 8 <BACKSPACE> <C1 23> <C1 23> + "\cI" 9 <HORIZ. TAB.> <C1 13> <C1 13> + "\cJ" 10 <LINE FEED> <C1 14> <C1 14> + "\cK" 11 <VERT. TAB.> <VERT. TAB.> <VERT. TAB.> + "\cL" 12 <FORM FEED> <FORM FEED> <FORM FEED> + "\cM" 13 <CARRIAGE RETURN> <CARRIAGE RETURN> <CARRIAGE RETURN> + "\cN" 14 <SHIFT OUT> <SHIFT OUT> <SHIFT OUT> + "\cO" 15 <SHIFT IN> <SHIFT IN> <SHIFT IN> + "\cP" 16 <DATA LINK ESCAPE> <DATA LINK ESCAPE> <DATA LINK ESCAPE> + "\cQ" 17 <D.C. ONE> <D.C. ONE> <D.C. ONE> + "\cR" 18 <D.C. TWO> <D.C. TWO> <D.C. TWO> + "\cS" 19 <D.C. THREE> <D.C. THREE> <D.C. THREE> + "\cT" 20 <D.C. FOUR> <C1 29> <C1 29> + "\cU" 21 <NEG. ACK.> <C1 5> <LINE FEED> *** + "\cV" 22 <SYNCHRONOUS IDLE> <BACKSPACE> <BACKSPACE> + "\cW" 23 <E.O. TRANS. BLOCK> <C1 7> <C1 7> + "\cX" 24 <CANCEL> <CANCEL> <CANCEL> + "\cY" 25 <E.O. MEDIUM> <E.O. MEDIUM> <E.O. 
MEDIUM> + "\cZ" 26 <SUBSTITUTE> <C1 18> <C1 18> + "\c[" 27 <ESCAPE> <C1 15> <C1 15> + "\c\\" 28 <FILE SEP.>\ <FILE SEP.>\ <FILE SEP.>\ + "\c]" 29 <GROUP SEP.> <GROUP SEP.> <GROUP SEP.> + "\c^" 30 <RECORD SEP.> <RECORD SEP.> <RECORD SEP.> ***>< + "\c_" 31 <UNIT SEP.> <UNIT SEP.> <UNIT SEP.> ***>< + + +=head1 FUNCTION DIFFERENCES + +=over 8 + +=item chr() + +chr() must be given an EBCDIC code number argument to yield a desired +character return value on an EBCDIC machine. For example: + + $CAPITAL_LETTER_A = chr(193); + +=item ord() + +ord() will return EBCDIC code number values on an EBCDIC machine. +For example: + + $the_number_193 = ord("A"); + +=item pack() + +The c and C templates for pack() are dependent upon character set +encoding. Examples of usage on EBCDIC include: + + $foo = pack("CCCC",193,194,195,196); + # $foo eq "ABCD" + $foo = pack("C4",193,194,195,196); + # same thing + + $foo = pack("ccxxcc",193,194,195,196); + # $foo eq "AB\0\0CD" + +=item print() + +One must be careful with scalars and strings that are passed to +print that contain ASCII encodings. One common place +for this to occur is in the output of the MIME type header for +CGI script writing. For example, many perl programming guides +recommend something similar to: + + print "Content-type:\ttext/html\015\012\015\012"; + # this may be wrong on EBCDIC + +Under the IBM OS/390 USS Web Server for example you should instead +write that as: + + print "Content-type:\ttext/html\r\n\r\n"; # OK for DGW et alia + +That is because the translation from EBCDIC to ASCII is done +by the web server in this case (such code will not be appropriate for +the Macintosh however). Consult your web server's documentation for +further details. + +=item printf() + +The formats that can convert characters to numbers and vice versa +will be different from their ASCII counterparts when executed +on an EBCDIC machine. 
Examples include: + + printf("%c%c%c",193,194,195); # prints ABC + +=item sort() + +EBCDIC sort results may differ from ASCII sort results especially for +mixed case strings. This is discussed in more detail below. + +=item sprintf() + +See the discussion of printf() above. An example of the use +of sprintf would be: + + $CAPITAL_LETTER_A = sprintf("%c",193); + +=item unpack() + +See the discussion of pack() above. + +=back + +=head1 REGULAR EXPRESSION DIFFERENCES + +As of perl 5.005_03 the letter range regular expression such as +[A-Z] and [a-z] have been especially coded to not pick up gap +characters. For example, characters such as E<ocirc> C<o WITH CIRCUMFLEX> +that lie between I and J would not be matched by the +regular expression range C</[H-K]/>. + +If you do want to match the alphabet gap characters in a single octet +regular expression try matching the hex or octal code such +as C</\313/> on EBCDIC or C</\364/> on ASCII machines to +have your regular expression match C<o WITH CIRCUMFLEX>. + +Another construct to be wary of is the inappropriate use of hex or +octal constants in regular expressions. Consider the following +set of subs: + + sub is_c0 { + my $char = substr(shift,0,1); + $char =~ /[\000-\037]/; + } + + sub is_print_ascii { + my $char = substr(shift,0,1); + $char =~ /[\040-\176]/; + } + + sub is_delete { + my $char = substr(shift,0,1); + $char eq "\177"; + } + + sub is_c1 { + my $char = substr(shift,0,1); + $char =~ /[\200-\237]/; + } + + sub is_latin_1 { + my $char = substr(shift,0,1); + $char =~ /[\240-\377]/; + } + +The above would be adequate if the concern was only with numeric code points. +However, the concern may be with characters rather than code points +and on an EBCDIC machine it may be desirable for constructs such as +C<if (is_print_ascii("A")) {print "A is a printable character\n";}> to print +out the expected message. 
One way to represent the above collection +of character classification subs that is capable of working across the +four coded character sets discussed in this document is as follows: + + sub Is_c0 { + my $char = substr(shift,0,1); + if (ord('^')==94) { # ascii + return $char =~ /[\000-\037]/; + } + if (ord('^')==176) { # 37 + return $char =~ /[\000-\003\067\055-\057\026\005\045\013-\023\074\075\062\046\030\031\077\047\034-\037]/; + } + if (ord('^')==95 || ord('^')==106) { # 1047 || posix-bc + return $char =~ /[\000-\003\067\055-\057\026\005\025\013-\023\074\075\062\046\030\031\077\047\034-\037]/; + } + } + + sub Is_print_ascii { + my $char = substr(shift,0,1); + $char =~ /[ !"\#\$%&'()*+,\-.\/0-9:;<=>?\@A-Z[\\\]^_`a-z{|}~]/; + } + + sub Is_delete { + my $char = substr(shift,0,1); + if (ord('^')==94) { # ascii + return $char eq "\177"; + } + else { # ebcdic + return $char eq "\007"; + } + } + + sub Is_c1 { + my $char = substr(shift,0,1); + if (ord('^')==94) { # ascii + return $char =~ /[\200-\237]/; + } + if (ord('^')==176) { # 37 + return $char =~ /[\040-\044\025\006\027\050-\054\011\012\033\060\061\032\063-\066\010\070-\073\004\024\076\377]/; + } + if (ord('^')==95) { # 1047 + return $char =~ /[\040-\045\006\027\050-\054\011\012\033\060\061\032\063-\066\010\070-\073\004\024\076\377]/; + } + if (ord('^')==106) { # posix-bc + return $char =~ + /[\040-\045\006\027\050-\054\011\012\033\060\061\032\063-\066\010\070-\073\004\024\076\137]/; + } + } + + sub Is_latin_1 { + my $char = substr(shift,0,1); + if (ord('^')==94) { # ascii + return $char =~ /[\240-\377]/; + } + if (ord('^')==176) { # 37 + return $char =~ +
/[\101\252\112\261\237\262\152\265\275\264\232\212\137\312\257\274\220\217\352\372\276\240\266\263\235\332\233\213\267\270\271\253\144\145\142\146\143\147\236\150\164\161-\163\170\165-\167\254\151\355\356\353\357\354\277\200\375\376\373\374\255\256\131\104\105\102\106\103\107\234\110\124\121-\123\130\125-\127\214\111\315\316\313\317\314\341\160\335\336\333\334\215\216\337]/; + } + if (ord('^')==95) { # 1047 + return $char =~ + /[\101\252\112\261\237\262\152\265\273\264\232\212\260\312\257\274\220\217\352\372\276\240\266\263\235\332\233\213\267\270\271\253\144\145\142\146\143\147\236\150\164\161-\163\170\165-\167\254\151\355\356\353\357\354\277\200\375\376\373\374\272\256\131\104\105\102\106\103\107\234\110\124\121-\123\130\125-\127\214\111\315\316\313\317\314\341\160\335\336\333\334\215\216\337]/; + } + if (ord('^')==106) { # posix-bc + return $char =~ + /[\101\252\260\261\237\262\320\265\171\264\232\212\272\312\257\241\220\217\352\372\276\240\266\263\235\332\233\213\267\270\271\253\144\145\142\146\143\147\236\150\164\161-\163\170\165-\167\254\151\355\356\353\357\354\277\200\340\376\335\374\255\256\131\104\105\102\106\103\107\234\110\124\121-\123\130\125-\127\214\111\315\316\313\317\314\341\160\300\336\333\334\215\216\337]/; + } + } + +Note however that only the C<Is_ascii_print()> sub is really independent +of coded character set. Another way to write C<Is_latin_1()> would be +to use the characters in the range explicitly: + + sub Is_latin_1 { + my $char = substr(shift,0,1); + $char =~ /[ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ]/; + } + +Although that form may run into trouble in network transit (due to the +presence of 8 bit characters) or on non ISO-Latin character sets. + +=head1 SOCKETS + +Most socket programming assumes ASCII character encodings in network +byte order. Exceptions can include CGI script writing under a +host web server where the server may take care of translation for you. 
+Most host web servers convert EBCDIC data to ISO-8859-1 or Unicode on +output. + +=head1 SORTING + +One big difference between ASCII based character sets and EBCDIC ones +is the relative positions of upper and lower case letters and the +letters compared to the digits. If sorted on an ASCII based machine the +two letter abbreviation for a physician comes before the two letter +abbreviation for drive, that is: + + @sorted = sort(qw(Dr. dr.)); # @sorted holds ('Dr.','dr.') on ASCII, + # but ('dr.','Dr.') on EBCDIC + +The property of lower case before uppercase letters in EBCDIC is +even carried to the Latin 1 EBCDIC pages such as 0037 and 1047. +An example would be that E<Euml> C<E WITH DIAERESIS> (203) comes +before E<euml> C<e WITH DIAERESIS> (235) on an ASCII machine, but +the latter (83) comes before the former (115) on an EBCDIC machine. +(Astute readers will note that the upper case version of E<szlig> +C<SMALL LETTER SHARP S> is simply "SS" and that the upper case version of +E<yuml> C<y WITH DIAERESIS> is not in the 0..255 range but it is +at U+0178 in Unicode, or C<"\x{178}"> in a Unicode enabled Perl). + +The sort order will cause differences between results obtained on +ASCII machines versus EBCDIC machines. What follows are some suggestions +on how to deal with these differences. + +=head2 Ignore ASCII vs. EBCDIC sort differences. + +This is the least computationally expensive strategy. It may require +some user education. + +=head2 MONO CASE then sort data. + +In order to minimize the expense of mono casing mixed text try to +C<tr///> towards the character set case most employed within the data. +If the data are primarily UPPERCASE non Latin 1 then apply tr/[a-z]/[A-Z]/ +then sort(). If the data are primarily lowercase non Latin 1 then +apply tr/[A-Z]/[a-z]/ before sorting.
If the data are primarily UPPERCASE +and include Latin-1 characters then apply: + + tr/[a-z]/[A-Z]/; + tr/[àáâãäåæçèéêëìíîïðñòóôõöøùúûüýþ]/[ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ]/; + s/ß/SS/g; + +then sort(). Do note however that such Latin-1 manipulation does not +address the E<yuml> C<y WITH DIAERESIS> character that will remain at +code point 255 on ASCII machines, but 223 on most EBCDIC machines +where it will sort to a place less than the EBCDIC numerals. With a +Unicode enabled Perl you might try: + + tr/ÿ/\x{178}/; + +The strategy of mono casing data before sorting does not preserve the case +of the data and may not be acceptable for that reason. + +=head2 Convert, sort data, then re convert. + +This is the most expensive proposition that does not employ a network +connection. + +=head2 Perform sorting on one type of machine only. + +This strategy can employ a network connection. As such +it would be computationally expensive. + +=head1 TRANSFORMATION FORMATS + +There are a variety of ways of transforming data with an intra character set +mapping that serve a variety of purposes. Sorting was discussed in the +previous section and a few of the other more popular mapping techniques are +discussed next. + +=head2 URL decoding and encoding + +Note that some URLs have hexadecimal ASCII code points in them in an +attempt to overcome character or protocol limitation issues. For example +the tilde character is not on every keyboard hence a URL of the form: + + http://www.pvhp.com/~pvhp/ + +may also be expressed as either of: + + http://www.pvhp.com/%7Epvhp/ + + http://www.pvhp.com/%7epvhp/ + +where 7E is the hexadecimal ASCII code point for '~'.
Here is an example +of decoding such a URL under CCSID 1047: + + $url = 'http://www.pvhp.com/%7Epvhp/'; + # this array assumes code page 1047 + my @a2e_1047 = ( + 0, 1, 2, 3, 55, 45, 46, 47, 22, 5, 21, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 60, 61, 50, 38, 24, 25, 63, 39, 28, 29, 30, 31, + 64, 90,127,123, 91,108, 80,125, 77, 93, 92, 78,107, 96, 75, 97, + 240,241,242,243,244,245,246,247,248,249,122, 94, 76,126,110,111, + 124,193,194,195,196,197,198,199,200,201,209,210,211,212,213,214, + 215,216,217,226,227,228,229,230,231,232,233,173,224,189, 95,109, + 121,129,130,131,132,133,134,135,136,137,145,146,147,148,149,150, + 151,152,153,162,163,164,165,166,167,168,169,192, 79,208,161, 7, + 32, 33, 34, 35, 36, 37, 6, 23, 40, 41, 42, 43, 44, 9, 10, 27, + 48, 49, 26, 51, 52, 53, 54, 8, 56, 57, 58, 59, 4, 20, 62,255, + 65,170, 74,177,159,178,106,181,187,180,154,138,176,202,175,188, + 144,143,234,250,190,160,182,179,157,218,155,139,183,184,185,171, + 100,101, 98,102, 99,103,158,104,116,113,114,115,120,117,118,119, + 172,105,237,238,235,239,236,191,128,253,254,251,252,186,174, 89, + 68, 69, 66, 70, 67, 71,156, 72, 84, 81, 82, 83, 88, 85, 86, 87, + 140, 73,205,206,203,207,204,225,112,221,222,219,220,141,142,223 + ); + $url =~ s/%([0-9a-fA-F]{2})/pack("c",$a2e_1047[hex($1)])/ge; + +Conversely, here is a partial solution for the task of encoding such +a URL under the 1047 code page: + + $url = 'http://www.pvhp.com/~pvhp/'; + # this array assumes code page 1047 + my @e2a_1047 = ( + 0, 1, 2, 3,156, 9,134,127,151,141,142, 11, 12, 13, 14, 15, + 16, 17, 18, 19,157, 10, 8,135, 24, 25,146,143, 28, 29, 30, 31, + 128,129,130,131,132,133, 23, 27,136,137,138,139,140, 5, 6, 7, + 144,145, 22,147,148,149,150, 4,152,153,154,155, 20, 21,158, 26, + 32,160,226,228,224,225,227,229,231,241,162, 46, 60, 40, 43,124, + 38,233,234,235,232,237,238,239,236,223, 33, 36, 42, 41, 59, 94, + 45, 47,194,196,192,193,195,197,199,209,166, 44, 37, 95, 62, 63, + 248,201,202,203,200,205,206,207,204, 96, 58, 35, 64, 39, 
61, 34, + 216, 97, 98, 99,100,101,102,103,104,105,171,187,240,253,254,177, + 176,106,107,108,109,110,111,112,113,114,170,186,230,184,198,164, + 181,126,115,116,117,118,119,120,121,122,161,191,208, 91,222,174, + 172,163,165,183,169,167,182,188,189,190,221,168,175, 93,180,215, + 123, 65, 66, 67, 68, 69, 70, 71, 72, 73,173,244,246,242,243,245, + 125, 74, 75, 76, 77, 78, 79, 80, 81, 82,185,251,252,249,250,255, + 92,247, 83, 84, 85, 86, 87, 88, 89, 90,178,212,214,210,211,213, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,179,219,220,217,218,159 + ); + # The following regular expression does not address the + # mappings for: ('.' => '%2E', '/' => '%2F', ':' => '%3A') + $url =~ s/([\t "#%&\(\),;<=>\?\@\[\\\]^`{|}~])/sprintf("%%%02X",$e2a_1047[ord($1)])/ge; + +where a more complete solution would split the URL into components +and apply a full s/// substitution only to the appropriate parts. + +In the remaining examples a @e2a or @a2e array may be employed +but the assignment will not be shown explicitly. For code page 1047 +you could use the @a2e_1047 or @e2a_1047 arrays just shown. + +=head2 uu encoding and decoding + +The C<u> template to pack() or unpack() will render EBCDIC data in EBCDIC +characters equivalent to their ASCII counterparts. 
For example, the +following will print "Yes indeed\n" on either an ASCII or EBCDIC computer: + + $all_byte_chrs = ''; + for (0..255) { $all_byte_chrs .= chr($_); } + $uuencode_byte_chrs = pack('u', $all_byte_chrs); + ($uu = <<' ENDOFHEREDOC') =~ s/^\s*//gm; + M``$"`P0%!@<("0H+#`T.#Q`1$A,4%187&!D:&QP='A\@(2(C)"4F)R@I*BLL + M+2XO,#$R,S0U-C<X.3H[/#T^/T!!0D-$149'2$E*2TQ-3D]045)35%565UA9 + M6EM<75Y?8&%B8V1E9F=H:6IK;&UN;W!Q<G-T=79W>'EZ>WQ]?G^`@8*#A(6& + MAXB)BHN,C8Z/D)&2DY25EI>8F9J;G)V>GZ"AHJ.DI::GJ*FJJZRMKJ^PL;*S + MM+6VM[BYNKN\O;Z_P,'"P\3%QL?(R<K+S,W.S]#1TM/4U=;7V-G:V]S=WM_@ + ?X>+CY.7FY^CIZNOL[>[O\/'R\_3U]O?X^?K[_/W^_P`` + ENDOFHEREDOC + if ($uuencode_byte_chrs eq $uu) { + print "Yes "; + } + $uudecode_byte_chrs = unpack('u', $uuencode_byte_chrs); + if ($uudecode_byte_chrs eq $all_byte_chrs) { + print "indeed\n"; + } + +Here is a very spartan uudecoder that will work on EBCDIC provided +that the @e2a array is filled in appropriately: + + #!/usr/local/bin/perl + @e2a = ( # this must be filled in + ); + $_ = <> until ($mode,$file) = /^begin\s*(\d*)\s*(\S*)/; + open(OUT, "> $file") if $file ne ""; + while(<>) { + last if /^end/; + next if /[a-z]/; + next unless int(((($e2a[ord()] - 32 ) & 077) + 2) / 3) == + int(length() / 4); + print OUT unpack("u", $_); + } + close(OUT); + chmod oct($mode), $file; + + +=head2 Quoted-Printable encoding and decoding + +On ASCII encoded machines it is possible to strip characters outside of +the printable set using: + + # This QP encoder works on ASCII only + $qp_string =~ s/([=\x00-\x1F\x80-\xFF])/sprintf("=%02X",ord($1))/ge; + +Whereas a QP encoder that works on both ASCII and EBCDIC machines +would look somewhat like the following (where the EBCDIC branch @e2a +array is omitted for brevity): + + if (ord('A') == 65) { # ASCII + $delete = "\x7F"; # ASCII + @e2a = (0 .. 
255) # ASCII to ASCII identity map + } + else { # EBCDIC + $delete = "\x07"; # EBCDIC + @e2a = # EBCDIC to ASCII map (as shown above) + } + $qp_string =~ + s/([^ !"\#\$%&'()*+,\-.\/0-9:;<>?\@A-Z[\\\]^_`a-z{|}~$delete])/sprintf("=%02X",$e2a[ord($1)])/ge; + +(although in production code the substitutions might be done +in the EBCDIC branch with the @e2a array and separately in the +ASCII branch without the expense of the identity map). + +Such QP strings can be decoded with: + + # This QP decoder is limited to ASCII only + $string =~ s/=([0-9A-Fa-f][0-9A-Fa-f])/chr hex $1/ge; + $string =~ s/=[\n\r]+$//; + +Whereas a QP decoder that works on both ASCII and EBCDIC machines +would look somewhat like the following (where the @a2e array is +omitted for brevity): + + $string =~ s/=([0-9A-Fa-f][0-9A-Fa-f])/chr $a2e[hex $1]/ge; + $string =~ s/=[\n\r]+$//; + +=head2 Caesarian cyphers + +The practice of shifting an alphabet one or more characters for encipherment +dates back thousands of years and was explicitly detailed by Gaius Julius +Caesar in his B<Gallic Wars> text. A single alphabet shift is sometimes +referred to as a rotation and the shift amount is given as a number $n after +the string 'rot' or "rot$n". Rot0 and rot26 would designate identity maps +on the 26 letter English version of the Latin alphabet. Rot13 has the +interesting property that alternate subsequent invocations are identity maps +(thus rot13 is its own non-trivial inverse in the group of 26 alphabet +rotations). Hence the following is a rot13 encoder and decoder that will +work on ASCII and EBCDIC machines: + + #!/usr/local/bin/perl + + while(<>){ + tr/n-za-mN-ZA-M/a-zA-Z/; + print; + } + +In one-liner form: + + perl -ne 'tr/n-za-mN-ZA-M/a-zA-Z/;print' + + +=head1 Hashing order and checksums + +XXX + +=head1 I18N AND L10N + +Internationalization(I18N) and localization(L10N) are supported at least +in principle even on EBCDIC machines. 
The details are system dependent +and discussed under the L<perlebcdic/OS ISSUES> section below. + +=head1 MULTI OCTET CHARACTER SETS + +Multi byte EBCDIC code pages; Unicode, UTF-8, UTF-EBCDIC, XXX. + +=head1 OS ISSUES + +There may be a few system dependent issues +of concern to EBCDIC Perl programmers. + +=head2 OS/400 + +The PASE environment. + +=over 8 + +=item IFS access + +XXX. + +=back + +=head2 OS/390 + +Perl runs under Unix System Services or USS. + +=over 8 + +=item chcp + +B<chcp> is supported as a shell utility for displaying and changing +one's code page. See also L<chcp>. + +=item dataset access + +For sequential data set access try: + + my @ds_records = `cat //DSNAME`; + +or: + + my @ds_records = `cat //'HLQ.DSNAME'`; + +See also the OS390::Stdio module on CPAN. + +=item OS/390 iconv + +B<iconv> is supported as both a shell utility and a C RTL routine. +See also the iconv(1) and iconv(3) manual pages. + +=item locales + +On OS/390 see L<locale> for information on locales. The L10N files +are in F</usr/nls/locale>. $Config{d_setlocale} is 'define' on OS/390. + +=back + +=head2 VM/ESA? + +XXX. + +=head2 POSIX-BC? + +XXX. + +=head1 BUGS + +This pod document contains literal Latin 1 characters and may encounter +translation difficulties. In particular one popular nroff implementation +was known to strip accented characters to their unaccented counterparts +while attempting to view this document through the B<pod2man> program +(for example, you may see a plain C<y> rather than one with a diaeresis +as in E<yuml>). Another nroff truncated the resultant man page at +the first occurrence of 8 bit characters. + +Not all shells will allow multiple C<-e> string arguments to perl to +be concatenated together properly as recipes 2, 3, and 4 might seem +to imply. + +Perl does not yet work with any Unicode features on EBCDIC platforms. + +=head1 SEE ALSO + +L<perllocale>, L<perlfunc>.
+ +=head1 REFERENCES + +http://anubis.dkuug.dk/i18n/charmaps + +http://www.unicode.org/ + +http://www.unicode.org/unicode/reports/tr16/ + +http://www.wps.com/texts/codes/ +B<ASCII: American Standard Code for Information Infiltration> Tom Jennings, +September 1999. + +B<The Unicode Standard Version 2.0> The Unicode Consortium, +ISBN 0-201-48345-9, Addison Wesley Developers Press, July 1996. + +B<The Unicode Standard Version 3.0> The Unicode Consortium, Lisa Moore ed., +ISBN 0-201-61633-5, Addison Wesley Developers Press, February 2000. + +B<CDRA: IBM - Character Data Representation Architecture - +Reference and Registry>, IBM SC09-2190-00, December 1996. + +"Demystifying Character Sets", Andrea Vine, Multilingual Computing +& Technology, B<#26 Vol. 10 Issue 4>, August/September 1999; +ISSN 1523-0309; Multilingual Computing Inc. Sandpoint ID, USA. + +B<Codes, Ciphers, and Other Cryptic and Clandestine Communication> +Fred B. Wrixon, ISBN 1-57912-040-7, Black Dog & Leventhal Publishers, +1998. + +=head1 AUTHOR + +Peter Prymmer pvhp@best.com wrote this in 1999 and 2000 +with CCSID 0819 and 0037 help from Chris Leach and +AndrE<eacute> Pirard A.Pirard@ulg.ac.be as well as POSIX-BC +help from Thomas Dorner Thomas.Dorner@start.de. +Thanks also to Vickie Cooper, Philip Newton, William Raffloer, and +Joe Smith. Trademarks, registered trademarks, service marks and +registered service marks used in this document are the property of +their respective owners. 
+ + diff --git a/gnu/usr.bin/perl/pod/perlfilter.pod b/gnu/usr.bin/perl/pod/perlfilter.pod index c3c83153adf..4327809ec95 100644 --- a/gnu/usr.bin/perl/pod/perlfilter.pod +++ b/gnu/usr.bin/perl/pod/perlfilter.pod @@ -2,7 +2,6 @@ perlfilter - Source Filters - =head1 DESCRIPTION This article is about a little-known feature of Perl called diff --git a/gnu/usr.bin/perl/pod/perlfork.pod b/gnu/usr.bin/perl/pod/perlfork.pod index d930e9396e8..dc0a82bfd64 100644 --- a/gnu/usr.bin/perl/pod/perlfork.pod +++ b/gnu/usr.bin/perl/pod/perlfork.pod @@ -1,9 +1,14 @@ =head1 NAME -perlfork - Perl's fork() emulation +perlfork - Perl's fork() emulation (EXPERIMENTAL, subject to change) =head1 SYNOPSIS + WARNING: As of the 5.6.1 release, the fork() emulation continues + to be an experimental feature. Use in production applications is + not recommended. See the "BUGS" and "CAVEATS AND LIMITATIONS" + sections below. + Perl provides a fork() keyword that corresponds to the Unix system call of the same name. On most Unix-like platforms where the fork() system call is available, Perl's fork() simply calls it. @@ -11,7 +16,7 @@ call is available, Perl's fork() simply calls it. On some platforms such as Windows where the fork() system call is not available, Perl can be built to emulate fork() at the interpreter level. While the emulation is designed to be as compatible as possible with the -real fork() at the the level of the Perl program, there are certain +real fork() at the level of the Perl program, there are certain important differences that stem from the fact that all the pseudo child "processes" created this way live in the same real process as far as the operating system is concerned. @@ -51,7 +56,7 @@ pseudo-processes are launched after others have been wait()-ed on. =item %ENV -Each pseudo-process maintains its own virtual enviroment. Modifications +Each pseudo-process maintains its own virtual environment. 
Modifications to %ENV affect the virtual environment, and are only visible within that pseudo-process, and in any processes (or pseudo-processes) launched from it. @@ -274,6 +279,17 @@ are expected to be fixed for thread-safety. =item * +Perl's regular expression engine currently does not play very nicely +with the fork() emulation. There are known race conditions arising +from the regular expression engine modifying state carried in the opcode +tree at run time (the fork() emulation relies on the opcode tree being +immutable). This typically happens when the regex contains paren groups +or variables interpolated within it that force a run time recompilation +of the regex. Due to this major bug, the fork() emulation is not +recommended for use in production applications at this time. + +=item * + Having pseudo-process IDs be negative integers breaks down for the integer C<-1> because the wait() and waitpid() functions treat this number as being special. The tacit assumption in the current implementation is that diff --git a/gnu/usr.bin/perl/pod/perlhack.pod b/gnu/usr.bin/perl/pod/perlhack.pod index c6408702641..d524fe55f5f 100644 --- a/gnu/usr.bin/perl/pod/perlhack.pod +++ b/gnu/usr.bin/perl/pod/perlhack.pod @@ -194,6 +194,8 @@ around. It refers to the standard distribution. ``Hacking on the core'' means you're changing the C source code to the Perl interpreter. ``A core module'' is one that ships with Perl. +=head2 Keeping in sync + The source code to the Perl interpreter, in its different versions, is kept in a repository managed by a revision control system (which is currently the Perforce program, see http://perforce.com/). The @@ -206,20 +208,256 @@ public release are available at this location: ftp://ftp.linux.activestate.com/pub/staff/gsar/APC/ -Selective parts are also visible via the rsync protocol. 
To get all -the individual changes to the mainline since the last development -release, use the following command: - - rsync -avuz rsync://ftp.linux.activestate.com/perl-diffs perl-diffs - -Use this to get the latest source tree in full: - - rsync -avuz rsync://ftp.linux.activestate.com/perl-current perl-current +If you are a member of the perl5-porters mailing list, it is a good +thing to keep in touch with the most recent changes, if only to +verify whether what you would have posted as a bug report isn't already +solved in the most recent available perl development branch, also +known as perl-current, bleading edge perl, bleedperl or bleadperl. Needless to say, the source code in perl-current is usually in a perpetual state of evolution. You should expect it to be very buggy. Do B<not> use it for any purpose other than testing and development. +Keeping in sync with the most recent branch can be done in several ways, +but the most convenient and reliable way is using B<rsync>, available at +ftp://rsync.samba.org/pub/rsync/ . (You can also get the most recent +branch by FTP.) + +If you choose to keep in sync using rsync, there are two approaches +to doing so: + +=over 4 + +=item rsync'ing the source tree + +Presuming you are in the directory where your perl source resides +and you have rsync installed and available, you can `upgrade' to +the bleadperl using: + + # rsync -avz rsync://ftp.linux.activestate.com/perl-current/ . + +This takes care of updating every single item in the source tree to +the latest applied patch level, creating files that are new (to your +distribution) and setting date/time stamps of existing files to +reflect the bleadperl status. + +You can then check what patch was the latest that was applied by +looking in the file B<.patch>, which will show the number of the +latest patch.
+ +If you have more than one machine to keep in sync, and not all of +them have access to the WAN (so you are not able to rsync all the +source trees to the real source), there are some ways to get around +this problem. + +=over 4 + +=item Using rsync over the LAN + +Set up a local rsync server which makes the rsynced source tree +available to the LAN and sync the other machines against this +directory. + +From http://rsync.samba.org/README.html: + + "Rsync uses rsh or ssh for communication. It does not need to be + setuid and requires no special privileges for installation. It + does not require a inetd entry or a deamon. You must, however, + have a working rsh or ssh system. Using ssh is recommended for + its security features." + +=item Using pushing over the NFS + +Having the other systems mounted over the NFS, you can take an +active pushing approach by checking the just updated tree against +the other not-yet synced trees. An example would be + + #!/usr/bin/perl -w + + use strict; + use File::Copy; + + my %MF = map { + m/(\S+)/; + $1 => [ (stat $1)[2, 7, 9] ]; # mode, size, mtime + } `cat MANIFEST`; + + my %remote = map { $_ => "/$_/pro/3gl/CPAN/perl-5.7.1" } qw(host1 host2); + + foreach my $host (keys %remote) { + unless (-d $remote{$host}) { + print STDERR "Cannot Xsync for host $host\n"; + next; + } + foreach my $file (keys %MF) { + my $rfile = "$remote{$host}/$file"; + my ($mode, $size, $mtime) = (stat $rfile)[2, 7, 9]; + defined $size or ($mode, $size, $mtime) = (0, 0, 0); + $size == $MF{$file}[1] && $mtime == $MF{$file}[2] and next; + printf "%4s %-34s %8d %9d %8d %9d\n", + $host, $file, $MF{$file}[1], $MF{$file}[2], $size, $mtime; + unlink $rfile; + copy ($file, $rfile); + utime time, $MF{$file}[2], $rfile; + chmod $MF{$file}[0], $rfile; + } + } + +though this is not perfect. It could be improved with checking +file checksums before updating. Not all NFS systems support +reliable utime support (when used over the NFS). 
+ +=back + +=item rsync'ing the patches + +The source tree is maintained by the pumpking who applies patches to +the files in the tree. These patches are either created by the +pumpking himself using C<diff -c> after updating the file manually or +by applying patches sent in by posters on the perl5-porters list. +These patches are also saved and rsync'able, so you can apply them +yourself to the source files. + +Presuming you are in a directory where your patches reside, you can +get them in sync with + + # rsync -avz rsync://ftp.linux.activestate.com/perl-current-diffs/ . + +This makes sure the latest available patch is downloaded to your +patch directory. + +It's then up to you to apply these patches, using something like + + # last=`ls -rt1 *.gz | tail -1` + # rsync -avz rsync://ftp.linux.activestate.com/perl-current-diffs/ . + # find . -name '*.gz' -newer $last -exec gzcat {} \; >blead.patch + # cd ../perl-current + # patch -p1 -N <../perl-current-diffs/blead.patch + +or, since this is only a hint towards how it works, use CPAN-patchaperl +from Andreas König to have better control over the patching process. + +=back + +=head2 Why rsync the source tree + +=over 4 + +=item It's easier + +Since you don't have to apply the patches yourself, you are sure all +files in the source tree are in the right state. + +=item It's more recent + +According to Gurusamy Sarathy: + + "... The rsync mirror is automatic and syncs with the repository + every five minutes. + + "Updating the patch area still requires manual intervention + (with all the goofiness that implies, which you've noted) and + is typically on a daily cycle. Making this process automatic + is on my tuit list, but don't ask me when." + +=item It's more reliable + +Well, since the patches are updated by hand, I don't have to say any +more ... (see Sarathy's remark). 
+ +=back + +=head2 Why rsync the patches + +=over 4 + +=item It's easier + +If you have more than one machine that you want to keep in track with +bleadperl, it's easier to rsync the patches only once and then apply +them to all the source trees on the different machines. + +In case you try to keep in pace on 5 different machines, for which +only one of them has access to the WAN, rsync'ing all the source +trees should then be done 5 times over the NFS. Having +rsync'ed the patches only once, I can apply them to all the source +trees automatically. Need I say more ;-) + +=item It's a good reference + +If you do not only like to have the most recent development branch, +but also like to B<fix> bugs, or extend features, you want to dive +into the sources. If you are a seasoned perl core diver, you don't +need no manuals, tips, roadmaps, perlguts.pod or other aids to find +your way around. But if you are a starter, the patches may help you +in finding where you should start and how to change the bits that +bug you. + +The file B<Changes> is updated on occasions the pumpking sees as his +own little sync points. On those occasions, he releases a tar-ball of +the current source tree (e.g. perl@7582.tar.gz), which will be an +excellent point to start with when choosing to use the 'rsync the +patches' scheme. Starting with perl@7582, which means a set of source +files on which the latest applied patch is number 7582, you apply all +succeeding patches available from then on (7583, 7584, ...). + +You can use the patches later as a kind of search archive. + +=over 4 + +=item Finding a start point + +If you want to fix/change the behaviour of function/feature Foo, just +scan the patches for patches that mention Foo either in the subject, +the comments, or the body of the fix. A good chance the patch shows +you the files that are affected by that patch which are very likely +to be the starting point of your journey into the guts of perl.
+ +=item Finding how to fix a bug + +If you've found I<where> the function/feature Foo misbehaves, but you +don't know how to fix it (but you do know the change you want to +make), you can, again, peruse the patches for similar changes and +look how others apply the fix. + +=item Finding the source of misbehaviour + +When you keep in sync with bleadperl, the pumpking would love to +I<see> that the community efforts really work. So after each of his +sync points, you are to 'make test' to check if everything is still +in working order. If it is, you do 'make ok', which will send an OK +report to perlbug@perl.org. (If you do not have access to a mailer +from the system you just finished successfully 'make test', you can +do 'make okfile', which creates the file C<perl.ok>, which you can +then take to your favourite mailer and mail yourself). + +But of course, as always, things will not always lead to a success +path, and one or more tests do not pass the 'make test'. Before +sending in a bug report (using 'make nok' or 'make nokfile'), check +the mailing list if someone else has reported the bug already and if +so, confirm it by replying to that message. If not, you might want to +trace the source of that misbehaviour B<before> sending in the bug, +which will help all the other porters in finding the solution. + +Here the saved patches come in very handy. You can check the list of +patches to see which patch changed what file and what change caused +the misbehaviour. If you note that in the bug report, it saves the +one trying to solve it, looking for that point. + +=back + +If searching the patches is too bothersome, you might consider using +perl's bugtron to find more information about discussions and +ramblings on posted bugs. + +=back + +If you want to get the best of both worlds, rsync both the source +tree for convenience, reliability and ease and rsync the patches +for reference.
+ +=head2 Submitting patches + Always submit patches to I<perl5-porters@perl.org>. This lets other porters review your patch, which catches a surprising number of errors in patches. Either use the diff program (available in source code @@ -237,7 +475,7 @@ Your patch should update the documentation and test suite. To report a bug in Perl, use the program I<perlbug> which comes with Perl (if you can't get Perl to work, send mail to the address -I<perlbug@perl.com> or I<perlbug@perl.org>). Reporting bugs through +I<perlbug@perl.org> or I<perlbug@perl.com>). Reporting bugs through I<perlbug> feeds into the automated bug-tracking system, access to which is provided through the web at I<http://bugs.perl.org/>. It often pays to check the archives of the perl5-porters mailing list to @@ -251,31 +489,6 @@ volunteers who test CPAN modules on a variety of platforms. Perl Labs platforms and gives feedback to the CPAN testers mailing list. Both efforts welcome volunteers. -To become an active and patching Perl porter, you'll need to learn how -Perl works on the inside. Chip Salzenberg, a pumpking, has written -articles on Perl internals for The Perl Journal -(I<http://www.tpj.com/>) which explain how various parts of the Perl -interpreter work. The C<perlguts> manpage explains the internal data -structures. And, of course, the C source code (sometimes sparsely -commented, sometimes commented well) is a great place to start (begin -with C<perl.c> and see where it goes from there). A lot of the style -of the Perl source is explained in the I<Porting/pumpkin.pod> file in -the source distribution. - -It is essential that you be comfortable using a good debugger -(e.g. gdb, dbx) before you can patch perl. Stepping through perl -as it executes a script is perhaps the best (if sometimes tedious) -way to gain a precise understanding of the overall architecture of -the language. 
- -If you build a version of the Perl interpreter with C<-DDEBUGGING>, -Perl's B<-D> command line flag will cause copious debugging information -to be emitted (see the C<perlrun> manpage). If you build a version of -Perl with compiler debugging information (e.g. with the C compiler's -C<-g> option instead of C<-O>) then you can step through the execution -of the interpreter with your favourite C symbolic debugger, setting -breakpoints on particular functions. - It's a good idea to read and lurk for a while before chipping in. That way you'll get to see the dynamic of the conversations, learn the personalities of the players, and hopefully be better prepared to make @@ -285,6 +498,1223 @@ If after all this you still think you want to join the perl5-porters mailing list, send mail to I<perl5-porters-subscribe@perl.org>. To unsubscribe, send mail to I<perl5-porters-unsubscribe@perl.org>. +To hack on the Perl guts, you'll need to read the following things: + +=over 3 + +=item L<perlguts> + +This is of paramount importance, since it's the documentation of what +goes where in the Perl source. Read it over a couple of times and it +might start to make sense - don't worry if it doesn't yet, because the +best way to study it is to read it in conjunction with poking at Perl +source, and we'll do that later on. + +You might also want to look at Gisle Aas's illustrated perlguts - +there's no guarantee that this will be absolutely up-to-date with the +latest documentation in the Perl core, but the fundamentals will be +right. (http://gisle.aas.no/perl/illguts/) + +=item L<perlxstut> and L<perlxs> + +A working knowledge of XSUB programming is incredibly useful for core +hacking; XSUBs use techniques drawn from the PP code, the portion of the +guts that actually executes a Perl program. It's a lot gentler to learn +those techniques from simple examples and explanation than from the core +itself. 
+ +=item L<perlapi> + +The documentation for the Perl API explains what some of the internal +functions do, as well as the many macros used in the source. + +=item F<Porting/pumpkin.pod> + +This is a collection of words of wisdom for a Perl porter; some of it is +only useful to the pumpkin holder, but most of it applies to anyone +wanting to go about Perl development. + +=item The perl5-porters FAQ + +This is posted to perl5-porters at the beginning of every month, and +should be available from http://perlhacker.org/p5p-faq; alternatively, +you can get the FAQ emailed to you by sending mail to +C<perl5-porters-faq@perl.org>. It contains hints on reading +perl5-porters, information on how perl5-porters works and how Perl +development in general works. + +=back + +=head2 Finding Your Way Around + +Perl maintenance can be split into a number of areas, and certain people +(pumpkins) will have responsibility for each area. These areas sometimes +correspond to files or directories in the source kit. Among the areas are: + +=over 3 + +=item Core modules + +Modules shipped as part of the Perl core live in the F<lib/> and F<ext/> +subdirectories: F<lib/> is for the pure-Perl modules, and F<ext/> +contains the core XS modules. + +=item Documentation + +Documentation maintenance includes looking after everything in the +F<pod/> directory (as well as contributing new documentation) and +the documentation to the modules in core. + +=item Configure + +The configure process is the way we make Perl portable across the +myriad of operating systems it supports. Responsibility for the +configure, build and installation process, as well as the overall +portability of the core code rests with the configure pumpkin - others +help out with individual operating systems.
+ +The files involved are the operating system directories, (F<win32/>, +F<os2/>, F<vms/> and so on) the shell scripts which generate F<config.h> +and F<Makefile>, as well as the metaconfig files which generate +F<Configure>. (metaconfig isn't included in the core distribution.) + +=item Interpreter + +And of course, there's the core of the Perl interpreter itself. Let's +have a look at that in a little more detail. + +=back + +Before we leave looking at the layout, though, don't forget that +F<MANIFEST> contains not only the file names in the Perl distribution, +but short descriptions of what's in them, too. For an overview of the +important files, try this: + + perl -lne 'print if /^[^\/]+\.[ch]\s+/' MANIFEST + +=head2 Elements of the interpreter + +The work of the interpreter has two main stages: compiling the code +into the internal representation, or bytecode, and then executing it. +L<perlguts/Compiled code> explains exactly how the compilation stage +happens. + +Here is a short breakdown of perl's operation: + +=over 3 + +=item Startup + +The action begins in F<perlmain.c>. (or F<miniperlmain.c> for miniperl) +This is very high-level code, enough to fit on a single screen, and it +resembles the code found in L<perlembed>; most of the real action takes +place in F<perl.c> + +First, F<perlmain.c> allocates some memory and constructs a Perl +interpreter: + + 1 PERL_SYS_INIT3(&argc,&argv,&env); + 2 + 3 if (!PL_do_undump) { + 4 my_perl = perl_alloc(); + 5 if (!my_perl) + 6 exit(1); + 7 perl_construct(my_perl); + 8 PL_perl_destruct_level = 0; + 9 } + +Line 1 is a macro, and its definition is dependent on your operating +system. Line 3 references C<PL_do_undump>, a global variable - all +global variables in Perl start with C<PL_>. This tells you whether the +current running program was created with the C<-u> flag to perl and then +F<undump>, which means it's going to be false in any sane context. 
+ +Line 4 calls a function in F<perl.c> to allocate memory for a Perl +interpreter. It's quite a simple function, and the guts of it looks like +this: + + my_perl = (PerlInterpreter*)PerlMem_malloc(sizeof(PerlInterpreter)); + +Here you see an example of Perl's system abstraction, which we'll see +later: C<PerlMem_malloc> is either your system's C<malloc>, or Perl's +own C<malloc> as defined in F<malloc.c> if you selected that option at +configure time. + +Next, in line 7, we construct the interpreter; this sets up all the +special variables that Perl needs, the stacks, and so on. + +Now we pass Perl the command line options, and tell it to go: + + exitstatus = perl_parse(my_perl, xs_init, argc, argv, (char **)NULL); + if (!exitstatus) { + exitstatus = perl_run(my_perl); + } + + +C<perl_parse> is actually a wrapper around C<S_parse_body>, as defined +in F<perl.c>, which processes the command line options, sets up any +statically linked XS modules, opens the program and calls C<yyparse> to +parse it. + +=item Parsing + +The aim of this stage is to take the Perl source, and turn it into an op +tree. We'll see what one of those looks like later. Strictly speaking, +there's three things going on here. + +C<yyparse>, the parser, lives in F<perly.c>, although you're better off +reading the original YACC input in F<perly.y>. (Yes, Virginia, there +B<is> a YACC grammar for Perl!) The job of the parser is to take your +code and `understand' it, splitting it into sentences, deciding which +operands go with which operators and so on. + +The parser is nobly assisted by the lexer, which chunks up your input +into tokens, and decides what type of thing each token is: a variable +name, an operator, a bareword, a subroutine, a core function, and so on. +The main point of entry to the lexer is C<yylex>, and that and its +associated routines can be found in F<toke.c>. 
Perl isn't much like +other computer languages; it's highly context sensitive at times, it can +be tricky to work out what sort of token something is, or where a token +ends. As such, there's a lot of interplay between the tokeniser and the +parser, which can get pretty frightening if you're not used to it. + +As the parser understands a Perl program, it builds up a tree of +operations for the interpreter to perform during execution. The routines +which construct and link together the various operations are to be found +in F<op.c>, and will be examined later. + +=item Optimization + +Now the parsing stage is complete, and the finished tree represents +the operations that the Perl interpreter needs to perform to execute our +program. Next, Perl does a dry run over the tree looking for +optimisations: constant expressions such as C<3 + 4> will be computed +now, and the optimizer will also see if any multiple operations can be +replaced with a single one. For instance, to fetch the variable C<$foo>, +instead of grabbing the glob C<*foo> and looking at the scalar +component, the optimizer fiddles the op tree to use a function which +directly looks up the scalar in question. The main optimizer is C<peep> +in F<op.c>, and many ops have their own optimizing functions. + +=item Running + +Now we're finally ready to go: we have compiled Perl byte code, and all +that's left to do is run it. The actual execution is done by the +C<runops_standard> function in F<run.c>; more specifically, it's done by +these three innocent looking lines: + + while ((PL_op = CALL_FPTR(PL_op->op_ppaddr)(aTHX))) { + PERL_ASYNC_CHECK(); + } + +You may be more comfortable with the Perl version of that: + + PERL_ASYNC_CHECK() while $Perl::op = &{$Perl::op->{function}}; + +Well, maybe not. Anyway, each op contains a function pointer, which +stipulates the function which will actually carry out the operation. 
+This function will return the next op in the sequence - this allows for +things like C<if> which choose the next op dynamically at run time. +The C<PERL_ASYNC_CHECK> makes sure that things like signals interrupt +execution if required. + +The actual functions called are known as PP code, and they're spread +between four files: F<pp_hot.c> contains the `hot' code, which is most +often used and highly optimized, F<pp_sys.c> contains all the +system-specific functions, F<pp_ctl.c> contains the functions which +implement control structures (C<if>, C<while> and the like) and F<pp.c> +contains everything else. These are, if you like, the C code for Perl's +built-in functions and operators. + +=back + +=head2 Internal Variable Types + +You should by now have had a look at L<perlguts>, which tells you about +Perl's internal variable types: SVs, HVs, AVs and the rest. If not, do +that now. + +These variables are used not only to represent Perl-space variables, but +also any constants in the code, as well as some structures completely +internal to Perl. The symbol table, for instance, is an ordinary Perl +hash. Your code is represented by an SV as it's read into the parser; +any program files you call are opened via ordinary Perl filehandles, and +so on. + +The core L<Devel::Peek|Devel::Peek> module lets us examine SVs from a +Perl program. Let's see, for instance, how Perl treats the constant +C<"hello">. + + % perl -MDevel::Peek -e 'Dump("hello")' + 1 SV = PV(0xa041450) at 0xa04ecbc + 2 REFCNT = 1 + 3 FLAGS = (POK,READONLY,pPOK) + 4 PV = 0xa0484e0 "hello"\0 + 5 CUR = 5 + 6 LEN = 6 + +Reading C<Devel::Peek> output takes a bit of practise, so let's go +through it line by line. + +Line 1 tells us we're looking at an SV which lives at C<0xa04ecbc> in +memory. SVs themselves are very simple structures, but they contain a +pointer to a more complex structure. In this case, it's a PV, a +structure which holds a string value, at location C<0xa041450>. 
Line 2 +is the reference count; there are no other references to this data, so +it's 1. + +Line 3 shows the flags for this SV - it's OK to use it as a PV, it's a +read-only SV (because it's a constant) and the data is a PV internally. +Next we've got the contents of the string, starting at location +C<0xa0484e0>. + +Line 5 gives us the current length of the string - note that this does +B<not> include the null terminator. Line 6 is not the length of the +string, but the length of the currently allocated buffer; as the string +grows, Perl automatically extends the available storage via a routine +called C<SvGROW>. + +You can get at any of these quantities from C very easily; just add +C<Sv> to the name of the field shown in the snippet, and you've got a +macro which will return the value: C<SvCUR(sv)> returns the current +length of the string, C<SvREFCNT(sv)> returns the reference count, +C<SvPV(sv, len)> returns the string itself with its length, and so on. +More macros to manipulate these properties can be found in L<perlguts>. + +Let's take an example of manipulating a PV, from C<sv_catpvn>, in F<sv.c> + + 1 void + 2 Perl_sv_catpvn(pTHX_ register SV *sv, register const char *ptr, register STRLEN len) + 3 { + 4 STRLEN tlen; + 5 char *junk; + + 6 junk = SvPV_force(sv, tlen); + 7 SvGROW(sv, tlen + len + 1); + 8 if (ptr == junk) + 9 ptr = SvPVX(sv); + 10 Move(ptr,SvPVX(sv)+tlen,len,char); + 11 SvCUR(sv) += len; + 12 *SvEND(sv) = '\0'; + 13 (void)SvPOK_only_UTF8(sv); /* validate pointer */ + 14 SvTAINT(sv); + 15 } + +This is a function which adds a string, C<ptr>, of length C<len> onto +the end of the PV stored in C<sv>. The first thing we do in line 6 is +make sure that the SV B<has> a valid PV, by calling the C<SvPV_force> +macro to force a PV. As a side effect, C<tlen> gets set to the current +length of the PV, and the PV itself is returned to C<junk>.
+ +In line 7, we make sure that the SV will have enough room to accommodate +the old string, the new string and the null terminator. If C<LEN> isn't +big enough, C<SvGROW> will reallocate space for us. + +Now, if C<junk> is the same as the string we're trying to add, we can +grab the string directly from the SV; C<SvPVX> is the address of the PV +in the SV. + +Line 10 does the actual catenation: the C<Move> macro moves a chunk of +memory around: we move the string C<ptr> to the end of the PV - that's +the start of the PV plus its current length. We're moving C<len> bytes +of type C<char>. After doing so, we need to tell Perl we've extended the +string, by altering C<CUR> to reflect the new length. C<SvEND> is a +macro which gives us the end of the string, so that needs to be a +C<"\0">. + +Line 13 manipulates the flags; since we've changed the PV, any IV or NV +values will no longer be valid: if we have C<$a=10; $a.="6";> we don't +want to use the old IV of 10. C<SvPOK_only_UTF8> is a special UTF8-aware +version of C<SvPOK_only>, a macro which turns off the IOK and NOK flags +and turns on POK. The final C<SvTAINT> is a macro which launders tainted +data if taint mode is turned on. + +AVs and HVs are more complicated, but SVs are by far the most common +variable type being thrown around. Having seen something of how we +manipulate these, let's go on and look at how the op tree is +constructed. + +=head2 Op Trees + +First, what is the op tree, anyway? The op tree is the parsed +representation of your program, as we saw in our section on parsing, and +it's the sequence of operations that Perl goes through to execute your +program, as we saw in L</Running>. + +An op is a fundamental operation that Perl can perform: all the built-in +functions and operators are ops, and there are a series of ops which +deal with concepts the interpreter needs internally - entering and +leaving a block, ending a statement, fetching a variable, and so on.
+ +The op tree is connected in two ways: you can imagine that there are two +"routes" through it, two orders in which you can traverse the tree. +First, parse order reflects how the parser understood the code, and +secondly, execution order tells perl what order to perform the +operations in. + +The easiest way to examine the op tree is to stop Perl after it has +finished parsing, and get it to dump out the tree. This is exactly what +the compiler backends L<B::Terse|B::Terse> and L<B::Debug|B::Debug> do. + +Let's have a look at how Perl sees C<$a = $b + $c>: + + % perl -MO=Terse -e '$a=$b+$c' + 1 LISTOP (0x8179888) leave + 2 OP (0x81798b0) enter + 3 COP (0x8179850) nextstate + 4 BINOP (0x8179828) sassign + 5 BINOP (0x8179800) add [1] + 6 UNOP (0x81796e0) null [15] + 7 SVOP (0x80fafe0) gvsv GV (0x80fa4cc) *b + 8 UNOP (0x81797e0) null [15] + 9 SVOP (0x8179700) gvsv GV (0x80efeb0) *c + 10 UNOP (0x816b4f0) null [15] + 11 SVOP (0x816dcf0) gvsv GV (0x80fa460) *a + +Let's start in the middle, at line 4. This is a BINOP, a binary +operator, which is at location C<0x8179828>. The specific operator in +question is C<sassign> - scalar assignment - and you can find the code +which implements it in the function C<pp_sassign> in F<pp_hot.c>. As a +binary operator, it has two children: the add operator, providing the +result of C<$b+$c>, is uppermost on line 5, and the left hand side is on +line 10. + +Line 10 is the null op: this does exactly nothing. What is that doing +there? If you see the null op, it's a sign that something has been +optimized away after parsing. As we mentioned in L</Optimization>, +the optimization stage sometimes converts two operations into one, for +example when fetching a scalar variable. When this happens, instead of +rewriting the op tree and cleaning up the dangling pointers, it's easier +just to replace the redundant operation with the null op. 
Originally, +the tree would have looked like this: + + 10 SVOP (0x816b4f0) rv2sv [15] + 11 SVOP (0x816dcf0) gv GV (0x80fa460) *a + +That is, fetch the C<a> entry from the main symbol table, and then look +at the scalar component of it: C<gvsv> (C<pp_gvsv> in F<pp_hot.c>) +happens to do both these things. + +The right hand side, starting at line 5, is similar to what we've just +seen: we have the C<add> op (C<pp_add> also in F<pp_hot.c>) add together +two C<gvsv>s. + +Now, what's this about? + + 1 LISTOP (0x8179888) leave + 2 OP (0x81798b0) enter + 3 COP (0x8179850) nextstate + +C<enter> and C<leave> are scoping ops, and their job is to perform any +housekeeping every time you enter and leave a block: lexical variables +are tidied up, unreferenced variables are destroyed, and so on. Every +program will have those first three lines: C<leave> is a list, and its +children are all the statements in the block. Statements are delimited +by C<nextstate>, so a block is a collection of C<nextstate> ops, with +the ops to be performed for each statement being the children of +C<nextstate>. C<enter> is a single op which functions as a marker. + +That's how Perl parsed the program, from top to bottom: + + Program + | + Statement + | + = + / \ + / \ + $a + + / \ + $b $c + +However, it's impossible to B<perform> the operations in this order: +you have to find the values of C<$b> and C<$c> before you add them +together, for instance. So, the other thread that runs through the op +tree is the execution order: each op has a field C<op_next> which points +to the next op to be run, so following these pointers tells us how perl +executes the code.
We can traverse the tree in this order using +the C<exec> option to C<B::Terse>: + + % perl -MO=Terse,exec -e '$a=$b+$c' + 1 OP (0x8179928) enter + 2 COP (0x81798c8) nextstate + 3 SVOP (0x81796c8) gvsv GV (0x80fa4d4) *b + 4 SVOP (0x8179798) gvsv GV (0x80efeb0) *c + 5 BINOP (0x8179878) add [1] + 6 SVOP (0x816dd38) gvsv GV (0x80fa468) *a + 7 BINOP (0x81798a0) sassign + 8 LISTOP (0x8179900) leave + +This probably makes more sense for a human: enter a block, start a +statement. Get the values of C<$b> and C<$c>, and add them together. +Find C<$a>, and assign one to the other. Then leave. + +The way Perl builds up these op trees in the parsing process can be +unravelled by examining F<perly.y>, the YACC grammar. Let's take the +piece we need to construct the tree for C<$a = $b + $c> + + 1 term : term ASSIGNOP term + 2 { $$ = newASSIGNOP(OPf_STACKED, $1, $2, $3); } + 3 | term ADDOP term + 4 { $$ = newBINOP($2, 0, scalar($1), scalar($3)); } + +If you're not used to reading BNF grammars, this is how it works: You're +fed certain things by the tokeniser, which generally end up in upper +case. Here, C<ADDOP>, is provided when the tokeniser sees C<+> in your +code. C<ASSIGNOP> is provided when C<=> is used for assigning. These are +`terminal symbols', because you can't get any simpler than them. + +The grammar, lines one and three of the snippet above, tells you how to +build up more complex forms. These complex forms, `non-terminal symbols' +are generally placed in lower case. C<term> here is a non-terminal +symbol, representing a single expression. + +The grammar gives you the following rule: you can make the thing on the +left of the colon if you see all the things on the right in sequence. +This is called a "reduction", and the aim of parsing is to completely +reduce the input. 
There are several different ways you can perform a +reduction, separated by vertical bars: so, C<term> followed by C<=> +followed by C<term> makes a C<term>, and C<term> followed by C<+> +followed by C<term> can also make a C<term>. + +So, if you see two terms with an C<=> or C<+> between them, you can +turn them into a single expression. When you do this, you execute the +code in the block on the next line: if you see C<=>, you'll do the code +in line 2. If you see C<+>, you'll do the code in line 4. It's this code +which contributes to the op tree. + + | term ADDOP term + { $$ = newBINOP($2, 0, scalar($1), scalar($3)); } + +What this does is create a new binary op, and feeds it a number of +variables. The variables refer to the tokens: C<$1> is the first token in +the input, C<$2> the second, and so on - think regular expression +backreferences. C<$$> is the op returned from this reduction. So, we +call C<newBINOP> to create a new binary operator. The first parameter to +C<newBINOP>, a function in F<op.c>, is the op type. It's an addition +operator, so we want the type to be C<ADDOP>. We could specify this +directly, but it's right there as the second token in the input, so we +use C<$2>. The second parameter is the op's flags: 0 means `nothing +special'. Then the things to add: the left and right hand side of our +expression, in scalar context. + +=head2 Stacks + +When perl executes something like C<addop>, how does it pass on its +results to the next op? The answer is, through the use of stacks. Perl +has a number of stacks to store things it's currently working on, and +we'll look at the three most important ones here. + +=over 3 + +=item Argument stack + +Arguments are passed to PP code and returned from PP code using the +argument stack, C<ST>. The typical way to handle arguments is to pop +them off the stack, deal with them how you wish, and then push the result +back onto the stack.
This is how, for instance, the cosine operator +works: + + NV value; + value = POPn; + value = Perl_cos(value); + XPUSHn(value); + +We'll see a more tricky example of this when we consider Perl's macros +below. C<POPn> gives you the NV (floating point value) of the top SV on +the stack: the C<$x> in C<cos($x)>. Then we compute the cosine, and push +the result back as an NV. The C<X> in C<XPUSHn> means that the stack +should be extended if necessary - it can't be necessary here, because we +know there's room for one more item on the stack, since we've just +removed one! The C<XPUSH*> macros at least guarantee safety. + +Alternatively, you can fiddle with the stack directly: C<SP> gives you +the first element in your portion of the stack, and C<TOP*> gives you +the top SV/IV/NV/etc. on the stack. So, for instance, to do unary +negation of an integer: + + SETi(-TOPi); + +Just set the integer value of the top stack entry to its negation. + +Argument stack manipulation in the core is exactly the same as it is in +XSUBs - see L<perlxstut>, L<perlxs> and L<perlguts> for a longer +description of the macros used in stack manipulation. + +=item Mark stack + +I say `your portion of the stack' above because PP code doesn't +necessarily get the whole stack to itself: if your function calls +another function, you'll only want to expose the arguments aimed for the +called function, and not (necessarily) let it get at your own data. The +way we do this is to have a `virtual' bottom-of-stack, exposed to each +function. The mark stack keeps bookmarks to locations in the argument +stack usable by each function. For instance, when dealing with a tied +variable, (internally, something with `P' magic) Perl has to call +methods for accesses to the tied variables. However, we need to separate +the arguments exposed to the method from the arguments exposed to the +original function - the store or fetch or whatever it may be.
Here's how +the tied C<push> is implemented; see C<av_push> in F<av.c>: + + 1 PUSHMARK(SP); + 2 EXTEND(SP,2); + 3 PUSHs(SvTIED_obj((SV*)av, mg)); + 4 PUSHs(val); + 5 PUTBACK; + 6 ENTER; + 7 call_method("PUSH", G_SCALAR|G_DISCARD); + 8 LEAVE; + 9 POPSTACK; + +The lines which concern the mark stack are the first, fifth and last +lines: they save away, restore and remove the current position of the +argument stack. + +Let's examine the whole implementation, for practice: + + 1 PUSHMARK(SP); + +Push the current state of the stack pointer onto the mark stack. This is +so that when we've finished adding items to the argument stack, Perl +knows how many things we've added recently. + + 2 EXTEND(SP,2); + 3 PUSHs(SvTIED_obj((SV*)av, mg)); + 4 PUSHs(val); + +We're going to add two more items onto the argument stack: when you have +a tied array, the C<PUSH> subroutine receives the object and the value +to be pushed, and that's exactly what we have here - the tied object, +retrieved with C<SvTIED_obj>, and the value, the SV C<val>. + + 5 PUTBACK; + +Next we tell Perl to make the change to the global stack pointer: C<dSP> +only gave us a local copy, not a reference to the global. + + 6 ENTER; + 7 call_method("PUSH", G_SCALAR|G_DISCARD); + 8 LEAVE; + +C<ENTER> and C<LEAVE> localise a block of code - they make sure that all +variables are tidied up, everything that has been localised gets +its previous value returned, and so on. Think of them as the C<{> and +C<}> of a Perl block. + +To actually do the magic method call, we have to call a subroutine in +Perl space: C<call_method> takes care of that, and it's described in +L<perlcall>. We call the C<PUSH> method in scalar context, and we're +going to discard its return value. + + 9 POPSTACK; + +Finally, we remove the value we placed on the mark stack, since we +don't need it any more. + +=item Save stack + +C doesn't have a concept of local scope, so perl provides one. 
We've +seen that C<ENTER> and C<LEAVE> are used as scoping braces; the save +stack implements the C equivalent of, for example: + + { + local $foo = 42; + ... + } + +See L<perlguts/Localising Changes> for how to use the save stack. + +=back + +=head2 Millions of Macros + +One thing you'll notice about the Perl source is that it's full of +macros. Some have called the pervasive use of macros the hardest thing +to understand, others find it adds to clarity. Let's take an example, +the code which implements the addition operator: + + 1 PP(pp_add) + 2 { + 3 dSP; dATARGET; tryAMAGICbin(add,opASSIGN); + 4 { + 5 dPOPTOPnnrl_ul; + 6 SETn( left + right ); + 7 RETURN; + 8 } + 9 } + +Every line here (apart from the braces, of course) contains a macro. The +first line sets up the function declaration as Perl expects for PP code; +line 3 sets up variable declarations for the argument stack and the +target, the return value of the operation. Finally, it tries to see if +the addition operation is overloaded; if so, the appropriate subroutine +is called. + +Line 5 is another variable declaration - all variable declarations start +with C<d> - which pops from the top of the argument stack two NVs (hence +C<nn>) and puts them into the variables C<right> and C<left>, hence the +C<rl>. These are the two operands to the addition operator. Next, we +call C<SETn> to set the NV of the return value to the result of adding +the two values. This done, we return - the C<RETURN> macro makes sure +that our return value is properly handled, and we pass the next operator +to run back to the main run loop. + +Most of these macros are explained in L<perlapi>, and some of the more +important ones are explained in L<perlxs> as well. Pay special attention +to L<perlguts/Background and PERL_IMPLICIT_CONTEXT> for information on +the C<[pad]THX_?> macros. 
+ + +=head2 Poking at Perl + +To really poke around with Perl, you'll probably want to build Perl for +debugging, like this: + + ./Configure -d -D optimize=-g + make + +C<-g> is a flag to the C compiler to have it produce debugging +information which will allow us to step through a running program. +F<Configure> will also turn on the C<DEBUGGING> compilation symbol which +enables all the internal debugging code in Perl. There are a whole bunch +of things you can debug with this: L<perlrun> lists them all, and the +best way to find out about them is to play about with them. The most +useful options are probably + + l Context (loop) stack processing + t Trace execution + o Method and overloading resolution + c String/numeric conversions + +Some of the functionality of the debugging code can be achieved using XS +modules. + + -Dr => use re 'debug' + -Dx => use O 'Debug' + +=head2 Using a source-level debugger + +If the debugging output of C<-D> doesn't help you, it's time to step +through perl's execution with a source-level debugger. + +=over 3 + +=item * + +We'll use C<gdb> for our examples here; the principles will apply to any +debugger, but check the manual of the one you're using. + +=back + +To fire up the debugger, type + + gdb ./perl + +You'll want to do that in your Perl source tree so the debugger can read +the source code. You should see the copyright message, followed by the +prompt. + + (gdb) + +C<help> will get you into the documentation, but here are the most +useful commands: + +=over 3 + +=item run [args] + +Run the program with the given arguments. + +=item break function_name + +=item break source.c:xxx + +Tells the debugger that we'll want to pause execution when we reach +either the named function (but see L<perlguts/Internal Functions>!) or the given +line in the named source file. + +=item step + +Steps through the program a line at a time. + +=item next + +Steps through the program a line at a time, without descending into +functions. 
+ +=item continue + +Run until the next breakpoint. + +=item finish + +Run until the end of the current function, then stop again. + +=item 'enter' + +Just pressing Enter will do the most recent operation again - it's a +blessing when stepping through miles of source code. + +=item print + +Execute the given C code and print its results. B<WARNING>: Perl makes +heavy use of macros, and F<gdb> is not aware of macros. You'll have to +substitute them yourself. So, for instance, you can't say + + print SvPV_nolen(sv) + +but you have to say + + print Perl_sv_2pv_nolen(sv) + +You may find it helpful to have a "macro dictionary", which you can +produce by saying C<cpp -dM perl.c | sort>. Even then, F<cpp> won't +recursively apply the macros for you. + +=back + +=head2 Dumping Perl Data Structures + +One way to get around this macro hell is to use the dumping functions in +F<dump.c>; these work a little like an internal +L<Devel::Peek|Devel::Peek>, but they also cover OPs and other structures +that you can't get at from Perl. Let's take an example. We'll use the +C<$a = $b + $c> we used before, but give it a bit of context: +C<$b = "6XXXX"; $c = 2.3;>. Where's a good place to stop and poke around? + +What about C<pp_add>, the function we examined earlier to implement the +C<+> operator: + + (gdb) break Perl_pp_add + Breakpoint 1 at 0x46249f: file pp_hot.c, line 309. + +Notice we use C<Perl_pp_add> and not C<pp_add> - see L<perlguts/Internal Functions>. 
+With the breakpoint in place, we can run our program: + + (gdb) run -e '$b = "6XXXX"; $c = 2.3; $a = $b + $c' + +Lots of junk will go past as gdb reads in the relevant source files and +libraries, and then: + + Breakpoint 1, Perl_pp_add () at pp_hot.c:309 + 309 dSP; dATARGET; tryAMAGICbin(add,opASSIGN); + (gdb) step + 311 dPOPTOPnnrl_ul; + (gdb) + +We looked at this bit of code before, and we said that C<dPOPTOPnnrl_ul> +arranges for two C<NV>s to be placed into C<left> and C<right> - let's +slightly expand it: + + #define dPOPTOPnnrl_ul NV right = POPn; \ + SV *leftsv = TOPs; \ + NV left = USE_LEFT(leftsv) ? SvNV(leftsv) : 0.0 + +C<POPn> takes the SV from the top of the stack and obtains its NV either +directly (if C<SvNOK> is set) or by calling the C<sv_2nv> function. +C<TOPs> takes the next SV from the top of the stack - yes, C<POPn> uses +C<TOPs> - but doesn't remove it. We then use C<SvNV> to get the NV from +C<leftsv> in the same way as before - yes, C<POPn> uses C<SvNV>. + +Since we don't have an NV for C<$b>, we'll have to use C<sv_2nv> to +convert it. If we step again, we'll find ourselves there: + + Perl_sv_2nv (sv=0xa0675d0) at sv.c:1669 + 1669 if (!sv) + (gdb) + +We can now use C<Perl_sv_dump> to investigate the SV: + + SV = PV(0xa057cc0) at 0xa0675d0 + REFCNT = 1 + FLAGS = (POK,pPOK) + PV = 0xa06a510 "6XXXX"\0 + CUR = 5 + LEN = 6 + $1 = void + +We know we're going to get C<6> from this, so let's finish the +subroutine: + + (gdb) finish + Run till exit from #0 Perl_sv_2nv (sv=0xa0675d0) at sv.c:1671 + 0x462669 in Perl_pp_add () at pp_hot.c:311 + 311 dPOPTOPnnrl_ul; + +We can also dump out this op: the current op is always stored in +C<PL_op>, and we can dump it with C<Perl_op_dump>. This'll give us +similar output to L<B::Debug|B::Debug>. 
+ + { + 13 TYPE = add ===> 14 + TARG = 1 + FLAGS = (SCALAR,KIDS) + { + TYPE = null ===> (12) + (was rv2sv) + FLAGS = (SCALAR,KIDS) + { + 11 TYPE = gvsv ===> 12 + FLAGS = (SCALAR) + GV = main::b + } + } + +< finish this later > + +=head2 Patching + +All right, we've now had a look at how to navigate the Perl sources and +some things you'll need to know when fiddling with them. Let's now get +on and create a simple patch. Here's something Larry suggested: if a +C<U> is the first active format during a C<pack>, (for example, +C<pack "U3C8", @stuff>) then the resulting string should be treated as +UTF8 encoded. + +How do we prepare to fix this up? First we locate the code in question - +the C<pack> happens at runtime, so it's going to be in one of the F<pp> +files. Sure enough, C<pp_pack> is in F<pp.c>. Since we're going to be +altering this file, let's copy it to F<pp.c~>. + +Now let's look over C<pp_pack>: we take a pattern into C<pat>, and then +loop over the pattern, taking each format character in turn into +C<datum_type>. Then for each possible format character, we swallow up +the other arguments in the pattern (a field width, an asterisk, and so +on) and convert the next chunk input into the specified format, adding +it onto the output SV C<cat>. + +How do we know if the C<U> is the first format in the C<pat>? Well, if +we have a pointer to the start of C<pat> then, if we see a C<U> we can +test whether we're still at the start of the string. 
So, here's where +C<pat> is set up: + + STRLEN fromlen; + register char *pat = SvPVx(*++MARK, fromlen); + register char *patend = pat + fromlen; + register I32 len; + I32 datumtype; + SV *fromstr; + +We'll have another string pointer in there: + + STRLEN fromlen; + register char *pat = SvPVx(*++MARK, fromlen); + register char *patend = pat + fromlen; + + char *patcopy; + register I32 len; + I32 datumtype; + SV *fromstr; + +And just before we start the loop, we'll set C<patcopy> to be the start +of C<pat>: + + items = SP - MARK; + MARK++; + sv_setpvn(cat, "", 0); + + patcopy = pat; + while (pat < patend) { + +Now if we see a C<U> which was at the start of the string, we turn on +the UTF8 flag for the output SV, C<cat>: + + + if (datumtype == 'U' && pat==patcopy+1) + + SvUTF8_on(cat); + if (datumtype == '#') { + while (pat < patend && *pat != '\n') + pat++; + +Remember that it has to be C<patcopy+1> because the first character of +the string is the C<U> which has been swallowed into C<datumtype>! + +Oops, we forgot one thing: what if there are spaces at the start of the +pattern? C<pack(" U*", @stuff)> will have C<U> as the first active +character, even though it's not the first thing in the pattern. In this +case, we have to advance C<patcopy> along with C<pat> when we see spaces: + + if (isSPACE(datumtype)) + continue; + +needs to become + + if (isSPACE(datumtype)) { + patcopy++; + continue; + } + +OK. That's the C part done. Now we must do two additional things before +this patch is ready to go: we've changed the behaviour of Perl, and so +we must document that change. We must also provide some more regression +tests to make sure our patch works and doesn't create a bug somewhere +else along the line. + +The regression tests for each operator live in F<t/op/>, and so we make +a copy of F<t/op/pack.t> to F<t/op/pack.t~>. Now we can add our tests +to the end.
First, we'll test that the C<U> does indeed create Unicode +strings: + + print 'not ' unless "1.20.300.4000" eq sprintf "%vd", pack("U*",1,20,300,4000); + print "ok $test\n"; $test++; + +Now we'll test that we got that space-at-the-beginning business right: + + print 'not ' unless "1.20.300.4000" eq + sprintf "%vd", pack(" U*",1,20,300,4000); + print "ok $test\n"; $test++; + +And finally we'll test that we don't make Unicode strings if C<U> is B<not> +the first active format: + + print 'not ' unless v1.20.300.4000 ne + sprintf "%vd", pack("C0U*",1,20,300,4000); + print "ok $test\n"; $test++; + +Mustn't forget to change the number of tests which appears at the top, or +else the automated tester will get confused: + + -print "1..156\n"; + +print "1..159\n"; + +We now compile up Perl, and run it through the test suite. Our new +tests pass, hooray! + +Finally, the documentation. The job is never done until the paperwork is +over, so let's describe the change we've just made. The relevant place +is F<pod/perlfunc.pod>; again, we make a copy, and then we'll insert +this text in the description of C<pack>: + + =item * + + If the pattern begins with a C<U>, the resulting string will be treated + as Unicode-encoded. You can force UTF8 encoding on in a string with an + initial C<U0>, and the bytes that follow will be interpreted as Unicode + characters. If you don't want this to happen, you can begin your pattern + with C<C0> (or anything else) to force Perl not to UTF8 encode your + string, and then follow this with a C<U*> somewhere in your pattern. + +All done. Now let's create the patch. 
F<Porting/patching.pod> tells us +that if we're making major changes, we should copy the entire directory +to somewhere safe before we begin fiddling, and then do + + diff -ruN old new > patch + +However, we know which files we've changed, and we can simply do this: + + diff -u pp.c~ pp.c > patch + diff -u t/op/pack.t~ t/op/pack.t >> patch + diff -u pod/perlfunc.pod~ pod/perlfunc.pod >> patch + +We end up with a patch looking a little like this: + + --- pp.c~ Fri Jun 02 04:34:10 2000 + +++ pp.c Fri Jun 16 11:37:25 2000 + @@ -4375,6 +4375,7 @@ + register I32 items; + STRLEN fromlen; + register char *pat = SvPVx(*++MARK, fromlen); + + char *patcopy; + register char *patend = pat + fromlen; + register I32 len; + I32 datumtype; + @@ -4405,6 +4406,7 @@ + ... + +And finally, we submit it, with our rationale, to perl5-porters. Job +done! + +=head1 EXTERNAL TOOLS FOR DEBUGGING PERL + +Sometimes it helps to use external tools while debugging and +testing Perl. This section tries to guide you through using +some common testing and debugging tools with Perl. This is +meant as a guide to interfacing these tools with Perl, not +as any kind of guide to the use of the tools themselves. + +=head2 Rational Software's Purify + +Purify is a commercial tool that is helpful in identifying +memory overruns, wild pointers, memory leaks and other such +badness. Perl must be compiled in a specific way for +optimal testing with Purify. Purify is available under +Windows NT, Solaris, HP-UX, SGI, and Siemens Unix. + +The only currently known leaks happen when there are +compile-time errors within eval or require. (Fixing these +is non-trivial, unfortunately, but they must be fixed +eventually.) + +=head2 Purify on Unix + +On Unix, Purify creates a new Perl binary. 
To get the most +benefit out of Purify, you should create the perl to Purify +using: + + sh Configure -Accflags=-DPURIFY -Doptimize='-g' \ + -Uusemymalloc -Dusemultiplicity + +where these arguments mean: + +=over 4 + +=item -Accflags=-DPURIFY + +Disables Perl's arena memory allocation functions, as well as +forcing use of memory allocation functions derived from the +system malloc. + +=item -Doptimize='-g' + +Adds debugging information so that you see the exact source +statements where the problem occurs. Without this flag, all +you will see is the source filename of where the error occurred. + +=item -Uusemymalloc + +Disable Perl's malloc so that Purify can more closely monitor +allocations and leaks. Using Perl's malloc will make Purify +report most leaks in the "potential" leaks category. + +=item -Dusemultiplicity + +Enabling the multiplicity option allows perl to clean up +thoroughly when the interpreter shuts down, which reduces the +number of bogus leak reports from Purify. + +=back + +Once you've compiled a perl suitable for Purify'ing, then you +can just: + + make pureperl + +which creates a binary named 'pureperl' that has been Purify'ed. +This binary is used in place of the standard 'perl' binary +when you want to debug Perl memory problems. + +As an example, to show any memory leaks produced during the +standard Perl testset you would create and run the Purify'ed +perl as: + + make pureperl + cd t + ../pureperl -I../lib harness + +which would run Perl on test.pl and report any memory problems. + +Purify outputs messages in "Viewer" windows by default. 
If +you don't have a windowing environment or if you simply +want the Purify output to unobtrusively go to a log file +instead of to the interactive window, use the following +options to output to the log file "perl.log": + + setenv PURIFYOPTIONS "-chain-length=25 -windows=no \ + -log-file=perl.log -append-logfile=yes" + +If you plan to use the "Viewer" windows, then you only need this option: + + setenv PURIFYOPTIONS "-chain-length=25" + +=head2 Purify on NT + +Purify on Windows NT instruments the Perl binary 'perl.exe' +on the fly. There are several options in the makefile you +should change to get the most use out of Purify: + +=over 4 + +=item DEFINES + +You should add -DPURIFY to the DEFINES line so the DEFINES +line looks something like: + + DEFINES = -DWIN32 -D_CONSOLE -DNO_STRICT $(CRYPT_FLAG) -DPURIFY=1 + +to disable Perl's arena memory allocation functions, as +well as to force use of memory allocation functions derived +from the system malloc. + +=item USE_MULTI = define + +Enabling the multiplicity option allows perl to clean up +thoroughly when the interpreter shuts down, which reduces the +number of bogus leak reports from Purify. + +=item #PERL_MALLOC = define + +Disable Perl's malloc so that Purify can more closely monitor +allocations and leaks. Using Perl's malloc will make Purify +report most leaks in the "potential" leaks category. + +=item CFG = Debug + +Adds debugging information so that you see the exact source +statements where the problem occurs. Without this flag, all +you will see is the source filename of where the error occurred. + +=back + +As an example, to show any memory leaks produced during the +standard Perl testset you would create and run Purify as: + + cd win32 + make + cd ../t + purify ../perl -I../lib harness + +which would instrument Perl in memory, run Perl on test.pl, +then finally report any memory problems.
+ +=head2 CONCLUSION + +We've had a brief look around the Perl source, an overview of the stages +F<perl> goes through when it's running your code, and how to use a +debugger to poke at the Perl guts. We took a very simple problem and +demonstrated how to solve it fully - with documentation, regression +tests, and finally a patch for submission to p5p. Finally, we talked +about how to use external tools to debug and test Perl. + +I'd now suggest you read over those references again, and then, as soon +as possible, get your hands dirty. The best way to learn is by doing, +so: + +=over 3 + +=item * + +Subscribe to perl5-porters, follow the patches and try and understand +them; don't be afraid to ask if there's a portion you're not clear on - +who knows, you may unearth a bug in the patch... + +=item * + +Keep up to date with the bleeding edge Perl distributions and get +familiar with the changes. Try and get an idea of what areas people are +working on and the changes they're making. + +=item * + +Do read the README associated with your operating system, e.g. README.aix +on the IBM AIX OS. Don't hesitate to supply patches to that README if +you find anything missing or changed over a new OS release. + +=item * + +Find an area of Perl that seems interesting to you, and see if you can +work out how it works. Scan through the source, and step over it in the +debugger. Play, poke, investigate, fiddle! You'll probably get to +understand not just your chosen area but a much wider range of F<perl>'s +activity as well, and probably sooner than you'd think. + +=back + +=over 3 + +=item I<The Road goes ever on and on, down from the door where it began.> + +=back + +If you can do these things, you've started on the long road to Perl porting. +Thanks for wanting to help make Perl better - and happy hacking! 
+ =head1 AUTHOR This document was written by Nathan Torkington, and is maintained by diff --git a/gnu/usr.bin/perl/pod/perllexwarn.pod b/gnu/usr.bin/perl/pod/perllexwarn.pod index cee16875377..951a470b2e5 100644 --- a/gnu/usr.bin/perl/pod/perllexwarn.pod +++ b/gnu/usr.bin/perl/pod/perllexwarn.pod @@ -9,7 +9,7 @@ flag B<-w> and the equivalent Perl variable, C<$^W>. The pragma works just like the existing "strict" pragma. This means that the scope of the warning pragma is limited to the -enclosing block. It also means that that the pragma setting will not +enclosing block. It also means that the pragma setting will not leak across files (via C<use>, C<require> or C<do>). This allows authors to independently define the degree of warning checks that will be applied to their module. @@ -30,18 +30,17 @@ Similarly all warnings are disabled in a block by either of these: For example, consider the code below: use warnings ; - my $a ; - my $b ; + my @a ; { no warnings ; - $b = 2 if $a EQ 3 ; + my $b = @a[0] ; } - $b = 1 if $a NE 3 ; + my $c = @a[0]; The code in the enclosing block has warnings enabled, but the inner -block has them disabled. In this case that means that the use of the C<EQ> -operator won't trip a C<"Use of EQ is deprecated"> warning, but the use of -C<NE> will produce a C<"Use of NE is deprecated"> warning. +block has them disabled. In this case that means the assignment to the +scalar C<$c> will trip the C<"Scalar value @a[0] better written as $a[0]"> +warning, but the assignment to the scalar C<$b> will not. =head2 Default Warnings and Optional Warnings @@ -100,7 +99,7 @@ disable compile-time warnings you need to rewrite the code like this: my $b ; chop $b ; } -The other big problem with C<$^W> is that way you can inadvertently +The other big problem with C<$^W> is the way you can inadvertently change the warning setting in unexpected places in your code. 
For example, when the code below is run (without the B<-w> flag), the second call to C<doit> will trip a C<"Use of uninitialized value"> warning, whereas @@ -195,7 +194,7 @@ or B<-X> command line flags. =back -The combined effect of 3 & 4 is that it will will allow code which uses +The combined effect of 3 & 4 is that it will allow code which uses the C<warnings> pragma to control the warning behavior of $^W-type code (using a C<local $^W=0>) if it really wants to, but not vice-versa. @@ -321,27 +320,38 @@ L<perldiag>. The presence of the word "FATAL" in the category list will escalate any warnings detected from the categories specified in the lexical scope -into fatal errors. In the code below, there are 3 places where a -deprecated warning will be detected, the middle one will produce a -fatal error. - +into fatal errors. In the code below, the use of C<time>, C<length> +and C<join> can all produce a C<"Useless use of xxx in void context"> +warning. use warnings ; - $a = 1 if $a EQ $b ; + time ; { - use warnings FATAL => qw(deprecated) ; - $a = 1 if $a EQ $b ; + use warnings FATAL => qw(void) ; + length "abc" ; } - $a = 1 if $a EQ $b ; + join "", 1,2,3 ; + + print "done\n" ; + +When run it produces this output + + Useless use of time in void context at fatal line 3. + Useless use of length in void context at fatal line 7. + +The scope where C<length> is used has escalated the C<void> warnings +category into a fatal error, so the program terminates immediately it +encounters the warning. + =head2 Reporting Warnings from a Module The C<warnings> pragma provides a number of functions that are useful for module authors. These are used when you want to report a module-specific -warning when the calling module has enabled warnings via the C<warnings> +warning to a calling module that has enabled warnings via the C<warnings> pragma. Consider the module C<MyMod::Abc> below. @@ -361,11 +371,11 @@ Consider the module C<MyMod::Abc> below.
1 ; The call to C<warnings::register> will create a new warnings category -called "MyMod::abc", i.e. the new category name matches the module -name. The C<open> function in the module will display a warning message -if it gets given a relative path as a parameter. This warnings will only -be displayed if the code that uses C<MyMod::Abc> has actually enabled -them with the C<warnings> pragma like below. +called "MyMod::abc", i.e. the new category name matches the current +package name. The C<open> function in the module will display a warning +message if it gets given a relative path as a parameter. This warnings +will only be displayed if the code that uses C<MyMod::Abc> has actually +enabled them with the C<warnings> pragma like below. use MyMod::Abc; use warnings 'MyMod::Abc'; @@ -379,10 +389,8 @@ this snippet of code: package MyMod::Abc; sub open { - if (warnings::enabled("deprecated")) { - warnings::warn("deprecated", - "open is deprecated, use new instead") ; - } + warnings::warnif("deprecated", + "open is deprecated, use new instead") ; new(@_) ; } @@ -399,18 +407,89 @@ display a warning message whenever the calling module has (at least) the ... MyMod::Abc::open($filename) ; -The C<warnings::warn> function should be used to actually display the -warnings message. This is because they can make use of the feature that -allows warnings to be escalated into fatal errors. So in this case +Either the C<warnings::warn> or C<warnings::warnif> function should be +used to actually display the warnings message. This is because they can +make use of the feature that allows warnings to be escalated into fatal +errors. So in this case use MyMod::Abc; use warnings FATAL => 'MyMod::Abc'; ... MyMod::Abc::open('../fred.txt'); -the C<warnings::warn> function will detect this and die after +the C<warnings::warnif> function will detect this and die after displaying the warning message. 
+The three warnings functions, C<warnings::warn>, C<warnings::warnif> +and C<warnings::enabled> can optionally take an object reference in place +of a category name. In this case the functions will use the class name +of the object as the warnings category. + +Consider this example: + + package Original ; + + no warnings ; + use warnings::register ; + + sub new + { + my $class = shift ; + bless [], $class ; + } + + sub check + { + my $self = shift ; + my $value = shift ; + + if ($value % 2 && warnings::enabled($self)) + { warnings::warn($self, "Odd numbers are unsafe") } + } + + sub doit + { + my $self = shift ; + my $value = shift ; + $self->check($value) ; + # ... + } + + 1 ; + + package Derived ; + + use warnings::register ; + use Original ; + our @ISA = qw( Original ) ; + sub new + { + my $class = shift ; + bless [], $class ; + } + + + 1 ; + +The code below makes use of both modules, but it only enables warnings from +C<Derived>. + + use Original ; + use Derived ; + use warnings 'Derived'; + my $a = new Original ; + $a->doit(1) ; + my $b = new Derived ; + $b->doit(1) ; + +When this code is run only the C<Derived> object, C<$b>, will generate +a warning. + + Odd numbers are unsafe at main.pl line 7 + +Notice also that the warning is reported at the line where the object is first +used. + =head1 TODO perl5db.pl @@ -424,6 +503,8 @@ displaying the warning message. around the limitations of C<$^W>. Now that those limitations are gone, the module should be revisited. + document calling the warnings::* functions from XS + =head1 SEE ALSO L<warnings>, L<perldiag>.
diff --git a/gnu/usr.bin/perl/pod/perlmodlib.PL b/gnu/usr.bin/perl/pod/perlmodlib.PL new file mode 100644 index 00000000000..0cdadb76c79 --- /dev/null +++ b/gnu/usr.bin/perl/pod/perlmodlib.PL @@ -0,0 +1,1383 @@ +#!../miniperl + +open (OUT, ">perlmodlib.tmp") or die $!; +my (@pragma, @mod); +open (MANIFEST, "../MANIFEST") or die $!; + +while (<MANIFEST>) { + my $filename; + next unless s|^lib/|| or m|^ext/|; + ($filename) = /(\S+)/; + $filename =~ s|^[^/]+/|| if $filename =~ s|^ext/||; + next unless $filename =~ /\.p(m|od)$/; + next unless open (MOD, "../lib/$filename"); + + my ($name, $thing); + my $foundit=0; + { + local $/=""; + while (<MOD>) { + next unless /^=head1 NAME/; + $foundit++; + last; + } + } + unless ($foundit) { + warn "$filename missing head1\n"; + next; + } + my $title = <MOD>; + chomp($title); + close MOD; + + my $perlname = $filename; + $perlname =~ s!\.p(m|od)$!!; + $perlname =~ s!/!::!g; + + ($name, $thing) = split / --? /, $title, 2; + + unless ($name and $thing) { + warn "$filename missing name\n" unless $name; + warn "$filename missing thing\n" unless $thing; + next; + } + + $thing =~ s/^perl pragma to //i; + $thing = ucfirst($thing); + $title = "=item $perlname\n\n$thing\n\n"; + + # print "$perlname $thing\n"; + + if ($filename=~/[A-Z]/) { + push @mod, $title; + } else { + push @pragma, $title; + } +} + +print OUT <<'EOF'; +# Generated by perlmodlib.PL DO NOT EDIT! + +=head1 NAME + +perlmodlib - constructing new Perl modules and finding existing ones + +=head1 DESCRIPTION + +=head1 THE PERL MODULE LIBRARY + +Many modules are included in the Perl distribution. These are described +below, and all end in F<.pm>. You may discover compiled library +files (usually ending in F<.so>) or small pieces of modules to be +autoloaded (ending in F<.al>); these were automatically generated +by the installation process. You may also discover files in the +library directory that end in either F<.pl> or F<.ph>.
These are +old libraries supplied so that old programs that use them still +run. The F<.pl> files will all eventually be converted into standard +modules, and the F<.ph> files made by B<h2ph> will probably end up +as extension modules made by B<h2xs>. (Some F<.ph> values may +already be available through the POSIX, Errno, or Fcntl modules.) +The B<pl2pm> file in the distribution may help in your conversion, +but it's just a mechanical process and therefore far from bulletproof. + +=head2 Pragmatic Modules + +They work somewhat like compiler directives (pragmata) in that they +tend to affect the compilation of your program, and thus will usually +work well only when used within a C<use>, or C<no>. Most of these +are lexically scoped, so an inner BLOCK may countermand them +by saying: + + no integer; + no strict 'refs'; + no warnings; + +which lasts until the end of that BLOCK. + +Some pragmas are lexically scoped--typically those that affect the +C<$^H> hints variable. Others affect the current package instead, +like C<use vars> and C<use subs>, which allow you to predeclare +variables or subroutines within a particular I<file> rather than +just a block. Such declarations are effective for the entire file +for which they were declared. You cannot rescind them with C<no +vars> or C<no subs>. + +The following pragmas are defined (and have their own documentation). + +=over 12 + +EOF + +print OUT $_ for (sort @pragma); + +print OUT <<EOF; +=back + +=head2 Standard Modules + +Standard, bundled modules are all expected to behave in a well-defined +manner with respect to namespace pollution because they use the +Exporter module. See their own documentation for details.
+ +=over 12 + +EOF + +print OUT $_ for (sort @mod); + +print OUT <<'EOF'; +=back + +To find out I<all> modules installed on your system, including +those without documentation or outside the standard release, +just do this: + + % find `perl -e 'print "@INC"'` -name '*.pm' -print + +They should all have their own documentation installed and accessible +via your system man(1) command. If you do not have a B<find> +program, you can use the Perl B<find2perl> program instead, which +generates Perl code as output you can run through perl. If you +have a B<man> program but it doesn't find your modules, you'll have +to fix your manpath. See L<perl> for details. If you have no +system B<man> command, you might try the B<perldoc> program. + +=head2 Extension Modules + +Extension modules are written in C (or a mix of Perl and C). They +are usually dynamically loaded into Perl if and when you need them, +but may also be linked in statically. Supported extension modules +include Socket, Fcntl, and POSIX. + +Many popular C extension modules do not come bundled (at least, not +completely) due to their sizes, volatility, or simply lack of time +for adequate testing and configuration across the multitude of +platforms on which Perl was beta-tested. You are encouraged to +look for them on CPAN (described below), or using web search engines +like Alta Vista or Deja News. + +=head1 CPAN + +CPAN stands for Comprehensive Perl Archive Network; it's a globally +replicated trove of Perl materials, including documentation, style +guides, tricks and traps, alternate ports to non-Unix systems and +occasional binary distributions for these. Search engines for +CPAN can be found at http://cpan.perl.com/ and at +http://theory.uwinnipeg.ca/mod_perl/cpan-search.pl . + +Most importantly, CPAN includes around a thousand unbundled modules, +some of which require a C compiler to build.
Major categories of +modules are: + +=over + +=item * + +Language Extensions and Documentation Tools + +=item * + +Development Support + +=item * + +Operating System Interfaces + +=item * + +Networking, Device Control (modems) and InterProcess Communication + +=item * + +Data Types and Data Type Utilities + +=item * + +Database Interfaces + +=item * + +User Interfaces + +=item * + +Interfaces to / Emulations of Other Programming Languages + +=item * + +File Names, File Systems and File Locking (see also File Handles) + +=item * + +String Processing, Language Text Processing, Parsing, and Searching + +=item * + +Option, Argument, Parameter, and Configuration File Processing + +=item * + +Internationalization and Locale + +=item * + +Authentication, Security, and Encryption + +=item * + +World Wide Web, HTML, HTTP, CGI, MIME + +=item * + +Server and Daemon Utilities + +=item * + +Archiving and Compression + +=item * + +Images, Pixmap and Bitmap Manipulation, Drawing, and Graphing + +=item * + +Mail and Usenet News + +=item * + +Control Flow Utilities (callbacks and exceptions etc) + +=item * + +File Handle and Input/Output Stream Utilities + +=item * + +Miscellaneous Modules + +=back + +Registered CPAN sites as of this writing include the following. 
+You should try to choose one close to you: + +=head2 Africa + +=over 4 + +=item * + +South Africa + + ftp://ftp.is.co.za/programming/perl/CPAN/ + ftp://ftp.saix.net/pub/CPAN/ + ftp://ftpza.co.za/pub/mirrors/cpan/ + ftp://ftp.sun.ac.za/CPAN/ + +=back + +=head2 Asia + +=over 4 + +=item * + +China + + ftp://freesoft.cei.gov.cn/pub/languages/perl/CPAN/ + http://www2.linuxforum.net/mirror/CPAN/ + http://cpan.shellhung.org/ + ftp://ftp.shellhung.org/pub/CPAN + +=item * + +Hong Kong + + http://CPAN.pacific.net.hk/ + ftp://ftp.pacific.net.hk/pub/mirror/CPAN/ + +=item * + +Indonesia + + http://piksi.itb.ac.id/CPAN/ + ftp://mirrors.piksi.itb.ac.id/CPAN/ + http://CPAN.mweb.co.id/ + ftp://ftp.mweb.co.id/pub/languages/perl/CPAN/ + +=item * + +Israel + + http://www.iglu.org.il:/pub/CPAN/ + ftp://ftp.iglu.org.il/pub/CPAN/ + http://bioinfo.weizmann.ac.il/pub/software/perl/CPAN/ + ftp://bioinfo.weizmann.ac.il/pub/software/perl/CPAN/ + +=item * + +Japan + + ftp://ftp.u-aizu.ac.jp/pub/lang/perl/CPAN/ + ftp://ftp.kddlabs.co.jp/CPAN/ + http://mirror.nucba.ac.jp/mirror/Perl/ + ftp://mirror.nucba.ac.jp/mirror/Perl/ + ftp://ftp.meisei-u.ac.jp/pub/CPAN/ + ftp://ftp.jaist.ac.jp/pub/lang/perl/CPAN/ + ftp://ftp.dti.ad.jp/pub/lang/CPAN/ + ftp://ftp.ring.gr.jp/pub/lang/perl/CPAN/ + +=item * + +Saudi Arabia + + ftp://ftp.isu.net.sa/pub/CPAN/ + +=item * + +Singapore + + http://cpan.hjc.edu.sg + http://ftp.nus.edu.sg/unix/perl/CPAN/ + ftp://ftp.nus.edu.sg/pub/unix/perl/CPAN/ + +=item * + +South Korea + + http://CPAN.bora.net/ + ftp://ftp.bora.net/pub/CPAN/ + http://ftp.kornet.net/CPAN/ + ftp://ftp.kornet.net/pub/CPAN/ + ftp://ftp.nuri.net/pub/CPAN/ + +=item * + +Taiwan + + ftp://coda.nctu.edu.tw/UNIX/perl/CPAN + ftp://ftp.ee.ncku.edu.tw/pub/perl/CPAN/ + ftp://ftp1.sinica.edu.tw/pub1/perl/CPAN/ + +=item * + +Thailand + + http://download.nectec.or.th/CPAN/ + ftp://ftp.nectec.or.th/pub/languages/CPAN/ + ftp://ftp.cs.riubon.ac.th/pub/mirrors/CPAN/ + +=back + +=head2 Central America + +=over 4 + 
+=item * + +Costa Rica + + ftp://ftp.linux.co.cr/mirrors/CPAN/ + http://ftp.ucr.ac.cr/Unix/CPAN/ + ftp://ftp.ucr.ac.cr/pub/Unix/CPAN/ + +=back + +=head2 Europe + +=over 4 + +=item * + +Austria + + ftp://ftp.tuwien.ac.at/pub/languages/perl/CPAN/ + +=item * + +Belgium + + http://ftp.easynet.be/CPAN/ + ftp://ftp.easynet.be/CPAN/ + ftp://ftp.kulnet.kuleuven.ac.be/pub/mirror/CPAN/ + +=item * + +Bulgaria + + ftp://ftp.ntrl.net/pub/mirrors/CPAN/ + +=item * + +Croatia + + ftp://ftp.linux.hr/pub/CPAN/ + +=item * + +Czech Republic + + http://www.fi.muni.cz/pub/perl/ + ftp://ftp.fi.muni.cz/pub/perl/ + ftp://sunsite.mff.cuni.cz/MIRRORS/ftp.funet.fi/pub/languages/perl/CPAN/ + +=item * + +Denmark + + ftp://sunsite.auc.dk/pub/languages/perl/CPAN/ + http://www.cpan.dk/CPAN/ + ftp://www.cpan.dk/ftp.cpan.org/CPAN/ + +=item * + +England + + http://www.mirror.ac.uk/sites/ftp.funet.fi/pub/languages/perl/CPAN + ftp://ftp.mirror.ac.uk/sites/ftp.funet.fi/pub/languages/perl/CPAN/ + ftp://ftp.demon.co.uk/pub/mirrors/perl/CPAN/ + ftp://ftp.flirble.org/pub/languages/perl/CPAN/ + ftp://ftp.plig.org/pub/CPAN/ + ftp://sunsite.doc.ic.ac.uk/packages/CPAN/ + http://mirror.uklinux.net/CPAN/ + ftp://mirror.uklinux.net/pub/CPAN/ + ftp://usit.shef.ac.uk/pub/packages/CPAN/ + +=item * + +Estonia + + ftp://ftp.ut.ee/pub/languages/perl/CPAN/ + +=item * + +Finland + + ftp://ftp.funet.fi/pub/languages/perl/CPAN/ + +=item * + +France + + ftp://cpan.ftp.worldonline.fr/pub/CPAN/ + ftp://ftp.club-internet.fr/pub/perl/CPAN/ + ftp://ftp.lip6.fr/pub/perl/CPAN/ + ftp://ftp.oleane.net/pub/mirrors/CPAN/ + ftp://ftp.pasteur.fr/pub/computing/CPAN/ + ftp://cpan.cict.fr/pub/CPAN/ + ftp://ftp.uvsq.fr/pub/perl/CPAN/ + +=item * + +Germany + + ftp://ftp.rz.ruhr-uni-bochum.de/pub/CPAN/ + ftp://ftp.freenet.de/pub/ftp.cpan.org/pub/CPAN/ + ftp://ftp.uni-erlangen.de/pub/source/CPAN/ + ftp://ftp-stud.fht-esslingen.de/pub/Mirrors/CPAN + ftp://ftp.gigabell.net/pub/CPAN/ + http://ftp.gwdg.de/pub/languages/perl/CPAN/ + 
ftp://ftp.gwdg.de/pub/languages/perl/CPAN/ + ftp://ftp.uni-hamburg.de/pub/soft/lang/perl/CPAN/ + ftp://ftp.leo.org/pub/comp/general/programming/languages/script/perl/CPAN/ + ftp://ftp.mpi-sb.mpg.de/pub/perl/CPAN/ + ftp://ftp.gmd.de/mirrors/CPAN/ + +=item * + +Greece + + ftp://ftp.forthnet.gr/pub/languages/perl/CPAN + ftp://ftp.ntua.gr/pub/lang/perl/ + +=item * + +Hungary + + http://cpan.artifact.hu/ + ftp://cpan.artifact.hu/CPAN/ + ftp://ftp.kfki.hu/pub/packages/perl/CPAN/ + +=item * + +Iceland + + http://cpan.gm.is/ + ftp://ftp.gm.is/pub/CPAN/ + +=item * + +Ireland + + http://cpan.indigo.ie/ + ftp://cpan.indigo.ie/pub/CPAN/ + http://sunsite.compapp.dcu.ie/pub/perl/ + ftp://sunsite.compapp.dcu.ie/pub/perl/ + +=item * + +Italy + + http://cpan.nettuno.it/ + http://gusp.dyndns.org/CPAN/ + ftp://gusp.dyndns.org/pub/CPAN + http://softcity.iol.it/cpan + ftp://softcity.iol.it/pub/cpan + ftp://ftp.unina.it/pub/Other/CPAN/ + ftp://ftp.unipi.it/pub/mirror/perl/CPAN/ + ftp://cis.uniRoma2.it/CPAN/ + ftp://ftp.edisontel.it/pub/CPAN_Mirror/ + ftp://ftp.flashnet.it/pub/CPAN/ + +=item * + +Latvia + + http://kvin.lv/pub/CPAN/ + +=item * + +Netherlands + + ftp://download.xs4all.nl/pub/mirror/CPAN/ + ftp://ftp.nl.uu.net/pub/CPAN/ + ftp://ftp.nluug.nl/pub/languages/perl/CPAN/ + ftp://ftp.cpan.nl/pub/CPAN/ + http://www.cs.uu.nl/mirror/CPAN/ + ftp://ftp.cs.uu.nl/mirror/CPAN/ + +=item * + +Norway + + ftp://sunsite.uio.no/pub/languages/perl/CPAN/ + ftp://ftp.uit.no/pub/languages/perl/cpan/ + +=item * + +Poland + + ftp://ftp.pk.edu.pl/pub/lang/perl/CPAN/ + ftp://ftp.mega.net.pl/pub/mirrors/ftp.perl.com/ + ftp://ftp.man.torun.pl/pub/doc/CPAN/ + ftp://sunsite.icm.edu.pl/pub/CPAN/ + +=item * + +Portugal + + ftp://ftp.ua.pt/pub/CPAN/ + ftp://perl.di.uminho.pt/pub/CPAN/ + ftp://ftp.ist.utl.pt/pub/CPAN/ + ftp://ftp.netc.pt/pub/CPAN/ + +=item * + +Romania + + ftp://archive.logicnet.ro/mirrors/ftp.cpan.org/CPAN/ + ftp://ftp.kappa.ro/pub/mirrors/ftp.perl.org/pub/CPAN/ + ftp://ftp.dntis.ro/pub/cpan/ 
+ ftp://ftp.opsynet.com/cpan/ + ftp://ftp.dnttm.ro/pub/CPAN/ + ftp://ftp.timisoara.roedu.net/mirrors/CPAN/ + +=item * + +Russia + + ftp://ftp.chg.ru/pub/lang/perl/CPAN/ + http://cpan.rinet.ru/ + ftp://cpan.rinet.ru/pub/mirror/CPAN/ + ftp://ftp.aha.ru/pub/CPAN/ + ftp://ftp.sai.msu.su/pub/lang/perl/CPAN/ + +=item * + +Slovakia + + ftp://ftp.entry.sk/pub/languages/perl/CPAN/ + +=item * + +Slovenia + + ftp://ftp.arnes.si/software/perl/CPAN/ + +=item * + +Spain + + ftp://ftp.rediris.es/mirror/CPAN/ + ftp://ftp.etse.urv.es/pub/perl/ + +=item * + +Sweden + + http://ftp.du.se/CPAN/ + ftp://ftp.du.se/pub/CPAN/ + ftp://ftp.sunet.se/pub/lang/perl/CPAN/ + +=item * + +Switzerland + + ftp://ftp.danyk.ch/CPAN/ + ftp://sunsite.cnlab-switch.ch/mirror/CPAN/ + +=item * + +Turkey + + ftp://sunsite.bilkent.edu.tr/pub/languages/CPAN/ + +=back + +=head2 North America + +=over 4 + +=item * + +Canada + +=over 8 + +=item * + +Alberta + + http://sunsite.ualberta.ca/pub/Mirror/CPAN/ + ftp://sunsite.ualberta.ca/pub/Mirror/CPAN/ + +=item * + +Manitoba + + http://theoryx5.uwinnipeg.ca/pub/CPAN/ + ftp://theoryx5.uwinnipeg.ca/pub/CPAN/ + +=item * + +Nova Scotia + + ftp://cpan.chebucto.ns.ca/pub/CPAN/ + +=item * + +Ontario + + ftp://ftp.crc.ca/pub/packages/lang/perl/CPAN/ + +=item * + +Mexico + + http://www.msg.com.mx/CPAN/ + ftp://ftp.msg.com.mx/pub/CPAN/ + +=back + +=item * + +United States + +=over 8 + +=item * + +Alabama + + http://mirror.hiwaay.net/CPAN/ + ftp://mirror.hiwaay.net/CPAN/ + +=item * + +California + + http://www.cpan.org/ + ftp://ftp.cpan.org/CPAN/ + ftp://cpan.nas.nasa.gov/pub/perl/CPAN/ + ftp://ftp.digital.com/pub/plan/perl/CPAN/ + http://www.kernel.org/pub/mirrors/cpan/ + ftp://ftp.kernel.org/pub/mirrors/cpan/ + http://www.perl.com/CPAN/ + http://download.sourceforge.net/mirrors/CPAN/ + +=item * + +Colorado + + ftp://ftp.cs.colorado.edu/pub/perl/CPAN/ + +=item * + +Florida + + ftp://ftp.cise.ufl.edu/pub/perl/CPAN/ + +=item * + +Georgia + + ftp://ftp.twoguys.org/CPAN/ + +=item * 
+ +Illinois + + http://www.neurogames.com/mirrors/CPAN + http://uiarchive.uiuc.edu/mirrors/ftp/ftp.cpan.org/pub/CPAN/ + ftp://uiarchive.uiuc.edu/mirrors/ftp/ftp.cpan.org/pub/CPAN/ + +=item * + +Indiana + + ftp://ftp.uwsg.indiana.edu/pub/perl/CPAN/ + http://cpan.nitco.com/ + ftp://cpan.nitco.com/pub/CPAN/ + ftp://cpan.in-span.net/ + http://csociety-ftp.ecn.purdue.edu/pub/CPAN + ftp://csociety-ftp.ecn.purdue.edu/pub/CPAN + +=item * + +Kentucky + + http://cpan.uky.edu/ + ftp://cpan.uky.edu/pub/CPAN/ + +=item * + +Massachusetts + + ftp://ftp.ccs.neu.edu/net/mirrors/ftp.funet.fi/pub/languages/perl/CPAN/ + ftp://ftp.iguide.com/pub/mirrors/packages/perl/CPAN/ + +=item * + +New Jersey + + ftp://ftp.cpanel.net/pub/CPAN/ + +=item * + +New York + + ftp://ftp.freesoftware.com/pub/perl/CPAN/ + http://www.deao.net/mirrors/CPAN/ + ftp://ftp.deao.net/pub/CPAN/ + ftp://ftp.stealth.net/pub/mirrors/ftp.cpan.org/pub/CPAN/ + http://mirror.nyc.anidea.com/CPAN/ + ftp://mirror.nyc.anidea.com/pub/CPAN/ + http://www.rge.com/pub/languages/perl/ + ftp://ftp.rge.com/pub/languages/perl/ + ftp://mirrors.cloud9.net/pub/mirrors/CPAN/ + +=item * + +North Carolina + + ftp://ftp.duke.edu/pub/perl/ + +=item * + +Ohio + + ftp://ftp.loaded.net/pub/CPAN/ + +=item * + +Oklahoma + + ftp://ftp.ou.edu/mirrors/CPAN/ + +=item * + +Oregon + + ftp://ftp.orst.edu/pub/packages/CPAN/ + +=item * + +Pennsylvania + + http://ftp.epix.net/CPAN/ + ftp://ftp.epix.net/pub/languages/perl/ + ftp://carroll.cac.psu.edu/pub/CPAN/ + +=item * + +Tennessee + + ftp://ftp.sunsite.utk.edu/pub/CPAN/ + +=item * + +Texas + + http://ftp.sedl.org/pub/mirrors/CPAN/ + http://jhcloos.com/pub/mirror/CPAN/ + ftp://jhcloos.com/pub/mirror/CPAN/ + +=item * + +Utah + + ftp://mirror.xmission.com/CPAN/ + +=item * + +Virginia + + http://mirrors.rcn.net/pub/lang/CPAN/ + ftp://mirrors.rcn.net/pub/lang/CPAN/ + ftp://ruff.cs.jmu.edu/pub/CPAN/ + http://perl.Liquidation.com/CPAN/ + +=item * + +Washington + + http://cpan.llarian.net/ + 
ftp://cpan.llarian.net/pub/CPAN/ + ftp://ftp-mirror.internap.com/pub/CPAN/ + ftp://ftp.spu.edu/pub/CPAN/ + +=back + +=back + +=head2 Oceania + +=over 4 + +=item * + +Australia + + http://ftp.planetmirror.com/pub/CPAN/ + ftp://ftp.planetmirror.com/pub/CPAN/ + ftp://mirror.aarnet.edu.au/pub/perl/CPAN/ + ftp://cpan.topend.com.au/pub/CPAN/ + +=item * + +New Zealand + + ftp://ftp.auckland.ac.nz/pub/perl/CPAN/ + +=back + +=head2 South America + +=over 4 + +=item * + +Argentina + + ftp://mirrors.bannerlandia.com.ar/mirrors/CPAN/ + +=item * + +Brazil + + ftp://cpan.pop-mg.com.br/pub/CPAN/ + ftp://ftp.matrix.com.br/pub/perl/ + ftp://cpan.if.usp.br/pub/mirror/CPAN/ + +=item * + +Chile + + ftp://ftp.psinet.cl/pub/programming/perl/CPAN/ + ftp://sunsite.dcc.uchile.cl/pub/lang/perl/ + +=back + +For an up-to-date listing of CPAN sites, +see http://www.cpan.org/SITES or ftp://www.cpan.org/SITES . + +=head1 Modules: Creation, Use, and Abuse + +(The following section is borrowed directly from Tim Bunce's modules +file, available at your nearest CPAN site.) + +Perl implements a class using a package, but the presence of a +package doesn't imply the presence of a class. A package is just a +namespace. A class is a package that provides subroutines that can be +used as methods. A method is just a subroutine that expects, as its +first argument, either the name of a package (for "static" methods), +or a reference to something (for "virtual" methods). + +A module is a file that (by convention) provides a class of the same +name (sans the .pm), plus an import method in that class that can be +called to fetch exported symbols. This module may implement some of +its methods by loading dynamic C or C++ objects, but that should be +totally transparent to the user of the module. Likewise, the module +might set up an AUTOLOAD function to slurp in subroutine definitions on +demand, but this is also transparent. Only the F<.pm> file is required to +exist. 
See L<perlsub>, L<perltoot>, and L<AutoLoader> for details about +the AUTOLOAD mechanism. + +=head2 Guidelines for Module Creation + +=over 4 + +=item * + +Do similar modules already exist in some form? + +If so, please try to reuse the existing modules either in whole or +by inheriting useful features into a new class. If this is not +practical try to get together with the module authors to work on +extending or enhancing the functionality of the existing modules. +A perfect example is the plethora of packages in perl4 for dealing +with command line options. + +If you are writing a module to expand an already existing set of +modules, please coordinate with the author of the package. It +helps if you follow the same naming scheme and module interaction +scheme as the original author. + +=item * + +Try to design the new module to be easy to extend and reuse. + +Try to C<use warnings;> (or C<use warnings qw(...);>). +Remember that you can add C<no warnings qw(...);> to individual blocks +of code that need less warnings. + +Use blessed references. Use the two argument form of bless to bless +into the class name given as the first parameter of the constructor, +e.g.,: + + sub new { + my $class = shift; + return bless {}, $class; + } + +or even this if you'd like it to be used as either a static +or a virtual method. + + sub new { + my $self = shift; + my $class = ref($self) || $self; + return bless {}, $class; + } + +Pass arrays as references so more parameters can be added later +(it's also faster). Convert functions into methods where +appropriate. Split large methods into smaller more flexible ones. +Inherit methods from other modules if appropriate. + +Avoid class name tests like: C<die "Invalid" unless ref $ref eq 'FOO'>. +Generally you can delete the C<eq 'FOO'> part with no harm at all. +Let the objects look after themselves! Generally, avoid hard-wired +class names as far as possible. + +Avoid C<< $r->Class::func() >> where using C<@ISA=qw(... 
Class ...)> and +C<< $r->func() >> would work (see L<perlbot> for more details). + +Use autosplit so little used or newly added functions won't be a +burden to programs that don't use them. Add test functions to +the module after __END__ either using AutoSplit or by saying: + + eval join('',<main::DATA>) || die $@ unless caller(); + +Does your module pass the 'empty subclass' test? If you say +C<@SUBCLASS::ISA = qw(YOURCLASS);> your applications should be able +to use SUBCLASS in exactly the same way as YOURCLASS. For example, +does your application still work if you change: C<$obj = new YOURCLASS;> +into: C<$obj = new SUBCLASS;> ? + +Avoid keeping any state information in your packages. It makes it +difficult for multiple other packages to use yours. Keep state +information in objects. + +Always use B<-w>. + +Try to C<use strict;> (or C<use strict qw(...);>). +Remember that you can add C<no strict qw(...);> to individual blocks +of code that need less strictness. + +Always use B<-w>. + +Follow the guidelines in the perlstyle(1) manual. + +Always use B<-w>. + +=item * + +Some simple style guidelines + +The perlstyle manual supplied with Perl has many helpful points. + +Coding style is a matter of personal taste. Many people evolve their +style over several years as they learn what helps them write and +maintain good code. Here's one set of assorted suggestions that +seem to be widely used by experienced developers: + +Use underscores to separate words. It is generally easier to read +$var_names_like_this than $VarNamesLikeThis, especially for +non-native speakers of English. It's also a simple rule that works +consistently with VAR_NAMES_LIKE_THIS. + +Package/Module names are an exception to this rule. Perl informally +reserves lowercase module names for 'pragma' modules like integer +and strict. Other modules normally begin with a capital letter and +use mixed case with no underscores (need to be short and portable). 
+ +You may find it helpful to use letter case to indicate the scope +or nature of a variable. For example: + + $ALL_CAPS_HERE constants only (beware clashes with Perl vars) + $Some_Caps_Here package-wide global/static + $no_caps_here function scope my() or local() variables + +Function and method names seem to work best as all lowercase. +e.g., C<< $obj->as_string() >>. + +You can use a leading underscore to indicate that a variable or +function should not be used outside the package that defined it. + +=item * + +Select what to export. + +Do NOT export method names! + +Do NOT export anything else by default without a good reason! + +Exports pollute the namespace of the module user. If you must +export try to use @EXPORT_OK in preference to @EXPORT and avoid +short or common names to reduce the risk of name clashes. + +Generally anything not exported is still accessible from outside the +module using the ModuleName::item_name (or C<< $blessed_ref->method >>) +syntax. By convention you can use a leading underscore on names to +indicate informally that they are 'internal' and not for public use. + +(It is actually possible to get private functions by saying: +C<my $subref = sub { ... }; &$subref;>. But there's no way to call that +directly as a method, because a method must have a name in the symbol +table.) + +As a general rule, if the module is trying to be object oriented +then export nothing. If it's just a collection of functions then +@EXPORT_OK anything but use @EXPORT with caution. + +=item * + +Select a name for the module. + +This name should be as descriptive, accurate, and complete as +possible. Avoid any risk of ambiguity. Always try to use two or +more whole words. Generally the name should reflect what is special +about what the module does rather than how it does it. Please use +nested module names to group informally or categorize a module. +There should be a very good reason for a module not to have a nested name. 
+Module names should begin with a capital letter. + +Having 57 modules all called Sort will not make life easy for anyone +(though having 23 called Sort::Quick is only marginally better :-). +Imagine someone trying to install your module alongside many others. +If in any doubt ask for suggestions in comp.lang.perl.misc. + +If you are developing a suite of related modules/classes it's good +practice to use nested classes with a common prefix as this will +avoid namespace clashes. For example: Xyz::Control, Xyz::View, +Xyz::Model etc. Use the modules in this list as a naming guide. + +If adding a new module to a set, follow the original author's +standards for naming modules and the interface to methods in +those modules. + +If developing modules for private internal or project specific use, +that will never be released to the public, then you should ensure +that their names will not clash with any future public module. You +can do this either by using the reserved Local::* category or by +using a category name that includes an underscore like Foo_Corp::*. + +To be portable each component of a module name should be limited to +11 characters. If it might be used on MS-DOS then try to ensure each is +unique in the first 8 characters. Nested modules make this easier. + +=item * + +Have you got it right? + +How do you know that you've made the right decisions? Have you +picked an interface design that will cause problems later? Have +you picked the most appropriate name? Do you have any questions? + +The best way to know for sure, and pick up many helpful suggestions, +is to ask someone who knows. Comp.lang.perl.misc is read by just about +all the people who develop modules and it's the best place to ask. + +All you need to do is post a short summary of the module, its +purpose and interfaces. A few lines on each of the main methods is +probably enough. (If you post the whole module it might be ignored +by busy people - generally the very people you want to read it!) 
+ +Don't worry about posting if you can't say when the module will be +ready - just say so in the message. It might be worth inviting +others to help you, they may be able to complete it for you! + +=item * + +README and other Additional Files. + +It's well known that software developers usually fully document the +software they write. If, however, the world is in urgent need of +your software and there is not enough time to write the full +documentation please at least provide a README file containing: + +=over 10 + +=item * + +A description of the module/package/extension etc. + +=item * + +A copyright notice - see below. + +=item * + +Prerequisites - what else you may need to have. + +=item * + +How to build it - possible changes to Makefile.PL etc. + +=item * + +How to install it. + +=item * + +Recent changes in this release, especially incompatibilities + +=item * + +Changes / enhancements you plan to make in the future. + +=back + +If the README file seems to be getting too large you may wish to +split out some of the sections into separate files: INSTALL, +Copying, ToDo etc. + +=over 4 + +=item Adding a Copyright Notice. + + +How you choose to license your work is a personal decision. +The general mechanism is to assert your Copyright and then make +a declaration of how others may copy/use/modify your work. + +Perl, for example, is supplied with two types of licence: The GNU +GPL and The Artistic Licence (see the files README, Copying, and +Artistic). Larry has good reasons for NOT just using the GNU GPL. + +My personal recommendation, out of respect for Larry, Perl, and the +Perl community at large is to state something simply like: + + Copyright (c) 1995 Your Name. All rights reserved. + This program is free software; you can redistribute it and/or + modify it under the same terms as Perl itself. + +This statement should at least appear in the README file. You may +also wish to include it in a Copying file and your source files. 
+Remember to include the other words in addition to the Copyright. + +=item * + +Give the module a version/issue/release number. + +To be fully compatible with the Exporter and MakeMaker modules you +should store your module's version number in a non-my package +variable called $VERSION. This should be a floating point +number with at least two digits after the decimal (i.e., hundredths, +e.g., C<$VERSION = "0.01">). Don't use a "1.3.2" style version. +See L<Exporter> for details. + +It may be handy to add a function or method to retrieve the number. +Use the number in announcements and archive file names when +releasing the module (ModuleName-1.02.tar.Z). +See perldoc ExtUtils::MakeMaker.pm for details. + +=item * + +How to release and distribute a module. + +It's a good idea to post an announcement of the availability of your +module (or the module itself if small) to the comp.lang.perl.announce +Usenet newsgroup. This will at least ensure very wide once-off +distribution. + +If possible, register the module with CPAN. You should +include details of its location in your announcement. + +Some notes about ftp archives: Please use a long descriptive file +name that includes the version number. Most incoming directories +will not be readable/listable, i.e., you won't be able to see your +file after uploading it. Remember to send your email notification +message as soon as possible after uploading else your file may get +deleted automatically. Allow time for the file to be processed +and/or check the file has been processed before announcing its +location. + +FTP Archives for Perl Modules: + +Follow the instructions and links on: + + http://www.cpan.org/modules/00modlist.long.html + http://www.cpan.org/modules/04pause.html + +or upload to one of these sites: + + https://pause.kbx.de/pause/ + http://pause.perl.org/pause/ + +and notify <modules@perl.org>.
+ +By using the WWW interface you can ask the Upload Server to mirror +your modules from your ftp or WWW site into your own directory on +CPAN! + +Please remember to send me an updated entry for the Module list! + +=item * + +Take care when changing a released module. + +Always strive to remain compatible with previously released versions. +Otherwise try to add a mechanism to revert to the +old behavior if people rely on it. Document incompatible changes. + +=back + +=back + +=head2 Guidelines for Converting Perl 4 Library Scripts into Modules + +=over 4 + +=item * + +There is no requirement to convert anything. + +If it ain't broke, don't fix it! Perl 4 library scripts should +continue to work with no problems. You may need to make some minor +changes (like escaping non-array @'s in double quoted strings) but +there is no need to convert a .pl file into a Module for just that. + +=item * + +Consider the implications. + +All Perl applications that make use of the script will need to +be changed (slightly) if the script is converted into a module. Is +it worth it unless you plan to make other changes at the same time? + +=item * + +Make the most of the opportunity. + +If you are going to convert the script to a module you can use the +opportunity to redesign the interface. The guidelines for module +creation above include many of the issues you should consider. + +=item * + +The pl2pm utility will get you started. + +This utility will read *.pl files (given as parameters) and write +corresponding *.pm files. The pl2pm utility does the following: + +=over 10 + +=item * + +Adds the standard Module prologue lines + +=item * + +Converts package specifiers from ' to :: + +=item * + +Converts die(...) to croak(...) + +=item * + +Several other minor changes + +=back + +Being a mechanical process pl2pm is not bullet proof. The converted +code will need careful checking, especially any package statements. +Don't delete the original .pl file till the new .pm one works!
+ +=back + +=head2 Guidelines for Reusing Application Code + +=over 4 + +=item * + +Complete applications rarely belong in the Perl Module Library. + +=item * + +Many applications contain some Perl code that could be reused. + +Help save the world! Share your code in a form that makes it easy +to reuse. + +=item * + +Break out the reusable code into one or more separate module files. + +=item * + +Take the opportunity to reconsider and redesign the interfaces. + +=item * + +In some cases the 'application' can then be reduced to a small + +fragment of code built on top of the reusable modules. In these cases +the application could be invoked as: + + % perl -e 'use Module::Name; method(@ARGV)' ... +or + % perl -mModule::Name ... (in perl5.002 or higher) + +=back + +=head1 NOTE + +Perl does not enforce private and public parts of its modules as you may +have been used to in other languages like C++, Ada, or Modula-17. Perl +doesn't have an infatuation with enforced privacy. It would prefer +that you stayed out of its living room because you weren't invited, not +because it has a shotgun. + +The module and its user have a contract, part of which is common law, +and part of which is "written". Part of the common law contract is +that a module doesn't pollute any namespace it wasn't asked to. The +written contract for the module (A.K.A. documentation) may make other +provisions. But then you know when you C<use RedefineTheWorld> that +you're redefining the world and willing to take the consequences.
+EOF + +close MANIFEST or warn "$0: failed to close MANIFEST (../MANIFEST): $!"; +close OUT or warn "$0: failed to close OUT (perlmodlib.tmp): $!"; + diff --git a/gnu/usr.bin/perl/pod/perlnewmod.pod b/gnu/usr.bin/perl/pod/perlnewmod.pod new file mode 100644 index 00000000000..ace8d85130f --- /dev/null +++ b/gnu/usr.bin/perl/pod/perlnewmod.pod @@ -0,0 +1,282 @@ +=head1 NAME + +perlnewmod - preparing a new module for distribution + +=head1 DESCRIPTION + +This document gives you some suggestions about how to go about writing +Perl modules, preparing them for distribution, and making them available +via CPAN. + +One of the things that makes Perl really powerful is the fact that Perl +hackers tend to want to share the solutions to problems they've faced, +so you and I don't have to battle with the same problem again. + +The main way they do this is by abstracting the solution into a Perl +module. If you don't know what one of these is, the rest of this +document isn't going to be much use to you. You're also missing out on +an awful lot of useful code; consider having a look at L<perlmod>, +L<perlmodlib> and L<perlmodinstall> before coming back here. + +When you've found that there isn't a module available for what you're +trying to do, and you've had to write the code yourself, consider +packaging up the solution into a module and uploading it to CPAN so that +others can benefit. + +=head2 Warning + +We're going to primarily concentrate on Perl-only modules here, rather +than XS modules. XS modules serve a rather different purpose, and +you should consider different things before distributing them - the +popularity of the library you are gluing, the portability to other +operating systems, and so on. However, the notes on preparing the Perl +side of the module and packaging and distributing it will apply equally +well to an XS module as a pure-Perl one. + +=head2 What should I make into a module? 
+ +You should make a module out of any code that you think is going to be +useful to others. Anything that's likely to fill a hole in the communal +library and which someone else can slot directly into their program. Any +part of your code which you can isolate and extract and plug into +something else is a likely candidate. + +Let's take an example. Suppose you're reading in data from a local +format into a hash-of-hashes in Perl, turning that into a tree, walking +the tree and then piping each node to an Acme Transmogrifier Server. + +Now, quite a few people have the Acme Transmogrifier, and you've had to +write something to talk the protocol from scratch - you'd almost +certainly want to make that into a module. The level at which you pitch +it is up to you: you might want protocol-level modules analogous to +L<Net::SMTP|Net::SMTP> which then talk to higher level modules analogous +to L<Mail::Send|Mail::Send>. The choice is yours, but you do want to get +a module out for that server protocol. + +Nobody else on the planet is going to talk your local data format, so we +can ignore that. But what about the thing in the middle? Building tree +structures from Perl variables and then traversing them is a nice, +general problem, and if nobody's already written a module that does +that, you might want to modularise that code too. + +So hopefully you've now got a few ideas about what's good to modularise. +Let's now see how it's done. + +=head2 Step-by-step: Preparing the ground + +Before we even start scraping out the code, there are a few things we'll +want to do in advance. + +=over 3 + +=item Look around + +Dig into a bunch of modules to see how they're written. I'd suggest +starting with L<Text::Tabs|Text::Tabs>, since it's in the standard +library and is nice and simple, and then looking at something like +L<Time::Zone|Time::Zone>, L<File::Copy|File::Copy> and then some of the +C<Mail::*> modules if you're planning on writing object oriented code. 
+ +These should give you an overall feel for how modules are laid out and +written. + +=item Check it's new + +There are a lot of modules on CPAN, and it's easy to miss one that's +similar to what you're planning on contributing. Have a good plough +through the modules list and the F<by-module> directories, and make sure +you're not the one reinventing the wheel! + +=item Discuss the need + +You might love it. You might feel that everyone else needs it. But there +might not actually be any real demand for it out there. If you're unsure +about the demand your module will have, consider sending out feelers +on the C<comp.lang.perl.modules> newsgroup, or as a last resort, ask the +modules list at C<modules@perl.org>. Remember that this is a closed list +with a very long turn-around time - be prepared to wait a good while for +a response from them. + +=item Choose a name + +Perl modules included on CPAN have a naming hierarchy you should try to +fit in with. See L<perlmodlib> for more details on how this works, and +browse around CPAN and the modules list to get a feel of it. At the very +least, remember this: modules should be title capitalised (This::Thing), +fit in with a category, and explain their purpose succinctly. + +=item Check again + +While you're doing that, make really sure you haven't missed a module +similar to the one you're about to write. + +When you've got your name sorted out and you're sure that your module is +wanted and not currently available, it's time to start coding. + +=back + +=head2 Step-by-step: Making the module + +=over 3 + +=item Start with F<h2xs> + +Originally a utility to convert C header files into XS modules, +L<h2xs|h2xs> has become a useful utility for churning out skeletons for +Perl-only modules as well.
If you don't want to use the +L<Autoloader|Autoloader> which splits up big modules into smaller +subroutine-sized chunks, you'll say something like this: + + h2xs -AX -n Net::Acme + +The C<-A> omits the Autoloader code, C<-X> omits XS elements, and C<-n> +specifies the name of the module. + +=item Use L<strict|strict> and L<warnings|warnings> + +A module's code has to be warning and strict-clean, since you can't +guarantee the conditions that it'll be used under. Besides, you wouldn't +want to distribute code that wasn't warning or strict-clean anyway, +right? + +=item Use L<Carp|Carp> + +The L<Carp|Carp> module allows you to present your error messages from +the caller's perspective; this gives you a way to signal a problem with +the caller and not your module. For instance, if you say this: + + warn "No hostname given"; + +the user will see something like this: + + No hostname given at /usr/local/lib/perl5/site_perl/5.6.0/Net/Acme.pm + line 123. + +which looks like your module is doing something wrong. Instead, you want +to put the blame on the user, and say this: + + No hostname given at bad_code, line 10. + +You do this by using L<Carp|Carp> and replacing your C<warn>s with +C<carp>s. If you need to C<die>, say C<croak> instead. However, keep +C<warn> and C<die> in place for your sanity checks - where it really is +your module at fault. + +=item Use L<Exporter|Exporter> - wisely! + +C<h2xs> provides stubs for L<Exporter|Exporter>, which gives you a +standard way of exporting symbols and subroutines from your module into +the caller's namespace. For instance, saying C<use Net::Acme qw(&frob)> +would import the C<frob> subroutine. + +The package variable C<@EXPORT> will determine which symbols will get +exported when the caller simply says C<use Net::Acme> - you will hardly +ever want to put anything in there. C<@EXPORT_OK>, on the other hand, +specifies which symbols you're willing to export. 
If you do want to +export a bunch of symbols, use the C<%EXPORT_TAGS> and define a standard +export set - look at L<Exporter> for more details. + +=item Use L<plain old documentation|perlpod> + +The work isn't over until the paperwork is done, and you're going to +need to put in some time writing some documentation for your module. +C<h2xs> will provide a stub for you to fill in; if you're not sure about +the format, look at L<perlpod> for an introduction. Provide a good +synopsis of how your module is used in code, a description, and then +notes on the syntax and function of the individual subroutines or +methods. Use Perl comments for developer notes and POD for end-user +notes. + +=item Write tests + +You're encouraged to create self-tests for your module to ensure it's +working as intended on the myriad platforms Perl supports; if you upload +your module to CPAN, a host of testers will build your module and send +you the results of the tests. Again, C<h2xs> provides a test framework +which you can extend - you should do something more than just checking +your module will compile. + +=item Write the README + +If you're uploading to CPAN, the automated gremlins will extract the +README file and place that in your CPAN directory. It'll also appear in +the main F<by-module> and F<by-category> directories if you make it onto +the modules list. It's a good idea to put here what the module actually +does in detail, and the user-visible changes since the last release. + +=back + +=head2 Step-by-step: Distributing your module + +=over 3 + +=item Get a CPAN user ID + +Every developer publishing modules on CPAN needs a CPAN ID. See the +instructions at C<http://www.cpan.org/modules/04pause.html> (or +equivalent on your nearest mirror) to find out how to do this. + +=item C<perl Makefile.PL; make test; make dist> + +Once again, C<h2xs> has done all the work for you. 
It produces the +standard C<Makefile.PL> you'll have seen when you downloaded and +installed modules, and this produces a Makefile with a C<dist> target. + +Once you've ensured that your module passes its own tests - always a +good thing to make sure - you can C<make dist>, and the Makefile will +hopefully produce you a nice tarball of your module, ready for upload. + +=item Upload the tarball + +The email you got when you received your CPAN ID will tell you how to +log in to PAUSE, the Perl Authors Upload SErver. From the menus there, +you can upload your module to CPAN. + +=item Announce to the modules list + +Once uploaded, it'll sit unnoticed in your author directory. If you want +it connected to the rest of the CPAN, you'll need to tell the modules +list about it. The best way to do this is to email them a line in the +style of the modules list, like this: + + Net::Acme bdpO Interface to Acme Frobnicator servers FOOBAR + ^ ^^^^ ^ ^ + | |||| Module description Your ID + | |||| + | |||\- Interface: (O)OP, (r)eferences, (h)ybrid, (f)unctions + | ||| + | ||\-- Language: (p)ure Perl, C(+)+, (h)ybrid, (C), (o)ther + | || + Module |\--- Support: (d)eveloper, (m)ailing list, (u)senet, (n)one + Name | + \---- Maturity: (i)dea, (c)onstructions, (a)lpha, (b)eta, + (R)eleased, (M)ature, (S)tandard + +plus a description of the module and why you think it should be +included. If you hear nothing back, that means your module will +probably appear on the modules list at the next update. Don't try +subscribing to C<modules@perl.org>; it's not another mailing list. Just +have patience. + +=item Announce to clpa + +If you have a burning desire to tell the world about your release, post +an announcement to the moderated C<comp.lang.perl.announce> newsgroup. + +=item Fix bugs! + +Once you start accumulating users, they'll send you bug reports. If +you're lucky, they'll even send you patches. Welcome to the joys of +maintaining a software project...
+ +=back + +=head1 AUTHOR + +Simon Cozens, C<simon@cpan.org> + +=head1 SEE ALSO + +L<perlmod>, L<perlmodlib>, L<perlmodinstall>, L<h2xs>, L<strict>, +L<Carp>, L<Exporter>, L<perlpod>, L<Test>, L<ExtUtils::MakeMaker>, +http://www.cpan.org/ diff --git a/gnu/usr.bin/perl/pod/perlnumber.pod b/gnu/usr.bin/perl/pod/perlnumber.pod index c83e053203d..44d921cfe63 100644 --- a/gnu/usr.bin/perl/pod/perlnumber.pod +++ b/gnu/usr.bin/perl/pod/perlnumber.pod @@ -39,7 +39,7 @@ the maximal and the minimal supported true integral quantities are close to powers of 2. However, "native" floats have a most fundamental restriction: they may represent only those numbers which have a relatively "short" representation when converted to a binary fraction. For example, -0.9 cannot be respresented by a native float, since the binary fraction +0.9 cannot be represented by a native float, since the binary fraction for 0.9 is infinite: binary0.1110011001100... @@ -59,7 +59,7 @@ finite decimal expansion. Being strings, and thus of arbitrary length, there is no practical limit for the exponent or number of decimal digits for these numbers. (But realize that what we are discussing the rules for just the I<storage> of these numbers. The fact that you can store such "large" numbers -does not mean that that the I<operations> over these numbers will use all +does not mean that the I<operations> over these numbers will use all of the significant digits. See L<"Numeric operators and numeric conversions"> for details.) @@ -91,7 +91,7 @@ Six such conversions are possible: These conversions are governed by the following general rules: -=over +=over 4 =item * @@ -141,7 +141,7 @@ argument as in modular arithmetic, e.g., C<mod 2**32> on a 32-bit architecture. C<sprintf "%u", -1> therefore provides the same result as C<sprintf "%u", ~0>. 
-=over +=over 4 =item Arithmetic operators except, C<no integer> diff --git a/gnu/usr.bin/perl/pod/perlopentut.pod b/gnu/usr.bin/perl/pod/perlopentut.pod index 9cb9f6738a7..b4003f4f2ef 100644 --- a/gnu/usr.bin/perl/pod/perlopentut.pod +++ b/gnu/usr.bin/perl/pod/perlopentut.pod @@ -73,8 +73,8 @@ from a different file, and forget to trim it before opening: This is not a bug, but a feature. Because C<open> mimics the shell in its style of using redirection arrows to specify how to open the file, it also does so with respect to extra white space around the filename itself -as well. For accessing files with naughty names, see L<"Dispelling -the Dweomer">. +as well. For accessing files with naughty names, see +L<"Dispelling the Dweomer">. =head2 Pipe Opens @@ -107,13 +107,13 @@ In most systems, such an C<open> will not return an error. That's because in the traditional C<fork>/C<exec> model, running the other program happens only in the forked child process, which means that the failed C<exec> can't be reflected in the return value of C<open>. -Only a failed C<fork> shows up there. See L<perlfaq8/"Why doesn't open() -return an error when a pipe open fails?"> to see how to cope with this. -There's also an explanation in L<perlipc>. +Only a failed C<fork> shows up there. See +L<perlfaq8/"Why doesn't open() return an error when a pipe open fails?"> +to see how to cope with this. There's also an explanation in L<perlipc>. If you would like to open a bidirectional pipe, the IPC::Open2 -library will handle this for you. Check out L<perlipc/"Bidirectional -Communication with Another Process"> +library will handle this for you. Check out +L<perlipc/"Bidirectional Communication with Another Process"> =head2 The Minus File @@ -126,8 +126,8 @@ access the standard output. If minus can be used as the default input or default output, what happens if you open a pipe into or out of minus? What's the default command it would run? The same script as you're currently running! 
This is actually -a stealth C<fork> hidden inside an C<open> call. See L<perlipc/"Safe Pipe -Opens"> for details. +a stealth C<fork> hidden inside an C<open> call. See +L<perlipc/"Safe Pipe Opens"> for details. =head2 Mixing Reads and Writes @@ -309,7 +309,7 @@ C<O_DEFER>, C<O_SYNC>, C<O_ASYNC>, C<O_DSYNC>, C<O_RSYNC>, C<O_NOCTTY>, C<O_NDELAY> and C<O_LARGEFILE>. Consult your open(2) manpage or its local equivalent for details. (Note: starting from Perl release 5.6 the O_LARGEFILE flag, if available, is automatically -added to the sysopen() flags because large files are the the default.) +added to the sysopen() flags because large files are the default.) Here's how to use C<sysopen> to emulate the simple C<open> calls we had before. We'll omit the C<|| die $!> checks for clarity, but make sure @@ -684,9 +684,9 @@ also some high-level modules on CPAN that can help you with these games. Check out Term::ReadKey and Term::ReadLine. What else can you open? To open a connection using sockets, you won't use -one of Perl's two open functions. See L<perlipc/"Sockets: Client/Server -Communication"> for that. Here's an example. Once you have it, -you can use FH as a bidirectional filehandle. +one of Perl's two open functions. See +L<perlipc/"Sockets: Client/Server Communication"> for that. Here's an +example. Once you have it, you can use FH as a bidirectional filehandle. use IO::Socket; local *FH = IO::Socket::INET->new("www.perl.com:80"); diff --git a/gnu/usr.bin/perl/pod/perlport.pod b/gnu/usr.bin/perl/pod/perlport.pod index 6892b6a777f..9ae89e0799a 100644 --- a/gnu/usr.bin/perl/pod/perlport.pod +++ b/gnu/usr.bin/perl/pod/perlport.pod @@ -94,6 +94,26 @@ from) C<\015\012>, depending on whether you're reading or writing. Unix does the same thing on ttys in canonical mode. C<\015\012> is commonly referred to as CRLF. +A common cause of unportable programs is the misuse of chop() to trim +newlines: + + # XXX UNPORTABLE! + while(<FILE>) { + chop; + @array = split(/:/); + #... 
+ } + +You can get away with this on Unix and MacOS (they have a single +character end-of-line), but the same program will break under DOSish +perls because you're only chop()ing half the end-of-line. Instead, +chomp() should be used to trim newlines. The Dunce::Files module can +help audit your code for misuses of chop(). + +When dealing with binary files (or text files in binary mode) be sure +to explicitly set $/ to the appropriate value for your file format +before using chomp(). + Because of the "text" mode translation, DOSish perls have limitations in using C<seek> and C<tell> on a file accessed in "text" mode. Stick to C<seek>-ing to locations you got from C<tell> (and no @@ -181,10 +201,12 @@ numbers to secondary storage such as a disk file or tape. Conflicting storage orders make utter mess out of the numbers. If a little-endian host (Intel, VAX) stores 0x12345678 (305419896 in -decimal), a big-endian host (Motorola, MIPS, Sparc, PA) reads it as -0x78563412 (2018915346 in decimal). To avoid this problem in network -(socket) connections use the C<pack> and C<unpack> formats C<n> -and C<N>, the "network" orders. These are guaranteed to be portable. +decimal), a big-endian host (Motorola, Sparc, PA) reads it as +0x78563412 (2018915346 in decimal). Alpha and MIPS can be either: +Digital/Compaq used/uses them in little-endian mode; SGI/Cray uses +them in big-endian mode. To avoid this problem in network (socket) +connections use the C<pack> and C<unpack> formats C<n> and C<N>, the +"network" orders. These are guaranteed to be portable. 
You can explore the endianness of your platform by unpacking a data structure packed in native format such as: @@ -197,7 +219,7 @@ If you need to distinguish between endian architectures you could use either of the variables set like so: $is_big_endian = unpack("h*", pack("s", 1)) =~ /01/; - $is_litte_endian = unpack("h*", pack("s", 1)) =~ /^1/; + $is_little_endian = unpack("h*", pack("s", 1)) =~ /^1/; Differing widths can cause truncation even between platforms of equal endianness. The platform of shorter width loses the upper parts of the @@ -217,7 +239,7 @@ So, it is reasonably safe to assume that all platforms support the notion of a "path" to uniquely identify a file on the system. How that path is really written, though, differs considerably. -Atlhough similar, file path specifications differ between Unix, +Although similar, file path specifications differ between Unix, Windows, S<Mac OS>, OS/2, VMS, VOS, S<RISC OS>, and probably others. Unix, for example, is one of the few OSes that has the elegant idea of a single root directory. @@ -332,7 +354,10 @@ operating systems put mandatory locks on such files. Don't count on a specific environment variable existing in C<%ENV>. Don't count on C<%ENV> entries being case-sensitive, or even -case-preserving. +case-preserving. Don't try to clear %ENV by saying C<%ENV = ();>, or, +if you really have to, make it conditional on C<$^O ne 'VMS'> since in +VMS the C<%ENV> table is much more than a per-process key-value string +table. Don't count on signals or C<%SIG> for anything. @@ -355,7 +380,7 @@ Commands that launch external processes are generally supported on most platforms (though many of them do not support any type of forking). The problem with using them arises from what you invoke them on. 
External tools are often named differently on different -platforms, may not be available in the same location, migth accept +platforms, may not be available in the same location, might accept different arguments, can behave differently, and often present their results in a platform-dependent way. Thus, you should seldom depend on them to produce consistent results. (Then again, if you're calling @@ -650,6 +675,15 @@ DOSish perls are as follows: Windows NT MSWin32 MSWin32-ppc Cygwin cygwin +The various MSWin32 Perl's can distinguish the OS they are running on +via the value of the fifth element of the list returned from +Win32::GetOSVersion(). For example: + + if ($^O eq 'MSWin32') { + my @os_version_info = Win32::GetOSVersion(); + print +('3.1','95','NT')[$os_version_info[4]],"\n"; + } + Also see: =over 4 @@ -681,15 +715,16 @@ The ActiveState Pages, http://www.activestate.com/ =item * The Cygwin environment for Win32; F<README.cygwin> (installed -as L<perlcygwin>), http://sourceware.cygnus.com/cygwin/ +as L<perlcygwin>), http://www.cygwin.com/ =item * The U/WIN environment for Win32, -<http://www.research.att.com/sw/tools/uwin/ +http://www.research.att.com/sw/tools/uwin/ -=item Build instructions for OS/2, L<perlos2> +=item * +Build instructions for OS/2, L<perlos2> =back @@ -888,9 +923,9 @@ vmsperl on the web, http://www.sidhe.org/vmsperl/index.html =head2 VOS -Perl on VOS is discussed in F<README.vos> in the perl distribution. -Perl on VOS can accept either VOS- or Unix-style file -specifications as in either of the following: +Perl on VOS is discussed in F<README.vos> in the perl distribution +(installed as L<perlvos>). Perl on VOS can accept either VOS- or +Unix-style file specifications as in either of the following: $ perl -ne "print if /perl_setup/i" >system>notices $ perl -ne "print if /perl_setup/i" /system/notices @@ -906,12 +941,11 @@ contain a slash character cannot be processed. Such files must be renamed before they can be processed by Perl. 
Note that VOS limits file names to 32 or fewer characters. -The following C functions are unimplemented on VOS, and any attempt by -Perl to use them will result in a fatal error message and an immediate -exit from Perl: dup, do_aspawn, do_spawn, fork, waitpid. Once these -functions become available in the VOS POSIX.1 implementation, you can -either recompile and rebind Perl, or you can download a newer port from -ftp.stratus.com. +See F<README.vos> for restrictions that apply when Perl is built +with the alpha version of VOS POSIX.1 support. + +Perl on VOS is built without any extensions and does not support +dynamic loading. The value of C<$^O> on VOS is "VOS". To determine the architecture that you are running on without resorting to loading all of C<%Config> you @@ -1042,7 +1076,8 @@ Also see: * -L<perlos390>, F<README.os390>, F<README.posix-bc>, F<README.vmesa> +L<perlos390>, F<README.os390>, F<perlbs2000>, F<README.vmesa>, +L<perlebcdic>. =item * @@ -1053,7 +1088,7 @@ general usage issues for all EBCDIC Perls. Send a message body of =item * AS/400 Perl information at -ttp://as400.rochester.ibm.com/ +http://as400.rochester.ibm.com/ as well as on CPAN in the F<ports/> directory. =back @@ -1200,7 +1235,7 @@ Be OS, F<README.beos> =item * HP 300 MPE/iX, F<README.mpeix> and Mark Bixby's web page -http://www.cccd.edu/~markb/perlix.html +http://www.bixby.org/mark/perlix.html =item * @@ -1208,7 +1243,7 @@ A free perl5-based PERL.NLM for Novell Netware is available in precompiled binary and source code form from http://www.novell.com/ as well as from CPAN. -=item +=item * Plan 9, F<README.plan9> @@ -1640,6 +1675,10 @@ Not implemented. (S<Mac OS>, Win32, VMS, S<RISC OS>, VOS, VM/ESA) =item stat +Platforms that do not have rdev, blksize, or blocks will return these +as '', so numeric comparison or manipulation of these fields may cause +'not numeric' warnings. + mtime and atime are the same thing, and ctime is creation time instead of inode change time. 
(S<Mac OS>) @@ -1650,6 +1689,9 @@ device and inode are not necessarily reliable. (VMS) mtime, atime and ctime all return the last modification time. Device and inode are not necessarily reliable. (S<RISC OS>) +dev, rdev, blksize, and blocks are not available. inode is not +meaningful and will differ between stat calls on the same file. (os2) + =item symlink OLDFILE,NEWFILE Not implemented. (Win32, VMS, S<RISC OS>) @@ -1746,7 +1788,7 @@ two seconds. (Win32) Not implemented. (S<Mac OS>, VOS) Can only be applied to process handles returned for processes spawned -using C<system(1, ...)>. (Win32) +using C<system(1, ...)> or pseudo processes created with C<fork()>. (Win32) Not useful. (S<RISC OS>) @@ -1756,6 +1798,11 @@ Not useful. (S<RISC OS>) =over 4 +=item v1.48, 02 February 2001 + +Various updates from perl5-porters over the past year, supported +platforms update from Jarkko Hietaniemi. + =item v1.47, 22 March 2000 Various cleanups from Tom Christiansen, including migration of @@ -1838,96 +1885,98 @@ First public release with perl5.005. 
=head1 Supported Platforms -As of early March 2000 (the Perl release 5.6.0), the following -platforms are able to build Perl from the standard source code -distribution available at http://www.perl.com/CPAN/src/index.html +As of early 2001 (the Perl release 5.6.1), the following platforms are +able to build Perl from the standard source code distribution +available at http://www.perl.com/CPAN/src/index.html AIX + AmigaOS + Darwin (Rhapsody) + DG/UX DOS DJGPP 1) + DYNIX/ptx + EPOC FreeBSD HP-UX IRIX Linux - LynxOS MachTen - MPE/iX - NetBSD + MacOS Classic 2) + NonStop-UX + ReliantUNIX (SINIX) OpenBSD + OpenVMS (VMS) OS/2 + OS X QNX - Rhapsody/Darwin 2) - SCO SV - SINIX Solaris - SVR4 - Tru64 UNIX 3) + Tru64 UNIX (DEC OSF/1, Digital UNIX) UNICOS UNICOS/mk - Unixware - VMS VOS - Windows 3.1 1) - Windows 95 1) 4) - Windows 98 1) 4) - Windows NT 1) 4) + Win32/NT/2K 3) 1) in DOS mode either the DOS or OS/2 ports can be used - 2) new in 5.6.0: the BSD/NeXT-based UNIX of Mac OS X - 3) formerly known as Digital UNIX and before that DEC OSF/1 - 4) compilers: Borland, Cygwin, Mingw32 EGCS/GCC, VC++ + 2) Mac OS Classic (pre-X) is almost 5.6.1-ready; building from + the source does work with 5.6.1, but additional MacOS specific + source code is needed for a complete build. Contact the mailing + list macperl-porters@macperl.org for more information. + 3) compilers: Borland, Cygwin, Mingw32 EGCS/GCC, VC++ -The following platforms worked for the previous major release -(5.005_03 being the latest maintenance release of that, as of early -March 2000), but be did not manage to test these in time for the 5.6.0 -release of Perl. There is a very good chance that these will work -just fine with 5.6.0. +The following platforms worked for the previous release (5.6.0), +but we did not manage to test these in time for the 5.6.1 release. +There is a very good chance that these will work fine with 5.6.1. 
- A/UX - BeOS - BSD/OS - DG/UX - DYNIX/ptx DomainOS Hurd - NextSTEP - OpenSTEP + LynxOS + MinGW + MPE/iX + NetBSD PowerMAX - SCO ODT/OSR + SCO SV SunOS - Ultrix + SVR4 + Unixware + Windows 3.1 + Windows 95 + Windows 98 + Windows Me -The following platform worked for the previous major release (5.005_03 -being the latest maintenance release of that, as of early March 2000). -However, standardization on UTF-8 as the internal string representation -in 5.6.0 has introduced incompatibilities in this EBCDIC platform. -Support for this platform may be enabled in a future release: +The following platform worked for the 5.005_03 major release but not +5.6.0. Standardization on UTF-8 as the internal string representation +in 5.6.0 and 5.6.1 has introduced incompatibilities in this EBCDIC +platform. While Perl 5.6.1 will build on this platform some +regression tests may fail and the C<use utf8;> pragma typically +introduces text handling errors. UTF-8 support for this platform may +be enabled in a future release: - OS390 1) + OS/390 1) - 1) Previously known as MVS, or OpenEdition MVS. + 1) previously known as MVS, about to become z/OS. -Strongly related to the OS390 platform by also being EBCDIC-based +Strongly related to the OS/390 platform by also being EBCDIC-based mainframe platforms are the following platforms: - BS2000 + POSIX-BC (BS2000) VM/ESA -These are also not expected to work under 5.6.0 for the same reasons -as OS390. Contact the mailing list perl-mvs@perl.org for more details. - -MacOS (Classic, pre-X) is almost 5.6.0-ready; building from the source -does work with 5.6.0, but additional MacOS specific source code is needed -for a complete port. Contact the mailing list macperl-porters@macperl.org -for more information. +These are also expected to work, albeit with no UTF-8 support, under 5.6.1 +for the same reasons as OS/390. Contact the mailing list perl-mvs@perl.org +for more details. 
The following platforms have been known to build Perl from source in -the past, but we haven't been able to verify their status for the -current release, either because the hardware/software platforms are -rare or because we don't have an active champion on these -platforms--or both: +the past (5.005_03 and earlier), but we haven't been able to verify +their status for the current release, either because the +hardware/software platforms are rare or because we don't have an +active champion on these platforms--or both. They used to work, +though, so go ahead and try compiling them, and let perlbug@perl.org +know of any trouble. 3b1 - AmigaOS + A/UX + BeOS + BSD/OS ConvexOS CX/UX DC/OSx @@ -1944,16 +1993,21 @@ platforms--or both: MiNT MPC NEWS-OS + NextSTEP + OpenSTEP Opus Plan 9 PowerUX RISC/os + SCO ODT/OSR Stellar SVR2 TI1500 TitanOS + Ultrix Unisys Dynix Unixware + UTS Support for the following platform is planned for a future Perl release: @@ -1964,8 +2018,8 @@ binaries available via http://www.perl.com/CPAN/ports/index.html: Perl release - AS/400 5.003 Netware 5.003_07 + OS/400 5.005_02 Tandem Guardian 5.004 The following platforms have only binaries available via @@ -1984,8 +2038,9 @@ http://www.perl.com/CPAN/ports/index.html for binary distributions. =head1 SEE ALSO -L<perlamiga>, L<perlcygwin>, L<perldos>, L<perlhpux>, L<perlos2>, -L<perlos390>, L<perlwin32>, L<perlvms>, and L<Win32>. +L<perlaix>, L<perlamiga>, L<perlcygwin>, L<perldos>, L<perlepoc>, +L<perlebcdic>, L<perlhpux>, L<perlos2>, L<perlos390>, L<perlbs2000>, +L<perlwin32>, L<perlvms>, L<perlvos>, and L<Win32>. =head1 AUTHORS / CONTRIBUTORS @@ -2001,7 +2056,7 @@ Neale Ferguson <neale@mailbox.tabnsw.com.au>, David J. Fiander <davidf@mks.com>, Paul Green <Paul_Green@stratus.com>, M.J.T. Guy <mjtg@cus.cam.ac.uk>, -Jarkko Hietaniemi <jhi@iki.fi<gt>, +Jarkko Hietaniemi <jhi@iki.fi>, Luther Huffman <lutherh@stratcom.com>, Nick Ing-Simmons <nick@ni-s.u-net.com>, Andreas J. 
KE<ouml>nig <koenig@kulturbox.de>, diff --git a/gnu/usr.bin/perl/pod/perlreftut.pod b/gnu/usr.bin/perl/pod/perlreftut.pod index c8593fb1ce6..073d358da55 100644 --- a/gnu/usr.bin/perl/pod/perlreftut.pod +++ b/gnu/usr.bin/perl/pod/perlreftut.pod @@ -386,7 +386,7 @@ to do with references. =head1 Credits -Author: Mark-Jason Dominus, Plover Systems (C<mjd-perl-ref@plover.com>) +Author: Mark-Jason Dominus, Plover Systems (C<mjd-perl-ref+@plover.com>) This article originally appeared in I<The Perl Journal> (http://tpj.com) volume 3, #2. Reprinted with permission. diff --git a/gnu/usr.bin/perl/pod/perlrequick.pod b/gnu/usr.bin/perl/pod/perlrequick.pod new file mode 100644 index 00000000000..5b72a35187f --- /dev/null +++ b/gnu/usr.bin/perl/pod/perlrequick.pod @@ -0,0 +1,503 @@ +=head1 NAME + +perlrequick - Perl regular expressions quick start + +=head1 DESCRIPTION + +This page covers the very basics of understanding, creating and +using regular expressions ('regexes') in Perl. + + +=head1 The Guide + +=head2 Simple word matching + +The simplest regex is simply a word, or more generally, a string of +characters. A regex consisting of a word matches any string that +contains that word: + + "Hello World" =~ /World/; # matches + +In this statement, C<World> is a regex and the C<//> enclosing +C</World/> tells perl to search a string for a match. The operator +C<=~> associates the string with the regex match and produces a true +value if the regex matched, or false if the regex did not match. In +our case, C<World> matches the second word in C<"Hello World">, so the +expression is true. This idea has several variations. 
+ +Expressions like this are useful in conditionals: + + print "It matches\n" if "Hello World" =~ /World/; + +The sense of the match can be reversed by using the C<!~> operator: + + print "It doesn't match\n" if "Hello World" !~ /World/; + +The literal string in the regex can be replaced by a variable: + + $greeting = "World"; + print "It matches\n" if "Hello World" =~ /$greeting/; + +If you're matching against C<$_>, the C<$_ =~> part can be omitted: + + $_ = "Hello World"; + print "It matches\n" if /World/; + +Finally, the C<//> default delimiters for a match can be changed to +arbitrary delimiters by putting an C<'m'> out front: + + "Hello World" =~ m!World!; # matches, delimited by '!' + "Hello World" =~ m{World}; # matches, note the matching '{}' + "/usr/bin/perl" =~ m"/perl"; # matches after '/usr/bin', + # '/' becomes an ordinary char + +Regexes must match a part of the string I<exactly> in order for the +statement to be true: + + "Hello World" =~ /world/; # doesn't match, case sensitive + "Hello World" =~ /o W/; # matches, ' ' is an ordinary char + "Hello World" =~ /World /; # doesn't match, no ' ' at end + +perl will always match at the earliest possible point in the string: + + "Hello World" =~ /o/; # matches 'o' in 'Hello' + "That hat is red" =~ /hat/; # matches 'hat' in 'That' + +Not all characters can be used 'as is' in a match. Some characters, +called B<metacharacters>, are reserved for use in regex notation. +The metacharacters are + + {}[]()^$.|*+?\ + +A metacharacter can be matched by putting a backslash before it: + + "2+2=4" =~ /2+2/; # doesn't match, + is a metacharacter + "2+2=4" =~ /2\+2/; # matches, \+ is treated like an ordinary + + 'C:\WIN32' =~ /C:\\WIN/; # matches + "/usr/bin/perl" =~ /\/usr\/bin\/perl/; # matches + +In the last regex, the forward slash C<'/'> is also backslashed, +because it is used to delimit the regex. + +Non-printable ASCII characters are represented by B<escape sequences>. 
+Common examples are C<\t> for a tab, C<\n> for a newline, and C<\r> +for a carriage return. Arbitrary bytes are represented by octal +escape sequences, e.g., C<\033>, or hexadecimal escape sequences, +e.g., C<\x1B>: + + "1000\t2000" =~ m(0\t2) # matches + "cat" =~ /\143\x61\x74/ # matches, but a weird way to spell cat + +Regexes are treated mostly as double quoted strings, so variable +substitution works: + + $foo = 'house'; + 'cathouse' =~ /cat$foo/; # matches + 'housecat' =~ /${foo}cat/; # matches + +With all of the regexes above, if the regex matched anywhere in the +string, it was considered a match. To specify I<where> it should +match, we would use the B<anchor> metacharacters C<^> and C<$>. The +anchor C<^> means match at the beginning of the string and the anchor +C<$> means match at the end of the string, or before a newline at the +end of the string. Some examples: + + "housekeeper" =~ /keeper/; # matches + "housekeeper" =~ /^keeper/; # doesn't match + "housekeeper" =~ /keeper$/; # matches + "housekeeper\n" =~ /keeper$/; # matches + "housekeeper" =~ /^housekeeper$/; # matches + +=head2 Using character classes + +A B<character class> allows a set of possible characters, rather than +just a single character, to match at a particular point in a regex. +Character classes are denoted by brackets C<[...]>, with the set of +characters to be possibly matched inside. Here are some examples: + + /cat/; # matches 'cat' + /[bcr]at/; # matches 'bat', 'cat', or 'rat' + "abc" =~ /[cab]/; # matches 'a' + +In the last statement, even though C<'c'> is the first character in +the class, the earliest point at which the regex can match is C<'a'>. + + /[yY][eE][sS]/; # match 'yes' in a case-insensitive way + # 'yes', 'Yes', 'YES', etc. + /yes/i; # also match 'yes' in a case-insensitive way + +The last example shows a match with an C<'i'> B<modifier>, which makes +the match case-insensitive. 
+ +Character classes also have ordinary and special characters, but the +sets of ordinary and special characters inside a character class are +different than those outside a character class. The special +characters for a character class are C<-]\^$> and are matched using an +escape: + + /[\]c]def/; # matches ']def' or 'cdef' + $x = 'bcr'; + /[$x]at/; # matches 'bat', 'cat', or 'rat' + /[\$x]at/; # matches '$at' or 'xat' + /[\\$x]at/; # matches '\at', 'bat', 'cat', or 'rat' + +The special character C<'-'> acts as a range operator within character +classes, so that the unwieldy C<[0123456789]> and C<[abc...xyz]> +become the svelte C<[0-9]> and C<[a-z]>: + + /item[0-9]/; # matches 'item0' or ... or 'item9' + /[0-9a-fA-F]/; # matches a hexadecimal digit + +If C<'-'> is the first or last character in a character class, it is +treated as an ordinary character. + +The special character C<^> in the first position of a character class +denotes a B<negated character class>, which matches any character but +those in the brackets. Both C<[...]> and C<[^...]> must match a +character, or the match fails. Then + + /[^a]at/; # doesn't match 'aat' or 'at', but matches + # all other 'bat', 'cat', '0at', '%at', etc. + /[^0-9]/; # matches a non-numeric character + /[a^]at/; # matches 'aat' or '^at'; here '^' is ordinary + +Perl has several abbreviations for common character classes: + +=over 4 + +=item * + +\d is a digit and represents [0-9] + +=item * + +\s is a whitespace character and represents [\ \t\r\n\f] + +=item * + +\w is a word character (alphanumeric or _) and represents [0-9a-zA-Z_] + +=item * + +\D is a negated \d; it represents any character but a digit [^0-9] + +=item * + +\S is a negated \s; it represents any non-whitespace character [^\s] + +=item * + +\W is a negated \w; it represents any non-word character [^\w] + +=item * + +The period '.' 
matches any character but "\n" + +=back + +The C<\d\s\w\D\S\W> abbreviations can be used both inside and outside +of character classes. Here are some in use: + + /\d\d:\d\d:\d\d/; # matches a hh:mm:ss time format + /[\d\s]/; # matches any digit or whitespace character + /\w\W\w/; # matches a word char, followed by a + # non-word char, followed by a word char + /..rt/; # matches any two chars, followed by 'rt' + /end\./; # matches 'end.' + /end[.]/; # same thing, matches 'end.' + +The S<B<word anchor> > C<\b> matches a boundary between a word +character and a non-word character C<\w\W> or C<\W\w>: + + $x = "Housecat catenates house and cat"; + $x =~ /\bcat/; # matches cat in 'catenates' + $x =~ /cat\b/; # matches cat in 'housecat' + $x =~ /\bcat\b/; # matches 'cat' at end of string + +In the last example, the end of the string is considered a word +boundary. + +=head2 Matching this or that + +We can match different character strings with the B<alternation> +metacharacter C<'|'>. To match C<dog> or C<cat>, we form the regex +C<dog|cat>. As before, perl will try to match the regex at the +earliest possible point in the string. At each character position, +perl will first try to match the first alternative, C<dog>. If +C<dog> doesn't match, perl will then try the next alternative, C<cat>. +If C<cat> doesn't match either, then the match fails and perl moves to +the next position in the string. Some examples: + + "cats and dogs" =~ /cat|dog|bird/; # matches "cat" + "cats and dogs" =~ /dog|cat|bird/; # matches "cat" + +Even though C<dog> is the first alternative in the second regex, +C<cat> is able to match earlier in the string. + + "cats" =~ /c|ca|cat|cats/; # matches "c" + "cats" =~ /cats|cat|ca|c/; # matches "cats" + +At a given character position, the first alternative that allows the +regex match to succeed will be the one that matches. Here, all the +alternatives match at the first string position, so the first matches. 
+ +=head2 Grouping things and hierarchical matching + +The B<grouping> metacharacters C<()> allow a part of a regex to be +treated as a single unit. Parts of a regex are grouped by enclosing +them in parentheses. The regex C<house(cat|keeper)> means match +C<house> followed by either C<cat> or C<keeper>. Some more examples +are + + /(a|b)b/; # matches 'ab' or 'bb' + /(^a|b)c/; # matches 'ac' at start of string or 'bc' anywhere + + /house(cat|)/; # matches either 'housecat' or 'house' + /house(cat(s|)|)/; # matches either 'housecats' or 'housecat' or + # 'house'. Note groups can be nested. + + "20" =~ /(19|20|)\d\d/; # matches the null alternative '()\d\d', + # because '20\d\d' can't match + +=head2 Extracting matches + +The grouping metacharacters C<()> also allow the extraction of the +parts of a string that matched. For each grouping, the part that +matched inside goes into the special variables C<$1>, C<$2>, etc. +They can be used just as ordinary variables: + + # extract hours, minutes, seconds + $time =~ /(\d\d):(\d\d):(\d\d)/; # match hh:mm:ss format + $hours = $1; + $minutes = $2; + $seconds = $3; + +In list context, a match C</regex/> with groupings will return the +list of matched values C<($1,$2,...)>. So we could rewrite it as + + ($hours, $minutes, $second) = ($time =~ /(\d\d):(\d\d):(\d\d)/); + +If the groupings in a regex are nested, C<$1> gets the group with the +leftmost opening parenthesis, C<$2> the next opening parenthesis, +etc. For example, here is a complex regex and the matching variables +indicated below it: + + /(ab(cd|ef)((gi)|j))/; + 1 2 34 + +Associated with the matching variables C<$1>, C<$2>, ... are +the B<backreferences> C<\1>, C<\2>, ... Backreferences are +matching variables that can be used I<inside> a regex: + + /(\w\w\w)\s\1/; # find sequences like 'the the' in string + +C<$1>, C<$2>, ... should only be used outside of a regex, and C<\1>, +C<\2>, ... only inside a regex. 
+ +=head2 Matching repetitions + +The B<quantifier> metacharacters C<?>, C<*>, C<+>, and C<{}> allow us +to determine the number of repeats of a portion of a regex we +consider to be a match. Quantifiers are put immediately after the +character, character class, or grouping that we want to specify. They +have the following meanings: + +=over 4 + +=item * + +C<a?> = match 'a' 1 or 0 times + +=item * + +C<a*> = match 'a' 0 or more times, i.e., any number of times + +=item * + +C<a+> = match 'a' 1 or more times, i.e., at least once + +=item * + +C<a{n,m}> = match at least C<n> times, but not more than C<m> +times. + +=item * + +C<a{n,}> = match at least C<n> or more times + +=item * + +C<a{n}> = match exactly C<n> times + +=back + +Here are some examples: + + /[a-z]+\s+\d*/; # match a lowercase word, at least some space, and + # any number of digits + /(\w+)\s+\1/; # match doubled words of arbitrary length + $year =~ /\d{2,4}/; # make sure year is at least 2 but not more + # than 4 digits + $year =~ /\d{4}|\d{2}/; # better match; throw out 3 digit dates + +These quantifiers will try to match as much of the string as possible, +while still allowing the regex to match. So we have + + $x = 'the cat in the hat'; + $x =~ /^(.*)(at)(.*)$/; # matches, + # $1 = 'the cat in the h' + # $2 = 'at' + # $3 = '' (0 matches) + +The first quantifier C<.*> grabs as much of the string as possible +while still having the regex match. The second quantifier C<.*> has +no string left to it, so it matches 0 times. + +=head2 More matching + +There are a few more things you might want to know about matching +operators. In the code + + $pattern = 'Seuss'; + while (<>) { + print if /$pattern/; + } + +perl has to re-evaluate C<$pattern> each time through the loop. If +C<$pattern> won't be changing, use the C<//o> modifier, to only +perform variable substitutions once. 
If you don't want any +substitutions at all, use the special delimiter C<m''>: + + $pattern = 'Seuss'; + m'$pattern'; # matches '$pattern', not 'Seuss' + +The global modifier C<//g> allows the matching operator to match +within a string as many times as possible. In scalar context, +successive matches against a string will have C<//g> jump from match +to match, keeping track of position in the string as it goes along. +You can get or set the position with the C<pos()> function. +For example, + + $x = "cat dog house"; # 3 words + while ($x =~ /(\w+)/g) { + print "Word is $1, ends at position ", pos $x, "\n"; + } + +prints + + Word is cat, ends at position 3 + Word is dog, ends at position 7 + Word is house, ends at position 13 + +A failed match or changing the target string resets the position. If +you don't want the position reset after failure to match, add the +C<//c>, as in C</regex/gc>. + +In list context, C<//g> returns a list of matched groupings, or if +there are no groupings, a list of matches to the whole regex. So + + @words = ($x =~ /(\w+)/g); # matches, + # $word[0] = 'cat' + # $word[1] = 'dog' + # $word[2] = 'house' + +=head2 Search and replace + +Search and replace is performed using C<s/regex/replacement/modifiers>. +The C<replacement> is a Perl double quoted string that replaces in the +string whatever is matched with the C<regex>. The operator C<=~> is +also used here to associate a string with C<s///>. If matching +against C<$_>, the S<C<$_ =~> > can be dropped. If there is a match, +C<s///> returns the number of substitutions made, otherwise it returns +false. Here are a few examples: + + $x = "Time to feed the cat!"; + $x =~ s/cat/hacker/; # $x contains "Time to feed the hacker!" + $y = "'quoted words'"; + $y =~ s/^'(.*)'$/$1/; # strip single quotes, + # $y contains "quoted words" + +With the C<s///> operator, the matched variables C<$1>, C<$2>, etc. +are immediately available for use in the replacement expression. 
With +the global modifier, C<s///g> will search and replace all occurrences +of the regex in the string: + + $x = "I batted 4 for 4"; + $x =~ s/4/four/; # $x contains "I batted four for 4" + $x = "I batted 4 for 4"; + $x =~ s/4/four/g; # $x contains "I batted four for four" + +The evaluation modifier C<s///e> wraps an C<eval{...}> around the +replacement string and the evaluated result is substituted for the +matched substring. Some examples: + + # reverse all the words in a string + $x = "the cat in the hat"; + $x =~ s/(\w+)/reverse $1/ge; # $x contains "eht tac ni eht tah" + + # convert percentage to decimal + $x = "A 39% hit rate"; + $x =~ s!(\d+)%!$1/100!e; # $x contains "A 0.39 hit rate" + +The last example shows that C<s///> can use other delimiters, such as +C<s!!!> and C<s{}{}>, and even C<s{}//>. If single quotes are used +C<s'''>, then the regex and replacement are treated as single quoted +strings. + +=head2 The split operator + +C<split /regex/, string> splits C<string> into a list of substrings +and returns that list. The regex determines the character sequence +that C<string> is split with respect to. For example, to split a +string into words, use + + $x = "Calvin and Hobbes"; + @word = split /\s+/, $x; # $word[0] = 'Calvin' + # $word[1] = 'and' + # $word[2] = 'Hobbes' + +To extract a comma-delimited list of numbers, use + + $x = "1.618,2.718, 3.142"; + @const = split /,\s*/, $x; # $const[0] = '1.618' + # $const[1] = '2.718' + # $const[2] = '3.142' + +If the empty regex C<//> is used, the string is split into individual +characters. If the regex has groupings, then the list produced contains +the matched substrings from the groupings as well: + + $x = "/usr/bin"; + @parts = split m!(/)!, $x; # $parts[0] = '' + # $parts[1] = '/' + # $parts[2] = 'usr' + # $parts[3] = '/' + # $parts[4] = 'bin' + +Since the first character of $x matched the regex, C<split> prepended +an empty initial element to the list. + +=head1 BUGS + +None. 
+ +=head1 SEE ALSO + +This is just a quick start guide. For a more in-depth tutorial on +regexes, see L<perlretut> and for the reference page, see L<perlre>. + +=head1 AUTHOR AND COPYRIGHT + +Copyright (c) 2000 Mark Kvale +All rights reserved. + +This document may be distributed under the same terms as Perl itself. + +=head2 Acknowledgments + +The author would like to thank Mark-Jason Dominus, Tom Christiansen, +Ilya Zakharevich, Brad Hughes, and Mike Giroux for all their helpful +comments. + +=cut + diff --git a/gnu/usr.bin/perl/pod/perlretut.pod b/gnu/usr.bin/perl/pod/perlretut.pod new file mode 100644 index 00000000000..fa6479c0c45 --- /dev/null +++ b/gnu/usr.bin/perl/pod/perlretut.pod @@ -0,0 +1,2504 @@ +=head1 NAME + +perlretut - Perl regular expressions tutorial + +=head1 DESCRIPTION + +This page provides a basic tutorial on understanding, creating and +using regular expressions in Perl. It serves as a complement to the +reference page on regular expressions L<perlre>. Regular expressions +are an integral part of the C<m//>, C<s///>, C<qr//> and C<split> +operators and so this tutorial also overlaps with +L<perlop/"Regexp Quote-Like Operators"> and L<perlfunc/split>. + +Perl is widely renowned for excellence in text processing, and regular +expressions are one of the big factors behind this fame. Perl regular +expressions display an efficiency and flexibility unknown in most +other computer languages. Mastering even the basics of regular +expressions will allow you to manipulate text with surprising ease. + +What is a regular expression? A regular expression is simply a string +that describes a pattern. Patterns are in common use these days; +examples are the patterns typed into a search engine to find web pages +and the patterns used to list files in a directory, e.g., C<ls *.txt> +or C<dir *.*>. In Perl, the patterns described by regular expressions +are used to search strings, extract desired parts of strings, and to +do search and replace operations. 
+ +Regular expressions have the undeserved reputation of being abstract +and difficult to understand. Regular expressions are constructed using +simple concepts like conditionals and loops and are no more difficult +to understand than the corresponding C<if> conditionals and C<while> +loops in the Perl language itself. In fact, the main challenge in +learning regular expressions is just getting used to the terse +notation used to express these concepts. + +This tutorial flattens the learning curve by discussing regular +expression concepts, along with their notation, one at a time and with +many examples. The first part of the tutorial will progress from the +simplest word searches to the basic regular expression concepts. If +you master the first part, you will have all the tools needed to solve +about 98% of your needs. The second part of the tutorial is for those +comfortable with the basics and hungry for more power tools. It +discusses the more advanced regular expression operators and +introduces the latest cutting edge innovations in 5.6.0. + +A note: to save time, 'regular expression' is often abbreviated as +regexp or regex. Regexp is a more natural abbreviation than regex, but +is harder to pronounce. The Perl pod documentation is evenly split on +regexp vs regex; in Perl, there is more than one way to abbreviate it. +We'll use regexp in this tutorial. + +=head1 Part 1: The basics + +=head2 Simple word matching + +The simplest regexp is simply a word, or more generally, a string of +characters. A regexp consisting of a word matches any string that +contains that word: + + "Hello World" =~ /World/; # matches + +What is this perl statement all about? C<"Hello World"> is a simple +double quoted string. C<World> is the regular expression and the +C<//> enclosing C</World/> tells perl to search a string for a match. 
+The operator C<=~> associates the string with the regexp match and +produces a true value if the regexp matched, or false if the regexp +did not match. In our case, C<World> matches the second word in +C<"Hello World">, so the expression is true. Expressions like this +are useful in conditionals: + + if ("Hello World" =~ /World/) { + print "It matches\n"; + } + else { + print "It doesn't match\n"; + } + +There are useful variations on this theme. The sense of the match can +be reversed by using C<!~> operator: + + if ("Hello World" !~ /World/) { + print "It doesn't match\n"; + } + else { + print "It matches\n"; + } + +The literal string in the regexp can be replaced by a variable: + + $greeting = "World"; + if ("Hello World" =~ /$greeting/) { + print "It matches\n"; + } + else { + print "It doesn't match\n"; + } + +If you're matching against the special default variable C<$_>, the +C<$_ =~> part can be omitted: + + $_ = "Hello World"; + if (/World/) { + print "It matches\n"; + } + else { + print "It doesn't match\n"; + } + +And finally, the C<//> default delimiters for a match can be changed +to arbitrary delimiters by putting an C<'m'> out front: + + "Hello World" =~ m!World!; # matches, delimited by '!' + "Hello World" =~ m{World}; # matches, note the matching '{}' + "/usr/bin/perl" =~ m"/perl"; # matches after '/usr/bin', + # '/' becomes an ordinary char + +C</World/>, C<m!World!>, and C<m{World}> all represent the +same thing. When, e.g., C<""> is used as a delimiter, the forward +slash C<'/'> becomes an ordinary character and can be used in a regexp +without trouble. + +Let's consider how different regexps would match C<"Hello World">: + + "Hello World" =~ /world/; # doesn't match + "Hello World" =~ /o W/; # matches + "Hello World" =~ /oW/; # doesn't match + "Hello World" =~ /World /; # doesn't match + +The first regexp C<world> doesn't match because regexps are +case-sensitive. 
The second regexp matches because the substring +S<C<'o W'> > occurs in the string S<C<"Hello World"> >. The space +character ' ' is treated like any other character in a regexp and is +needed to match in this case. The lack of a space character is the +reason the third regexp C<'oW'> doesn't match. The fourth regexp +C<'World '> doesn't match because there is a space at the end of the +regexp, but not at the end of the string. The lesson here is that +regexps must match a part of the string I<exactly> in order for the +statement to be true. + +If a regexp matches in more than one place in the string, perl will +always match at the earliest possible point in the string: + + "Hello World" =~ /o/; # matches 'o' in 'Hello' + "That hat is red" =~ /hat/; # matches 'hat' in 'That' + +With respect to character matching, there are a few more points you +need to know about. First of all, not all characters can be used 'as +is' in a match. Some characters, called B<metacharacters>, are reserved +for use in regexp notation. The metacharacters are + + {}[]()^$.|*+?\ + +The significance of each of these will be explained +in the rest of the tutorial, but for now, it is important only to know +that a metacharacter can be matched by putting a backslash before it: + + "2+2=4" =~ /2+2/; # doesn't match, + is a metacharacter + "2+2=4" =~ /2\+2/; # matches, \+ is treated like an ordinary + + "The interval is [0,1)." =~ /[0,1)./ # is a syntax error! + "The interval is [0,1)." =~ /\[0,1\)\./ # matches + "/usr/bin/perl" =~ /\/usr\/bin\/perl/; # matches + +In the last regexp, the forward slash C<'/'> is also backslashed, +because it is used to delimit the regexp. This can lead to LTS +(leaning toothpick syndrome), however, and it is often more readable +to change delimiters. 
+ + +The backslash character C<'\'> is a metacharacter itself and needs to +be backslashed: + + 'C:\WIN32' =~ /C:\\WIN/; # matches + +In addition to the metacharacters, there are some ASCII characters +which don't have printable character equivalents and are instead +represented by B<escape sequences>. Common examples are C<\t> for a +tab, C<\n> for a newline, C<\r> for a carriage return and C<\a> for a +bell. If your string is better thought of as a sequence of arbitrary +bytes, the octal escape sequence, e.g., C<\033>, or hexadecimal escape +sequence, e.g., C<\x1B> may be a more natural representation for your +bytes. Here are some examples of escapes: + + "1000\t2000" =~ m(0\t2) # matches + "1000\n2000" =~ /0\n20/ # matches + "1000\t2000" =~ /\000\t2/ # doesn't match, "0" ne "\000" + "cat" =~ /\143\x61\x74/ # matches, but a weird way to spell cat + +If you've been around Perl a while, all this talk of escape sequences +may seem familiar. Similar escape sequences are used in double-quoted +strings and in fact the regexps in Perl are mostly treated as +double-quoted strings. This means that variables can be used in +regexps as well. Just like double-quoted strings, the values of the +variables in the regexp will be substituted in before the regexp is +evaluated for matching purposes. So we have: + + $foo = 'house'; + 'housecat' =~ /$foo/; # matches + 'cathouse' =~ /cat$foo/; # matches + 'housecat' =~ /${foo}cat/; # matches + +So far, so good. With the knowledge above you can already perform +searches with just about any literal string regexp you can dream up. +Here is a I<very simple> emulation of the Unix grep program: + + % cat > simple_grep + #!/usr/bin/perl + $regexp = shift; + while (<>) { + print if /$regexp/; + } + ^D + + % chmod +x simple_grep + + % simple_grep abba /usr/dict/words + Babbage + cabbage + cabbages + sabbath + Sabbathize + Sabbathizes + sabbatical + scabbard + scabbards + +This program is easy to understand. 
C<#!/usr/bin/perl> is the standard +way to invoke a perl program from the shell. +S<C<$regexp = shift;> > saves the first command line argument as the +regexp to be used, leaving the rest of the command line arguments to +be treated as files. S<C<< while (<>) >> > loops over all the lines in +all the files. For each line, S<C<print if /$regexp/;> > prints the +line if the regexp matches the line. In this line, both C<print> and +C</$regexp/> use the default variable C<$_> implicitly. + +With all of the regexps above, if the regexp matched anywhere in the +string, it was considered a match. Sometimes, however, we'd like to +specify I<where> in the string the regexp should try to match. To do +this, we would use the B<anchor> metacharacters C<^> and C<$>. The +anchor C<^> means match at the beginning of the string and the anchor +C<$> means match at the end of the string, or before a newline at the +end of the string. Here is how they are used: + + "housekeeper" =~ /keeper/; # matches + "housekeeper" =~ /^keeper/; # doesn't match + "housekeeper" =~ /keeper$/; # matches + "housekeeper\n" =~ /keeper$/; # matches + +The second regexp doesn't match because C<^> constrains C<keeper> to +match only at the beginning of the string, but C<"housekeeper"> has +keeper starting in the middle. The third regexp does match, since the +C<$> constrains C<keeper> to match only at the end of the string. + +When both C<^> and C<$> are used at the same time, the regexp has to +match both the beginning and the end of the string, i.e., the regexp +matches the whole string. Consider + + "keeper" =~ /^keep$/; # doesn't match + "keeper" =~ /^keeper$/; # matches + "" =~ /^$/; # ^$ matches an empty string + +The first regexp doesn't match because the string has more to it than +C<keep>. Since the second regexp is exactly the string, it +matches. Using both C<^> and C<$> in a regexp forces the complete +string to match, so it gives you complete control over which strings +match and which don't. 
Suppose you are looking for a fellow named +bert, off in a string by himself: + + "dogbert" =~ /bert/; # matches, but not what you want + + "dilbert" =~ /^bert/; # doesn't match, but .. + "bertram" =~ /^bert/; # matches, so still not good enough + + "bertram" =~ /^bert$/; # doesn't match, good + "dilbert" =~ /^bert$/; # doesn't match, good + "bert" =~ /^bert$/; # matches, perfect + +Of course, in the case of a literal string, one could just as easily +use the string equivalence S<C<$string eq 'bert'> > and it would be +more efficient. The C<^...$> regexp really becomes useful when we +add in the more powerful regexp tools below. + +=head2 Using character classes + +Although one can already do quite a lot with the literal string +regexps above, we've only scratched the surface of regular expression +technology. In this and subsequent sections we will introduce regexp +concepts (and associated metacharacter notations) that will allow a +regexp to not just represent a single character sequence, but a I<whole +class> of them. + +One such concept is that of a B<character class>. A character class +allows a set of possible characters, rather than just a single +character, to match at a particular point in a regexp. Character +classes are denoted by brackets C<[...]>, with the set of characters +to be possibly matched inside. Here are some examples: + + /cat/; # matches 'cat' + /[bcr]at/; # matches 'bat', 'cat', or 'rat' + /item[0123456789]/; # matches 'item0' or ... or 'item9' + "abc" =~ /[cab]/; # matches 'a' + +In the last statement, even though C<'c'> is the first character in +the class, C<'a'> matches because the first character position in the +string is the earliest point at which the regexp can match. + + /[yY][eE][sS]/; # match 'yes' in a case-insensitive way + # 'yes', 'Yes', 'YES', etc. + +This regexp displays a common task: perform a case-insensitive +match.
Perl provides a way of avoiding all those brackets by simply +appending an C<'i'> to the end of the match. Then C</[yY][eE][sS]/;> +can be rewritten as C</yes/i;>. The C<'i'> stands for +case-insensitive and is an example of a B<modifier> of the matching +operation. We will meet other modifiers later in the tutorial. + +We saw in the section above that there were ordinary characters, which +represented themselves, and special characters, which needed a +backslash C<\> to represent themselves. The same is true in a +character class, but the sets of ordinary and special characters +inside a character class are different than those outside a character +class. The special characters for a character class are C<-]\^$>. C<]> +is special because it denotes the end of a character class. C<$> is +special because it denotes a scalar variable. C<\> is special because +it is used in escape sequences, just like above. Here is how the +special characters C<]$\> are handled: + + /[\]c]def/; # matches ']def' or 'cdef' + $x = 'bcr'; + /[$x]at/; # matches 'bat', 'cat', or 'rat' + /[\$x]at/; # matches '$at' or 'xat' + /[\\$x]at/; # matches '\at', 'bat', 'cat', or 'rat' + +The last two are a little tricky. In C<[\$x]>, the backslash protects +the dollar sign, so the character class has two members C<$> and C<x>. +In C<[\\$x]>, the backslash is protected, so C<$x> is treated as a +variable and substituted in double quote fashion. + +The special character C<'-'> acts as a range operator within character +classes, so that a contiguous set of characters can be written as a +range. With ranges, the unwieldy C<[0123456789]> and C<[abc...xyz]> +become the svelte C<[0-9]> and C<[a-z]>. Some examples are + + /item[0-9]/; # matches 'item0' or ...
or 'item9' + /[0-9bx-z]aa/; # matches '0aa', ..., '9aa', + # 'baa', 'xaa', 'yaa', or 'zaa' + /[0-9a-fA-F]/; # matches a hexadecimal digit + /[0-9a-zA-Z_]/; # matches a "word" character, + # like those in a perl variable name + +If C<'-'> is the first or last character in a character class, it is +treated as an ordinary character; C<[-ab]>, C<[ab-]> and C<[a\-b]> are +all equivalent. + +The special character C<^> in the first position of a character class +denotes a B<negated character class>, which matches any character but +those in the brackets. Both C<[...]> and C<[^...]> must match a +character, or the match fails. Then + + /[^a]at/; # doesn't match 'aat' or 'at', but matches + # all other 'bat', 'cat', '0at', '%at', etc. + /[^0-9]/; # matches a non-numeric character + /[a^]at/; # matches 'aat' or '^at'; here '^' is ordinary + +Now, even C<[0-9]> can be a bother to write multiple times, so in the +interest of saving keystrokes and making regexps more readable, Perl +has several abbreviations for common character classes: + +=over 4 + +=item * + +\d is a digit and represents [0-9] + +=item * + +\s is a whitespace character and represents [\ \t\r\n\f] + +=item * + +\w is a word character (alphanumeric or _) and represents [0-9a-zA-Z_] + +=item * + +\D is a negated \d; it represents any character but a digit [^0-9] + +=item * + +\S is a negated \s; it represents any non-whitespace character [^\s] + +=item * + +\W is a negated \w; it represents any non-word character [^\w] + +=item * + +The period '.' matches any character but "\n" + +=back + +The C<\d\s\w\D\S\W> abbreviations can be used both inside and outside +of character classes. Here are some in use: + + /\d\d:\d\d:\d\d/; # matches a hh:mm:ss time format + /[\d\s]/; # matches any digit or whitespace character + /\w\W\w/; # matches a word char, followed by a + # non-word char, followed by a word char + /..rt/; # matches any two chars, followed by 'rt' + /end\./; # matches 'end.'
+ /end[.]/; # same thing, matches 'end.' + +Because a period is a metacharacter, it needs to be escaped to match +as an ordinary period. Because, for example, C<\d> and C<\w> are sets +of characters, it is incorrect to think of C<[^\d\w]> as C<[\D\W]>; in +fact C<[^\d\w]> is the same as C<[^\w]>, which is the same as +C<[\W]>. Think DeMorgan's laws. + +An anchor useful in basic regexps is the S<B<word anchor> > +C<\b>. This matches a boundary between a word character and a non-word +character C<\w\W> or C<\W\w>: + + $x = "Housecat catenates house and cat"; + $x =~ /cat/; # matches cat in 'housecat' + $x =~ /\bcat/; # matches cat in 'catenates' + $x =~ /cat\b/; # matches cat in 'housecat' + $x =~ /\bcat\b/; # matches 'cat' at end of string + +Note in the last example, the end of the string is considered a word +boundary. + +You might wonder why C<'.'> matches everything but C<"\n"> - why not +every character? The reason is that often one is matching against +lines and would like to ignore the newline characters. For instance, +while the string C<"\n"> represents one line, we would like to think +of it as empty. Then + + "" =~ /^$/; # matches + "\n" =~ /^$/; # matches, "\n" is ignored + + "" =~ /./; # doesn't match; it needs a char + "" =~ /^.$/; # doesn't match; it needs a char + "\n" =~ /^.$/; # doesn't match; it needs a char other than "\n" + "a" =~ /^.$/; # matches + "a\n" =~ /^.$/; # matches, ignores the "\n" + +This behavior is convenient, because we usually want to ignore +newlines when we count and match characters in a line. Sometimes, +however, we want to keep track of newlines. We might even want C<^> +and C<$> to anchor at the beginning and end of lines within the +string, rather than just the beginning and end of the string. Perl +allows us to choose between ignoring and paying attention to newlines +by using the C<//s> and C<//m> modifiers.
C<//s> and C<//m> stand for +single line and multi-line and they determine whether a string is to +be treated as one continuous string, or as a set of lines. The two +modifiers affect two aspects of how the regexp is interpreted: 1) how +the C<'.'> character class is defined, and 2) where the anchors C<^> +and C<$> are able to match. Here are the four possible combinations: + +=over 4 + +=item * + +no modifiers (//): Default behavior. C<'.'> matches any character +except C<"\n">. C<^> matches only at the beginning of the string and +C<$> matches only at the end or before a newline at the end. + +=item * + +s modifier (//s): Treat string as a single long line. C<'.'> matches +any character, even C<"\n">. C<^> matches only at the beginning of +the string and C<$> matches only at the end or before a newline at the +end. + +=item * + +m modifier (//m): Treat string as a set of multiple lines. C<'.'> +matches any character except C<"\n">. C<^> and C<$> are able to match +at the start or end of I<any> line within the string. + +=item * + +both s and m modifiers (//sm): Treat string as a single long line, but +detect multiple lines. C<'.'> matches any character, even +C<"\n">. C<^> and C<$>, however, are able to match at the start or end +of I<any> line within the string. + +=back + +Here are examples of C<//s> and C<//m> in action: + + $x = "There once was a girl\nWho programmed in Perl\n"; + + $x =~ /^Who/; # doesn't match, "Who" not at start of string + $x =~ /^Who/s; # doesn't match, "Who" not at start of string + $x =~ /^Who/m; # matches, "Who" at start of second line + $x =~ /^Who/sm; # matches, "Who" at start of second line + + $x =~ /girl.Who/; # doesn't match, "." doesn't match "\n" + $x =~ /girl.Who/s; # matches, "." matches "\n" + $x =~ /girl.Who/m; # doesn't match, "." doesn't match "\n" + $x =~ /girl.Who/sm; # matches, "." matches "\n" + +Most of the time, the default behavior is what is wanted, but C<//s> and +C<//m> are occasionally very useful.
If C<//m> is being used, the start +of the string can still be matched with C<\A> and the end of string +can still be matched with the anchors C<\Z> (matches both the end and +the newline before, like C<$>), and C<\z> (matches only the end): + + $x =~ /^Who/m; # matches, "Who" at start of second line + $x =~ /\AWho/m; # doesn't match, "Who" is not at start of string + + $x =~ /girl$/m; # matches, "girl" at end of first line + $x =~ /girl\Z/m; # doesn't match, "girl" is not at end of string + + $x =~ /Perl\Z/m; # matches, "Perl" is at newline before end + $x =~ /Perl\z/m; # doesn't match, "Perl" is not at end of string + +We now know how to create choices among classes of characters in a +regexp. What about choices among words or character strings? Such +choices are described in the next section. + +=head2 Matching this or that + +Sometimes we would like our regexp to be able to match different +possible words or character strings. This is accomplished by using +the B<alternation> metacharacter C<|>. To match C<dog> or C<cat>, we +form the regexp C<dog|cat>. As before, perl will try to match the +regexp at the earliest possible point in the string. At each +character position, perl will first try to match the first +alternative, C<dog>. If C<dog> doesn't match, perl will then try the +next alternative, C<cat>. If C<cat> doesn't match either, then the +match fails and perl moves to the next position in the string. Some +examples: + + "cats and dogs" =~ /cat|dog|bird/; # matches "cat" + "cats and dogs" =~ /dog|cat|bird/; # matches "cat" + +Even though C<dog> is the first alternative in the second regexp, +C<cat> is able to match earlier in the string. + + "cats" =~ /c|ca|cat|cats/; # matches "c" + "cats" =~ /cats|cat|ca|c/; # matches "cats" + +Here, all the alternatives match at the first string position, so the +first alternative is the one that matches.
If some of the +alternatives are truncations of the others, put the longest ones first +to give them a chance to match. + + "cab" =~ /a|b|c/ # matches "c" + # /a|b|c/ == /[abc]/ + +The last example points out that character classes are like +alternations of characters. At a given character position, the first +alternative that allows the regexp match to succeed will be the one +that matches. + +=head2 Grouping things and hierarchical matching + +Alternation allows a regexp to choose among alternatives, but by +itself it is unsatisfying. The reason is that each alternative is a whole +regexp, but sometimes we want alternatives for just part of a +regexp. For instance, suppose we want to search for housecats or +housekeepers. The regexp C<housecat|housekeeper> fits the bill, but is +inefficient because we had to type C<house> twice. It would be nice to +have parts of the regexp be constant, like C<house>, and some +parts have alternatives, like C<cat|keeper>. + +The B<grouping> metacharacters C<()> solve this problem. Grouping +allows parts of a regexp to be treated as a single unit. Parts of a +regexp are grouped by enclosing them in parentheses. Thus we could solve +the C<housecat|housekeeper> problem by forming the regexp as +C<house(cat|keeper)>. The regexp C<house(cat|keeper)> means match +C<house> followed by either C<cat> or C<keeper>. Some more examples +are + + /(a|b)b/; # matches 'ab' or 'bb' + /(ac|b)b/; # matches 'acb' or 'bb' + /(^a|b)c/; # matches 'ac' at start of string or 'bc' anywhere + /(a|[bc])d/; # matches 'ad', 'bd', or 'cd' + + /house(cat|)/; # matches either 'housecat' or 'house' + /house(cat(s|)|)/; # matches either 'housecats' or 'housecat' or + # 'house'. Note groups can be nested.
+ + /(19|20|)\d\d/; # match years 19xx, 20xx, or the Y2K problem, xx + "20" =~ /(19|20|)\d\d/; # matches the null alternative '()\d\d', + # because '20\d\d' can't match + +Alternations behave the same way in groups as out of them: at a given +string position, the leftmost alternative that allows the regexp to +match is taken. So in the last example at the first string position, +C<"20"> matches the second alternative, but there is nothing left over +to match the next two digits C<\d\d>. So perl moves on to the next +alternative, which is the null alternative and that works, since +C<"20"> is two digits. + +The process of trying one alternative, seeing if it matches, and +moving on to the next alternative if it doesn't, is called +B<backtracking>. The term 'backtracking' comes from the idea that +matching a regexp is like a walk in the woods. Successfully matching +a regexp is like arriving at a destination. There are many possible +trailheads, one for each string position, and each one is tried in +order, left to right. From each trailhead there may be many paths, +some of which get you there, and some which are dead ends. When you +walk along a trail and hit a dead end, you have to backtrack along the +trail to an earlier point to try another trail. If you hit your +destination, you stop immediately and forget about trying all the +other trails. You are persistent, and only if you have tried all the +trails from all the trailheads and not arrived at your destination, do +you declare failure. To be concrete, here is a step-by-step analysis +of what perl does when it tries to match the regexp + + "abcde" =~ /(abd|abc)(df|d|de)/; + +=over 4 + +=item 0 + +Start with the first letter in the string 'a'. + +=item 1 + +Try the first alternative in the first group 'abd'. + +=item 2 + +Match 'a' followed by 'b'. So far so good. + +=item 3 + +'d' in the regexp doesn't match 'c' in the string - a dead +end.
So backtrack two characters and pick the second alternative in +the first group 'abc'. + +=item 4 + +Match 'a' followed by 'b' followed by 'c'. We are on a roll +and have satisfied the first group. Set $1 to 'abc'. + +=item 5 + +Move on to the second group and pick the first alternative +'df'. + +=item 6 + +Match the 'd'. + +=item 7 + +'f' in the regexp doesn't match 'e' in the string, so a dead +end. Backtrack one character and pick the second alternative in the +second group 'd'. + +=item 8 + +'d' matches. The second grouping is satisfied, so set $2 to +'d'. + +=item 9 + +We are at the end of the regexp, so we are done! We have +matched 'abcd' out of the string "abcde". + +=back + +There are a couple of things to note about this analysis. First, the +third alternative in the second group 'de' also allows a match, but we +stopped before we got to it - at a given character position, leftmost +wins. Second, we were able to get a match at the first character +position of the string 'a'. If there were no matches at the first +position, perl would move to the second character position 'b' and +attempt the match all over again. Only when all possible paths at all +possible character positions have been exhausted does perl give +up and declare S<C<$string =~ /(abd|abc)(df|d|de)/;> > to be false. + +Even with all this work, regexp matching happens remarkably fast. To +speed things up, during compilation stage, perl compiles the regexp +into a compact sequence of opcodes that can often fit inside a +processor cache. When the code is executed, these opcodes can then run +at full throttle and search very quickly. + +=head2 Extracting matches + +The grouping metacharacters C<()> also serve another completely +different function: they allow the extraction of the parts of a string +that matched. This is very useful to find out what matched and for +text processing in general.
For each grouping, the part that matched +inside goes into the special variables C<$1>, C<$2>, etc. They can be +used just as ordinary variables: + + # extract hours, minutes, seconds + $time =~ /(\d\d):(\d\d):(\d\d)/; # match hh:mm:ss format + $hours = $1; + $minutes = $2; + $seconds = $3; + +Now, we know that in scalar context, +S<C<$time =~ /(\d\d):(\d\d):(\d\d)/> > returns a true or false +value. In list context, however, it returns the list of matched values +C<($1,$2,$3)>. So we could write the code more compactly as + + # extract hours, minutes, seconds + ($hours, $minutes, $seconds) = ($time =~ /(\d\d):(\d\d):(\d\d)/); + +If the groupings in a regexp are nested, C<$1> gets the group with the +leftmost opening parenthesis, C<$2> the next opening parenthesis, +etc. For example, here is a complex regexp and the matching variables +indicated below it: + + /(ab(cd|ef)((gi)|j))/; + 1 2 34 + +so that if the regexp matched, e.g., C<$2> would contain 'cd' or 'ef'. +For convenience, perl sets C<$+> to the highest numbered C<$1>, C<$2>, +... that got assigned. + +Closely associated with the matching variables C<$1>, C<$2>, ... are +the B<backreferences> C<\1>, C<\2>, ... . Backreferences are simply +matching variables that can be used I<inside> a regexp. This is a +really nice feature - what matches later in a regexp can depend on +what matched earlier in the regexp. Suppose we wanted to look +for doubled words in text, like 'the the'. The following regexp finds +all 3-letter doubles with a space in between: + + /(\w\w\w)\s\1/; + +The grouping assigns a value to \1, so that the same 3 letter sequence +is used for both parts. Here are some words with repeated parts: + + % simple_grep '^(\w\w\w\w|\w\w\w|\w\w|\w)\1$' /usr/dict/words + beriberi + booboo + coco + mama + murmur + papa + +The regexp has a single grouping which considers 4-letter +combinations, then 3-letter combinations, etc. and uses C<\1> to look for +a repeat.
Although C<$1> and C<\1> represent the same thing, care should be +taken to use matched variables C<$1>, C<$2>, ... only outside a regexp +and backreferences C<\1>, C<\2>, ... only inside a regexp; not doing +so may lead to surprising and/or undefined results. + +In addition to what was matched, Perl 5.6.0 also provides the +positions of what was matched with the C<@-> and C<@+> +arrays. C<$-[0]> is the position of the start of the entire match and +C<$+[0]> is the position of the end. Similarly, C<$-[n]> is the +position of the start of the C<$n> match and C<$+[n]> is the position +of the end. If C<$n> is undefined, so are C<$-[n]> and C<$+[n]>. Then +this code + + $x = "Mmm...donut, thought Homer"; + $x =~ /^(Mmm|Yech)\.\.\.(donut|peas)/; # matches + foreach $expr (1..$#-) { + print "Match $expr: '${$expr}' at position ($-[$expr],$+[$expr])\n"; + } + +prints + + Match 1: 'Mmm' at position (0,3) + Match 2: 'donut' at position (6,11) + +Even if there are no groupings in a regexp, it is still possible to +find out what exactly matched in a string. If you use them, perl +will set C<$`> to the part of the string before the match, will set C<$&> +to the part of the string that matched, and will set C<$'> to the part +of the string after the match. An example: + + $x = "the cat caught the mouse"; + $x =~ /cat/; # $` = 'the ', $& = 'cat', $' = ' caught the mouse' + $x =~ /the/; # $` = '', $& = 'the', $' = ' cat caught the mouse' + +In the second match, S<C<$` = ''> > because the regexp matched at the +first character position in the string and stopped; it never saw the +second 'the'. It is important to note that using C<$`> and C<$'> +slows down regexp matching quite a bit, and C< $& > slows it down to a +lesser extent, because if they are used in one regexp in a program, +they are generated for I<all> regexps in the program. So if raw +performance is a goal of your application, they should be avoided.
+If you need them, use C<@-> and C<@+> instead: + + $` is the same as substr( $x, 0, $-[0] ) + $& is the same as substr( $x, $-[0], $+[0]-$-[0] ) + $' is the same as substr( $x, $+[0] ) + +=head2 Matching repetitions + +The examples in the previous section display an annoying weakness. We +were only matching 3-letter words, or syllables of 4 letters or +less. We'd like to be able to match words or syllables of any length, +without writing out tedious alternatives like +C<\w\w\w\w|\w\w\w|\w\w|\w>. + +This is exactly the problem the B<quantifier> metacharacters C<?>, +C<*>, C<+>, and C<{}> were created for. They allow us to determine the +number of repeats of a portion of a regexp we consider to be a +match. Quantifiers are put immediately after the character, character +class, or grouping that we want to specify. They have the following +meanings: + +=over 4 + +=item * + +C<a?> = match 'a' 1 or 0 times + +=item * + +C<a*> = match 'a' 0 or more times, i.e., any number of times + +=item * + +C<a+> = match 'a' 1 or more times, i.e., at least once + +=item * + +C<a{n,m}> = match at least C<n> times, but not more than C<m> +times. + +=item * + +C<a{n,}> = match C<n> or more times + +=item * + +C<a{n}> = match exactly C<n> times + +=back + +Here are some examples: + + /[a-z]+\s+\d*/; # match a lowercase word, at least some space, and + # any number of digits + /(\w+)\s+\1/; # match doubled words of arbitrary length + /y(es)?/i; # matches 'y', 'Y', or a case-insensitive 'yes' + $year =~ /\d{2,4}/; # make sure year is at least 2 but not more + # than 4 digits + $year =~ /\d{4}|\d{2}/; # better match; throw out 3 digit dates + $year =~ /\d{2}(\d{2})?/; # same thing written differently. However, + # this produces $1 and the other does not. + + % simple_grep '^(\w+)\1$' /usr/dict/words # isn't this easier?
+ beriberi + booboo + coco + mama + murmur + papa + +For all of these quantifiers, perl will try to match as much of the +string as possible, while still allowing the regexp to succeed. Thus +with C</a?.../>, perl will first try to match the regexp with the C<a> +present; if that fails, perl will try to match the regexp without the +C<a> present. For the quantifier C<*>, we get the following: + + $x = "the cat in the hat"; + $x =~ /^(.*)(cat)(.*)$/; # matches, + # $1 = 'the ' + # $2 = 'cat' + # $3 = ' in the hat' + +Which is what we might expect, the match finds the only C<cat> in the +string and locks onto it. Consider, however, this regexp: + + $x =~ /^(.*)(at)(.*)$/; # matches, + # $1 = 'the cat in the h' + # $2 = 'at' + # $3 = '' (0 matches) + +One might initially guess that perl would find the C<at> in C<cat> and +stop there, but that wouldn't give the longest possible string to the +first quantifier C<.*>. Instead, the first quantifier C<.*> grabs as +much of the string as possible while still having the regexp match. In +this example, that means having the C<at> sequence match the final C<at> +in the string. The other important principle illustrated here is that +when there are two or more elements in a regexp, the I<leftmost> +quantifier, if there is one, gets to grab as much of the string as +possible, leaving the rest of the regexp to fight over scraps. Thus in +our example, the first quantifier C<.*> grabs most of the string, while +the second quantifier C<.*> gets the empty string. Quantifiers that +grab as much of the string as possible are called B<maximal match> or +B<greedy> quantifiers. + +When a regexp can match a string in several different ways, we can use +the principles above to predict which way the regexp will match: + +=over 4 + +=item * + +Principle 0: Taken as a whole, any regexp will be matched at the +earliest possible position in the string.
+ +=item * + +Principle 1: In an alternation C<a|b|c...>, the leftmost alternative +that allows a match for the whole regexp will be the one used. + +=item * + +Principle 2: The maximal matching quantifiers C<?>, C<*>, C<+> and +C<{n,m}> will in general match as much of the string as possible while +still allowing the whole regexp to match. + +=item * + +Principle 3: If there are two or more elements in a regexp, the +leftmost greedy quantifier, if any, will match as much of the string +as possible while still allowing the whole regexp to match. The next +leftmost greedy quantifier, if any, will try to match as much of the +string remaining available to it as possible, while still allowing the +whole regexp to match. And so on, until all the regexp elements are +satisfied. + +=back + +As we have seen above, Principle 0 overrides the others - the regexp +will be matched as early as possible, with the other principles +determining how the regexp matches at that earliest character +position. + +Here is an example of these principles in action: + + $x = "The programming republic of Perl"; + $x =~ /^(.+)(e|r)(.*)$/; # matches, + # $1 = 'The programming republic of Pe' + # $2 = 'r' + # $3 = 'l' + +This regexp matches at the earliest string position, C<'T'>. One +might think that C<e>, being leftmost in the alternation, would be +matched, but C<r> produces the longest string in the first quantifier. + + $x =~ /(m{1,2})(.*)$/; # matches, + # $1 = 'mm' + # $2 = 'ing republic of Perl' + +Here, The earliest possible match is at the first C<'m'> in +C<programming>. C<m{1,2}> is the first quantifier, so it gets to match +a maximal C<mm>. + + $x =~ /.*(m{1,2})(.*)$/; # matches, + # $1 = 'm' + # $2 = 'ing republic of Perl' + +Here, the regexp matches at the start of the string. The first +quantifier C<.*> grabs as much as possible, leaving just a single +C<'m'> for the second quantifier C<m{1,2}>. 
+ + $x =~ /(.?)(m{1,2})(.*)$/; # matches, + # $1 = 'a' + # $2 = 'mm' + # $3 = 'ing republic of Perl' + +Here, C<.?> eats its maximal one character at the earliest possible +position in the string, C<'a'> in C<programming>, leaving C<m{1,2}> +the opportunity to match both C<m>'s. Finally, + + "aXXXb" =~ /(X*)/; # matches with $1 = '' + +because it can match zero copies of C<'X'> at the beginning of the +string. If you definitely want to match at least one C<'X'>, use +C<X+>, not C<X*>. + +Sometimes greed is not good. At times, we would like quantifiers to +match a I<minimal> piece of string, rather than a maximal piece. For +this purpose, Larry Wall created the S<B<minimal match> > or +B<non-greedy> quantifiers C<??>,C<*?>, C<+?>, and C<{}?>. These are +the usual quantifiers with a C<?> appended to them. They have the +following meanings: + +=over 4 + +=item * + +C<a??> = match 'a' 0 or 1 times. Try 0 first, then 1. + +=item * + +C<a*?> = match 'a' 0 or more times, i.e., any number of times, +but as few times as possible + +=item * + +C<a+?> = match 'a' 1 or more times, i.e., at least once, but +as few times as possible + +=item * + +C<a{n,m}?> = match at least C<n> times, not more than C<m> +times, as few times as possible + +=item * + +C<a{n,}?> = match at least C<n> times, but as few times as +possible + +=item * + +C<a{n}?> = match exactly C<n> times. Because we match exactly +C<n> times, C<a{n}?> is equivalent to C<a{n}> and is just there for +notational consistency. + +=back + +Let's look at the example above, but with minimal quantifiers: + + $x = "The programming republic of Perl"; + $x =~ /^(.+?)(e|r)(.*)$/; # matches, + # $1 = 'Th' + # $2 = 'e' + # $3 = ' programming republic of Perl' + +The minimal string that will allow both the start of the string C<^> +and the alternation to match is C<Th>, with the alternation C<e|r> +matching C<e>. The second quantifier C<.*> is free to gobble up the +rest of the string. 
+ + $x =~ /(m{1,2}?)(.*?)$/; # matches, + # $1 = 'm' + # $2 = 'ming republic of Perl' + +The first string position that this regexp can match is at the first +C<'m'> in C<programming>. At this position, the minimal C<m{1,2}?> +matches just one C<'m'>. Although the second quantifier C<.*?> would +prefer to match no characters, it is constrained by the end-of-string +anchor C<$> to match the rest of the string. + + $x =~ /(.*?)(m{1,2}?)(.*)$/; # matches, + # $1 = 'The progra' + # $2 = 'm' + # $3 = 'ming republic of Perl' + +In this regexp, you might expect the first minimal quantifier C<.*?> +to match the empty string, because it is not constrained by a C<^> +anchor to match the beginning of the word. Principle 0 applies here, +however. Because it is possible for the whole regexp to match at the +start of the string, it I<will> match at the start of the string. Thus +the first quantifier has to match everything up to the first C<m>. The +second minimal quantifier matches just one C<m> and the third +quantifier matches the rest of the string. + + $x =~ /(.??)(m{1,2})(.*)$/; # matches, + # $1 = 'a' + # $2 = 'mm' + # $3 = 'ing republic of Perl' + +Just as in the previous regexp, the first quantifier C<.??> can match +earliest at position C<'a'>, so it does. The second quantifier is +greedy, so it matches C<mm>, and the third matches the rest of the +string. + +We can modify principle 3 above to take into account non-greedy +quantifiers: + +=over 4 + +=item * + +Principle 3: If there are two or more elements in a regexp, the +leftmost greedy (non-greedy) quantifier, if any, will match as much +(little) of the string as possible while still allowing the whole +regexp to match. The next leftmost greedy (non-greedy) quantifier, if +any, will try to match as much (little) of the string remaining +available to it as possible, while still allowing the whole regexp to +match. And so on, until all the regexp elements are satisfied. 
+ +=back + +Just like alternation, quantifiers are also susceptible to +backtracking. Here is a step-by-step analysis of the example + + $x = "the cat in the hat"; + $x =~ /^(.*)(at)(.*)$/; # matches, + # $1 = 'the cat in the h' + # $2 = 'at' + # $3 = '' (0 matches) + +=over 4 + +=item 0 + +Start with the first letter in the string 't'. + +=item 1 + +The first quantifier '.*' starts out by matching the whole +string 'the cat in the hat'. + +=item 2 + +'a' in the regexp element 'at' doesn't match the end of the +string. Backtrack one character. + +=item 3 + +'a' in the regexp element 'at' still doesn't match the last +letter of the string 't', so backtrack one more character. + +=item 4 + +Now we can match the 'a' and the 't'. + +=item 5 + +Move on to the third element '.*'. Since we are at the end of +the string and '.*' can match 0 times, assign it the empty string. + +=item 6 + +We are done! + +=back + +Most of the time, all this moving forward and backtracking happens +quickly and searching is fast. There are some pathological regexps, +however, whose execution time exponentially grows with the size of the +string. A typical structure that blows up in your face is of the form + + /(a|b+)*/; + +The problem is the nested indeterminate quantifiers. There are many +different ways of partitioning a string of length n between the C<+> +and C<*>: one repetition with C<b+> of length n, two repetitions with +the first C<b+> length k and the second with length n-k, m repetitions +whose bits add up to length n, etc. In fact there are an exponential +number of ways to partition a string as a function of length. A +regexp may get lucky and match early in the process, but if there is +no match, perl will try I<every> possibility before giving up. So be +careful with nested C<*>'s, C<{n,m}>'s, and C<+>'s. The book +I<Mastering regular expressions> by Jeffrey Friedl gives a wonderful +discussion of this and other efficiency issues. 
+ +=head2 Building a regexp + +At this point, we have all the basic regexp concepts covered, so let's +give a more involved example of a regular expression. We will build a +regexp that matches numbers. + +The first task in building a regexp is to decide what we want to match +and what we want to exclude. In our case, we want to match both +integers and floating point numbers and we want to reject any string +that isn't a number. + +The next task is to break the problem down into smaller problems that +are easily converted into a regexp. + +The simplest case is integers. These consist of a sequence of digits, +with an optional sign in front. The digits we can represent with +C<\d+> and the sign can be matched with C<[+-]>. Thus the integer +regexp is + + /[+-]?\d+/; # matches integers + +A floating point number potentially has a sign, an integral part, a +decimal point, a fractional part, and an exponent. One or more of these +parts is optional, so we need to check out the different +possibilities. Floating point numbers which are in proper form include +123., 0.345, .34, -1e6, and 25.4E-72. As with integers, the sign out +front is completely optional and can be matched by C<[+-]?>. We can +see that if there is no exponent, floating point numbers must have a +decimal point, otherwise they are integers. We might be tempted to +model these with C<\d*\.\d*>, but this would also match just a single +decimal point, which is not a number. So the three cases of floating +point number sans exponent are + + /[+-]?\d+\./; # 1., 321., etc. + /[+-]?\.\d+/; # .1, .234, etc. + /[+-]?\d+\.\d+/; # 1.0, 30.56, etc. + +These can be combined into a single regexp with a three-way alternation: + + /[+-]?(\d+\.\d+|\d+\.|\.\d+)/; # floating point, no exponent + +In this alternation, it is important to put C<'\d+\.\d+'> before +C<'\d+\.'>. If C<'\d+\.'> were first, the regexp would happily match that +and ignore the fractional part of the number. 
+ +Now consider floating point numbers with exponents. The key +observation here is that I<both> integers and numbers with decimal +points are allowed in front of an exponent. Then exponents, like the +overall sign, are independent of whether we are matching numbers with +or without decimal points, and can be 'decoupled' from the +mantissa. The overall form of the regexp now becomes clear: + + /^(optional sign)(integer | f.p. mantissa)(optional exponent)$/; + +The exponent is an C<e> or C<E>, followed by an integer. So the +exponent regexp is + + /[eE][+-]?\d+/; # exponent + +Putting all the parts together, we get a regexp that matches numbers: + + /^[+-]?(\d+\.\d+|\d+\.|\.\d+|\d+)([eE][+-]?\d+)?$/; # Ta da! + +Long regexps like this may impress your friends, but can be hard to +decipher. In complex situations like this, the C<//x> modifier for a +match is invaluable. It allows one to put nearly arbitrary whitespace +and comments into a regexp without affecting their meaning. Using it, +we can rewrite our 'extended' regexp in the more pleasing form + + /^ + [+-]? # first, match an optional sign + ( # then match integers or f.p. mantissas: + \d+\.\d+ # mantissa of the form a.b + |\d+\. # mantissa of the form a. + |\.\d+ # mantissa of the form .b + |\d+ # integer of the form a + ) + ([eE][+-]?\d+)? # finally, optionally match an exponent + $/x; + +If whitespace is mostly irrelevant, how does one include space +characters in an extended regexp? The answer is to backslash it +S<C<'\ '> > or put it in a character class S<C<[ ]> >. The same thing +goes for pound signs, use C<\#> or C<[#]>. For instance, Perl allows +a space between the sign and the mantissa/integer, and we could add +this to our regexp as follows: + + /^ + [+-]?\ * # first, match an optional sign *and space* + ( # then match integers or f.p. mantissas: + \d+\.\d+ # mantissa of the form a.b + |\d+\. # mantissa of the form a. 
+ |\.\d+ # mantissa of the form .b + |\d+ # integer of the form a + ) + ([eE][+-]?\d+)? # finally, optionally match an exponent + $/x; + +In this form, it is easier to see a way to simplify the +alternation. Alternatives 1, 2, and 4 all start with C<\d+>, so it +could be factored out: + + /^ + [+-]?\ * # first, match an optional sign + ( # then match integers or f.p. mantissas: + \d+ # start out with a ... + ( + \.\d* # mantissa of the form a.b or a. + )? # ? takes care of integers of the form a + |\.\d+ # mantissa of the form .b + ) + ([eE][+-]?\d+)? # finally, optionally match an exponent + $/x; + +or written in the compact form, + + /^[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$/; + +This is our final regexp. To recap, we built a regexp by + +=over 4 + +=item * + +specifying the task in detail, + +=item * + +breaking down the problem into smaller parts, + +=item * + +translating the small parts into regexps, + +=item * + +combining the regexps, + +=item * + +and optimizing the final combined regexp. + +=back + +These are also the typical steps involved in writing a computer +program. This makes perfect sense, because regular expressions are +essentially programs written in a little computer language that specifies +patterns. + +=head2 Using regular expressions in Perl + +The last topic of Part 1 briefly covers how regexps are used in Perl +programs. Where do they fit into Perl syntax? + +We have already introduced the matching operator in its default +C</regexp/> and arbitrary delimiter C<m!regexp!> forms. We have used +the binding operator C<=~> and its negation C<!~> to test for string +matches. Associated with the matching operator, we have discussed the +single line C<//s>, multi-line C<//m>, case-insensitive C<//i> and +extended C<//x> modifiers. + +There are a few more things you might want to know about matching +operators.
First, we pointed out earlier that variables in regexps are +substituted before the regexp is evaluated: + + $pattern = 'Seuss'; + while (<>) { + print if /$pattern/; + } + +This will print any lines containing the word C<Seuss>. It is not as +efficient as it could be, however, because perl has to re-evaluate +C<$pattern> each time through the loop. If C<$pattern> won't be +changing over the lifetime of the script, we can add the C<//o> +modifier, which directs perl to only perform variable substitutions +once: + + #!/usr/bin/perl + # Improved simple_grep + $regexp = shift; + while (<>) { + print if /$regexp/o; # a good deal faster + } + +If you change C<$pattern> after the first substitution happens, perl +will ignore it. If you don't want any substitutions at all, use the +special delimiter C<m''>: + + $pattern = 'Seuss'; + while (<>) { + print if m'$pattern'; # matches '$pattern', not 'Seuss' + } + +C<m''> acts like single quotes on a regexp; all other C<m> delimiters +act like double quotes. If the regexp evaluates to the empty string, +the regexp in the I<last successful match> is used instead. So we have + + "dog" =~ /d/; # 'd' matches + "dogbert" =~ //; # this matches the 'd' regexp used before + +The final two modifiers C<//g> and C<//c> concern multiple matches. +The modifier C<//g> stands for global matching and allows the +matching operator to match within a string as many times as possible. +In scalar context, successive invocations against a string will have +C<//g> jump from match to match, keeping track of position in the +string as it goes along. You can get or set the position with the +C<pos()> function. + +The use of C<//g> is shown in the following example. Suppose we have +a string that consists of words separated by spaces.
If we know how +many words there are in advance, we could extract the words using +groupings: + + $x = "cat dog house"; # 3 words + $x =~ /^\s*(\w+)\s+(\w+)\s+(\w+)\s*$/; # matches, + # $1 = 'cat' + # $2 = 'dog' + # $3 = 'house' + +But what if we had an indeterminate number of words? This is the sort +of task C<//g> was made for. To extract all words, form the simple +regexp C<(\w+)> and loop over all matches with C</(\w+)/g>: + + while ($x =~ /(\w+)/g) { + print "Word is $1, ends at position ", pos $x, "\n"; + } + +prints + + Word is cat, ends at position 3 + Word is dog, ends at position 7 + Word is house, ends at position 13 + +A failed match or changing the target string resets the position. If +you don't want the position reset after failure to match, add the +C<//c> modifier, as in C</regexp/gc>. The current position in the string is +associated with the string, not the regexp. This means that different +strings have different positions and their respective positions can be +set or read independently. + +In list context, C<//g> returns a list of matched groupings, or if +there are no groupings, a list of matches to the whole regexp. So if +we wanted just the words, we could use + + @words = ($x =~ /(\w+)/g); # matches, + # $words[0] = 'cat' + # $words[1] = 'dog' + # $words[2] = 'house' + +Closely associated with the C<//g> modifier is the C<\G> anchor. The +C<\G> anchor matches at the point where the previous C<//g> match left +off. C<\G> allows us to easily do context-sensitive matching: + + $metric = 1; # use metric units + ... + $x = <FILE>; # read in measurement + $x =~ /^([+-]?\d+)\s*/g; # get magnitude + $weight = $1; + if ($metric) { # error checking + print "Units error!" unless $x =~ /\Gkg\./g; + } + else { + print "Units error!" unless $x =~ /\Glbs\./g; + } + $x =~ /\G\s+(widget|sprocket)/g; # continue processing + +The combination of C<//g> and C<\G> allows us to process the string a +bit at a time and use arbitrary Perl logic to decide what to do next.
+ +C<\G> is also invaluable in processing fixed length records with +regexps. Suppose we have a snippet of coding region DNA, encoded as +base pair letters C<ATCGTTGAAT...> and we want to find all the stop +codons C<TGA>. In a coding region, codons are 3-letter sequences, so +we can think of the DNA snippet as a sequence of 3-letter records. The +naive regexp + + # expanded, this is "ATC GTT GAA TGC AAA TGA CAT GAC" + $dna = "ATCGTTGAATGCAAATGACATGAC"; + $dna =~ /TGA/; + +doesn't work; it may match an C<TGA>, but there is no guarantee that +the match is aligned with codon boundaries, e.g., the substring +S<C<GTT GAA> > gives a match. A better solution is + + while ($dna =~ /(\w\w\w)*?TGA/g) { # note the minimal *? + print "Got a TGA stop codon at position ", pos $dna, "\n"; + } + +which prints + + Got a TGA stop codon at position 18 + Got a TGA stop codon at position 23 + +Position 18 is good, but position 23 is bogus. What happened? + +The answer is that our regexp works well until we get past the last +real match. Then the regexp will fail to match a synchronized C<TGA> +and start stepping ahead one character position at a time, not what we +want. The solution is to use C<\G> to anchor the match to the codon +alignment: + + while ($dna =~ /\G(\w\w\w)*?TGA/g) { + print "Got a TGA stop codon at position ", pos $dna, "\n"; + } + +This prints + + Got a TGA stop codon at position 18 + +which is the correct answer. This example illustrates that it is +important not only to match what is desired, but to reject what is not +desired. + +B<search and replace> + +Regular expressions also play a big role in B<search and replace> +operations in Perl. Search and replace is accomplished with the +C<s///> operator. The general form is +C<s/regexp/replacement/modifiers>, with everything we know about +regexps and modifiers applying in this case as well. The +C<replacement> is a Perl double quoted string that replaces in the +string whatever is matched with the C<regexp>. 
The operator C<=~> is +also used here to associate a string with C<s///>. If matching +against C<$_>, the S<C<$_ =~> > can be dropped. If there is a match, +C<s///> returns the number of substitutions made, otherwise it returns +false. Here are a few examples: + + $x = "Time to feed the cat!"; + $x =~ s/cat/hacker/; # $x contains "Time to feed the hacker!" + if ($x =~ s/^(Time.*hacker)!$/$1 now!/) { + $more_insistent = 1; + } + $y = "'quoted words'"; + $y =~ s/^'(.*)'$/$1/; # strip single quotes, + # $y contains "quoted words" + +In the last example, the whole string was matched, but only the part +inside the single quotes was grouped. With the C<s///> operator, the +matched variables C<$1>, C<$2>, etc. are immediately available for use +in the replacement expression, so we use C<$1> to replace the quoted +string with just what was quoted. With the global modifier, C<s///g> +will search and replace all occurrences of the regexp in the string: + + $x = "I batted 4 for 4"; + $x =~ s/4/four/; # doesn't do it all: + # $x contains "I batted four for 4" + $x = "I batted 4 for 4"; + $x =~ s/4/four/g; # does it all: + # $x contains "I batted four for four" + +If you prefer 'regex' over 'regexp' in this tutorial, you could use +the following program to replace it: + + % cat > simple_replace + #!/usr/bin/perl + $regexp = shift; + $replacement = shift; + while (<>) { + s/$regexp/$replacement/go; + print; + } + ^D + + % simple_replace regexp regex perlretut.pod + +In C<simple_replace> we used the C<s///g> modifier to replace all +occurrences of the regexp on each line and the C<s///o> modifier to +compile the regexp only once. As with C<simple_grep>, both the +C<print> and the C<s/$regexp/$replacement/go> use C<$_> implicitly. + +A modifier available specifically to search and replace is the +C<s///e> evaluation modifier. C<s///e> wraps an C<eval{...}> around +the replacement string and the evaluated result is substituted for the +matched substring. 
C<s///e> is useful if you need to do a bit of +computation in the process of replacing text. This example counts +character frequencies in a line: + + $x = "Bill the cat"; + $x =~ s/(.)/$chars{$1}++;$1/eg; # final $1 replaces char with itself + print "frequency of '$_' is $chars{$_}\n" + foreach (sort {$chars{$b} <=> $chars{$a}} keys %chars); + +This prints + + frequency of ' ' is 2 + frequency of 't' is 2 + frequency of 'l' is 2 + frequency of 'B' is 1 + frequency of 'c' is 1 + frequency of 'e' is 1 + frequency of 'h' is 1 + frequency of 'i' is 1 + frequency of 'a' is 1 + +As with the match C<m//> operator, C<s///> can use other delimiters, +such as C<s!!!> and C<s{}{}>, and even C<s{}//>. If single quotes are +used C<s'''>, then the regexp and replacement are treated as single +quoted strings and there are no substitutions. C<s///> in list context +returns the same thing as in scalar context, i.e., the number of +matches. + +B<The split operator> + +The B<C<split> > function can also optionally use a matching operator +C<m//> to split a string. C<split /regexp/, string, limit> splits +C<string> into a list of substrings and returns that list. The regexp +is used to match the character sequence that the C<string> is split +with respect to. The C<limit>, if present, constrains splitting into +no more than C<limit> number of strings. For example, to split a +string into words, use + + $x = "Calvin and Hobbes"; + @words = split /\s+/, $x; # $words[0] = 'Calvin' + # $words[1] = 'and' + # $words[2] = 'Hobbes' + +If the empty regexp C<//> is used, the regexp always matches and +the string is split into individual characters. If the regexp has +groupings, then the list produced contains the matched substrings from the +groupings as well.
For instance, + + $x = "/usr/bin/perl"; + @dirs = split m!/!, $x; # $dirs[0] = '' + # $dirs[1] = 'usr' + # $dirs[2] = 'bin' + # $dirs[3] = 'perl' + @parts = split m!(/)!, $x; # $parts[0] = '' + # $parts[1] = '/' + # $parts[2] = 'usr' + # $parts[3] = '/' + # $parts[4] = 'bin' + # $parts[5] = '/' + # $parts[6] = 'perl' + +Since the first character of $x matched the regexp, C<split> prepended +an empty initial element to the list. + +If you have read this far, congratulations! You now have all the basic +tools needed to use regular expressions to solve a wide range of text +processing problems. If this is your first time through the tutorial, +why not stop here and play around with regexps a while... S<Part 2> +concerns the more esoteric aspects of regular expressions and those +concepts certainly aren't needed right at the start. + +=head1 Part 2: Power tools + +OK, you know the basics of regexps and you want to know more. If +matching regular expressions is analogous to a walk in the woods, then +the tools discussed in Part 1 are analogous to topo maps and a +compass, basic tools we use all the time. Most of the tools in part 2 +are analogous to flare guns and satellite phones. They aren't used +too often on a hike, but when we are stuck, they can be invaluable. + +What follows are the more advanced, less used, or sometimes esoteric +capabilities of perl regexps. In Part 2, we will assume you are +comfortable with the basics and concentrate on the new features. + +=head2 More on characters, strings, and character classes + +There are a number of escape sequences and character classes that we +haven't covered yet. + +There are several escape sequences that convert characters or strings +between upper and lower case.
C<\l> and C<\u> convert the next +character to lower or upper case, respectively: + + $x = "perl"; + $string =~ /\u$x/; # matches 'Perl' in $string + $x = "M(rs?|s)\\."; # note the double backslash + $string =~ /\l$x/; # matches 'mr.', 'mrs.', and 'ms.', + +C<\L> and C<\U> convert a whole substring, delimited by C<\L> or +C<\U> and C<\E>, to lower or upper case: + + $x = "This word is in lower case:\L SHOUT\E"; + $x =~ /shout/; # matches + $x = "I STILL KEYPUNCH CARDS FOR MY 360"; + $x =~ /\Ukeypunch/; # matches punch card string + +If there is no C<\E>, case is converted until the end of the +string. The regexps C<\L\u$word> or C<\u\L$word> convert the first +character of C<$word> to uppercase and the rest of the characters to +lowercase. + +Control characters can be escaped with C<\c>, so that a control-Z +character would be matched with C<\cZ>. The escape sequence +C<\Q>...C<\E> quotes, or protects most non-alphabetic characters. For +instance, + + $x = "\QThat !^*&%~& cat!"; + $x =~ /\Q!^*&%~&\E/; # check for rough language + +It does not protect C<$> or C<@>, so that variables can still be +substituted. + +With the advent of 5.6.0, perl regexps can handle more than just the +standard ASCII character set. Perl now supports B<Unicode>, a standard +for encoding the character sets from many of the world's written +languages. Unicode does this by allowing characters to be more than +one byte wide. Perl uses the UTF-8 encoding, in which ASCII characters +are still encoded as one byte, but characters greater than C<chr(127)> +may be stored as two or more bytes. + +What does this mean for regexps? Well, regexp users don't need to know +much about perl's internal representation of strings. But they do need +to know 1) how to represent Unicode characters in a regexp and 2) when +a matching operation will treat the string to be searched as a +sequence of bytes (the old way) or as a sequence of Unicode characters +(the new way).
The answer to 1) is that Unicode characters greater +than C<chr(127)> may be represented using the C<\x{hex}> notation, +with C<hex> a hexadecimal integer: + + use utf8; # We will be doing Unicode processing + /\x{263a}/; # match a Unicode smiley face :) + +Unicode characters in the range of 128-255 use two hexadecimal digits +with braces: C<\x{ab}>. Note that this is different than C<\xab>, +which is just a hexadecimal byte with no Unicode +significance. + +Figuring out the hexadecimal sequence of a Unicode character you want +or deciphering someone else's hexadecimal Unicode regexp is about as +much fun as programming in machine code. So another way to specify +Unicode characters is to use the S<B<named character> > escape +sequence C<\N{name}>. C<name> is a name for the Unicode character, as +specified in the Unicode standard. For instance, if we wanted to +represent or match the astrological sign for the planet Mercury, we +could use + + use utf8; # We will be doing Unicode processing + use charnames ":full"; # use named chars with Unicode full names + $x = "abc\N{MERCURY}def"; + $x =~ /\N{MERCURY}/; # matches + +One can also use short names or restrict names to a certain alphabet: + + use utf8; # We will be doing Unicode processing + + use charnames ':full'; + print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n"; + + use charnames ":short"; + print "\N{greek:Sigma} is an upper-case sigma.\n"; + + use charnames qw(greek); + print "\N{sigma} is Greek sigma\n"; + +A list of full names is found in the file Names.txt in the +lib/perl5/5.6.0/unicode directory. + +The answer to requirement 2), as of 5.6.0, is that if a regexp +contains Unicode characters, the string is searched as a sequence of +Unicode characters. Otherwise, the string is searched as a sequence of +bytes. If the string is being searched as a sequence of Unicode +characters, but matching a single byte is required, we can use the C<\C> +escape sequence. 
C<\C> is a character class akin to C<.> except that +it matches I<any> byte 0-255. So + + use utf8; # We will be doing Unicode processing + use charnames ":full"; # use named chars with Unicode full names + $x = "a"; + $x =~ /\C/; # matches 'a', eats one byte + $x = ""; + $x =~ /\C/; # doesn't match, no bytes to match + $x = "\N{MERCURY}"; # two-byte Unicode character + $x =~ /\C/; # matches, but dangerous! + +The last regexp matches, but is dangerous because the string +I<character> position is no longer synchronized to the string I<byte> +position. This generates the warning 'Malformed UTF-8 +character'. C<\C> is best used for matching the binary data in strings +with binary data intermixed with Unicode characters. + +Let us now discuss the rest of the character classes. Just as with +Unicode characters, there are named Unicode character classes +represented by the C<\p{name}> escape sequence. Closely associated is +the C<\P{name}> character class, which is the negation of the +C<\p{name}> class. 
For example, to match lower and uppercase +characters, + + use utf8; # We will be doing Unicode processing + use charnames ":full"; # use named chars with Unicode full names + $x = "BOB"; + $x =~ /^\p{IsUpper}/; # matches, uppercase char class + $x =~ /^\P{IsUpper}/; # doesn't match, char class sans uppercase + $x =~ /^\p{IsLower}/; # doesn't match, lowercase char class + $x =~ /^\P{IsLower}/; # matches, char class sans lowercase + +Here is the association between some Perl named classes and the +traditional Unicode classes: + + Perl class name Unicode class name or regular expression + + IsAlpha /^[LM]/ + IsAlnum /^[LMN]/ + IsASCII $code <= 127 + IsCntrl /^C/ + IsBlank $code =~ /^(0020|0009)$/ || /^Z[^lp]/ + IsDigit Nd + IsGraph /^([LMNPS]|Co)/ + IsLower Ll + IsPrint /^([LMNPS]|Co|Zs)/ + IsPunct /^P/ + IsSpace /^Z/ || ($code =~ /^(0009|000A|000B|000C|000D)$/) + IsSpacePerl /^Z/ || ($code =~ /^(0009|000A|000C|000D)$/) + IsUpper /^L[ut]/ + IsWord /^[LMN]/ || $code eq "005F" + IsXDigit $code =~ /^00(3[0-9]|[46][1-6])$/ + +You can also use the official Unicode class names with the C<\p> and +C<\P>, like C<\p{L}> for Unicode 'letters', or C<\p{Lu}> for uppercase +letters, or C<\P{Nd}> for non-digits. If a C<name> is just one +letter, the braces can be dropped. For instance, C<\pM> is the +character class of Unicode 'marks'. + +C<\X> is an abbreviation for a character class sequence that includes +the Unicode 'combining character sequences'. A 'combining character +sequence' is a base character followed by any number of combining +characters. An example of a combining character is an accent. Using +the Unicode full names, e.g., S<C<A + COMBINING RING> > is a combining +character sequence with base character C<A> and combining character +S<C<COMBINING RING> >, which translates in Danish to A with the circle +atop it, as in the word Angstrom. C<\X> is equivalent to C<\PM\pM*>, +i.e., a non-mark followed by zero or more marks.
+ +As if all those classes weren't enough, Perl also defines POSIX style +character classes. These have the form C<[:name:]>, with C<name> the +name of the POSIX class. The POSIX classes are C<alpha>, C<alnum>, +C<ascii>, C<cntrl>, C<digit>, C<graph>, C<lower>, C<print>, C<punct>, +C<space>, C<upper>, and C<xdigit>, and two extensions, C<word> (a Perl +extension to match C<\w>), and C<blank> (a GNU extension). If C<utf8> +is being used, then these classes are defined the same as their +corresponding perl Unicode classes: C<[:upper:]> is the same as +C<\p{IsUpper}>, etc. The POSIX character classes, however, don't +require using C<utf8>. The C<[:digit:]>, C<[:word:]>, and +C<[:space:]> correspond to the familiar C<\d>, C<\w>, and C<\s> +character classes. To negate a POSIX class, put a C<^> in front of +the name, so that, e.g., C<[:^digit:]> corresponds to C<\D> and under +C<utf8>, C<\P{IsDigit}>. The Unicode and POSIX character classes can +be used just like C<\d>, with the exception that POSIX character +classes can only be used inside of a bracketed character class: + + /\s+[abc[:digit:]xyz]\s*/; # match a,b,c,x,y,z, or a digit + /^=item\s[[:digit:]]/; # match '=item', + # followed by a space and a digit + use utf8; + use charnames ":full"; + /\s+[abc\p{IsDigit}xyz]\s+/; # match a,b,c,x,y,z, or a digit + /^=item\s\p{IsDigit}/; # match '=item', + # followed by a space and a digit + +Whew! That is all the rest of the characters and character classes. + +=head2 Compiling and saving regular expressions + +In Part 1 we discussed the C<//o> modifier, which compiles a regexp +just once. This suggests that a compiled regexp is some data structure +that can be stored once and used again and again.
The regexp quote +C<qr//> does exactly that: C<qr/string/> compiles the C<string> as a +regexp and transforms the result into a form that can be assigned to a +variable: + + $reg = qr/foo+bar?/; # reg contains a compiled regexp + +Then C<$reg> can be used as a regexp: + + $x = "fooooba"; + $x =~ $reg; # matches, just like /foo+bar?/ + $x =~ /$reg/; # same thing, alternate form + +C<$reg> can also be interpolated into a larger regexp: + + $x =~ /(abc)?$reg/; # still matches + +As with the matching operator, the regexp quote can use different +delimiters, e.g., C<qr!!>, C<qr{}> and C<qr~~>. The single quote +delimiters C<qr''> prevent any interpolation from taking place. + +Pre-compiled regexps are useful for creating dynamic matches that +don't need to be recompiled each time they are encountered. Using +pre-compiled regexps, C<simple_grep> program can be expanded into a +program that matches multiple patterns: + + % cat > multi_grep + #!/usr/bin/perl + # multi_grep - match any of <number> regexps + # usage: multi_grep <number> regexp1 regexp2 ... file1 file2 ... + + $number = shift; + $regexp[$_] = shift foreach (0..$number-1); + @compiled = map qr/$_/, @regexp; + while ($line = <>) { + foreach $pattern (@compiled) { + if ($line =~ /$pattern/) { + print $line; + last; # we matched, so move onto the next line + } + } + } + ^D + + % multi_grep 2 last for multi_grep + $regexp[$_] = shift foreach (0..$number-1); + foreach $pattern (@compiled) { + last; + +Storing pre-compiled regexps in an array C<@compiled> allows us to +simply loop through the regexps without any recompilation, thus gaining +flexibility without sacrificing speed. + +=head2 Embedding comments and modifiers in a regular expression + +Starting with this section, we will be discussing Perl's set of +B<extended patterns>. These are extensions to the traditional regular +expression syntax that provide powerful new tools for pattern +matching. 
We have already seen extensions in the form of the minimal +matching constructs C<??>, C<*?>, C<+?>, C<{n,m}?>, and C<{n,}?>. The +rest of the extensions below have the form C<(?char...)>, where the +C<char> is a character that determines the type of extension. + +The first extension is an embedded comment C<(?#text)>. This embeds a +comment into the regular expression without affecting its meaning. The +comment should not have any closing parentheses in the text. An +example is + + /(?# Match an integer:)[+-]?\d+/; + +This style of commenting has been largely superseded by the raw, +freeform commenting that is allowed with the C<//x> modifier. + +The modifiers C<//i>, C<//m>, C<//s>, and C<//x> can also be embedded in +a regexp using C<(?i)>, C<(?m)>, C<(?s)>, and C<(?x)>. For instance, + + /(?i)yes/; # match 'yes' case insensitively + /yes/i; # same thing + /(?x)( # freeform version of an integer regexp + [+-]? # match an optional sign + \d+ # match a sequence of digits + ) + /x; + +Embedded modifiers can have two important advantages over the usual +modifiers. Embedded modifiers allow a custom set of modifiers to +I<each> regexp pattern. This is great for matching an array of regexps +that must have different modifiers: + + $pattern[0] = '(?i)doctor'; + $pattern[1] = 'Johnson'; + ... + while (<>) { + foreach $patt (@pattern) { + print if /$patt/; + } + } + +The second advantage is that embedded modifiers only affect the regexp +inside the group the embedded modifier is contained in. So grouping +can be used to localize the modifier's effects: + + /Answer: ((?i)yes)/; # matches 'Answer: yes', 'Answer: YES', etc. + +Embedded modifiers can also turn off any modifiers already present +by using, e.g., C<(?-i)>. Modifiers can also be combined into +a single expression, e.g., C<(?s-i)> turns on single line mode and +turns off case insensitivity.
+ +=head2 Non-capturing groupings + +We noted in Part 1 that groupings C<()> had two distinct functions: 1) +group regexp elements together as a single unit, and 2) extract, or +capture, substrings that matched the regexp in the +grouping. Non-capturing groupings, denoted by C<(?:regexp)>, allow the +regexp to be treated as a single unit, but don't extract substrings or +set matching variables C<$1>, etc. Both capturing and non-capturing +groupings are allowed to co-exist in the same regexp. Because there is +no extraction, non-capturing groupings are faster than capturing +groupings. Non-capturing groupings are also handy for choosing exactly +which parts of a regexp are to be extracted to matching variables: + + # match a number, $1-$4 are set, but we only want $1 + /([+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?)/; + + # match a number faster, only $1 is set + /([+-]?\ *(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?)/; + + # match a number, get $1 = whole number, $2 = exponent + /([+-]?\ *(?:\d+(?:\.\d*)?|\.\d+)(?:[eE]([+-]?\d+))?)/; + +Non-capturing groupings are also useful for removing nuisance +elements gathered from a split operation: + + $x = '12a34b5'; + @num = split /(a|b)/, $x; # @num = ('12','a','34','b','5') + @num = split /(?:a|b)/, $x; # @num = ('12','34','5') + +Non-capturing groupings may also have embedded modifiers: +C<(?i-m:regexp)> is a non-capturing grouping that matches C<regexp> +case insensitively and turns off multi-line mode. + +=head2 Looking ahead and looking behind + +This section concerns the lookahead and lookbehind assertions. First, +a little background. + +In Perl regular expressions, most regexp elements 'eat up' a certain +amount of string when they match. For instance, the regexp element +C<[abc]> eats up one character of the string when it matches, in the +sense that perl moves to the next character position in the string +after the match.
There are some elements, however, that don't eat up +characters (advance the character position) if they match. The examples +we have seen so far are the anchors. The anchor C<^> matches the +beginning of the line, but doesn't eat any characters. Similarly, the +word boundary anchor C<\b> matches, e.g., if the character to the left +is a word character and the character to the right is a non-word +character, but it doesn't eat up any characters itself. Anchors are +examples of 'zero-width assertions'. Zero-width, because they consume +no characters, and assertions, because they test some property of the +string. In the context of our walk in the woods analogy to regexp +matching, most regexp elements move us along a trail, but anchors have +us stop a moment and check our surroundings. If the local environment +checks out, we can proceed forward. But if the local environment +doesn't satisfy us, we must backtrack. + +Checking the environment entails either looking ahead on the trail, +looking behind, or both. C<^> looks behind, to see that there are no +characters before. C<$> looks ahead, to see that there are no +characters after. C<\b> looks both ahead and behind, to see if the +characters on either side differ in their 'word'-ness. + +The lookahead and lookbehind assertions are generalizations of the +anchor concept. Lookahead and lookbehind are zero-width assertions +that let us specify which characters we want to test for. The +lookahead assertion is denoted by C<(?=regexp)> and the lookbehind +assertion is denoted by C<< (?<=fixed-regexp) >>. 
Some examples are + + $x = "I catch the housecat 'Tom-cat' with catnip"; + $x =~ /cat(?=\s+)/; # matches 'cat' in 'housecat' + @catwords = ($x =~ /(?<=\s)cat\w+/g); # matches, + # $catwords[0] = 'catch' + # $catwords[1] = 'catnip' + $x =~ /\bcat\b/; # matches 'cat' in 'Tom-cat' + $x =~ /(?<=\s)cat(?=\s)/; # doesn't match; no isolated 'cat' in + # middle of $x + +Note that the parentheses in C<(?=regexp)> and C<< (?<=regexp) >> are +non-capturing, since these are zero-width assertions. Thus in the +second regexp, the substrings captured are those of the whole regexp +itself. Lookahead C<(?=regexp)> can match arbitrary regexps, but +lookbehind C<< (?<=fixed-regexp) >> only works for regexps of fixed +width, i.e., a fixed number of characters long. Thus +C<< (?<=(ab|bc)) >> is fine, but C<< (?<=(ab)*) >> is not. The +negated versions of the lookahead and lookbehind assertions are +denoted by C<(?!regexp)> and C<< (?<!fixed-regexp) >> respectively. +They evaluate true if the regexps do I<not> match: + + $x = "foobar"; + $x =~ /foo(?!bar)/; # doesn't match, 'bar' follows 'foo' + $x =~ /foo(?!baz)/; # matches, 'baz' doesn't follow 'foo' + $x =~ /(?<!\s)foo/; # matches, there is no \s before 'foo' + +=head2 Using independent subexpressions to prevent backtracking + +The last few extended patterns in this tutorial are experimental as of +5.6.0. Play with them, use them in some code, but don't rely on them +just yet for production code. + +S<B<Independent subexpressions> > are regular expressions, in the +context of a larger regular expression, that function independently of +the larger regular expression. That is, they consume as much or as +little of the string as they wish without regard for the ability of +the larger regexp to match. Independent subexpressions are represented +by C<< (?>regexp) >>. 
We can illustrate their behavior by first +considering an ordinary regexp: + + $x = "ab"; + $x =~ /a*ab/; # matches + +This obviously matches, but in the process of matching, the +subexpression C<a*> first grabbed the C<a>. Doing so, however, +wouldn't allow the whole regexp to match, so after backtracking, C<a*> +eventually gave back the C<a> and matched the empty string. Here, what +C<a*> matched was I<dependent> on what the rest of the regexp matched. + +Contrast that with an independent subexpression: + + $x =~ /(?>a*)ab/; # doesn't match! + +The independent subexpression C<< (?>a*) >> doesn't care about the rest +of the regexp, so it sees an C<a> and grabs it. Then the rest of the +regexp C<ab> cannot match. Because C<< (?>a*) >> is independent, there +is no backtracking and the independent subexpression does not give +up its C<a>. Thus the match of the regexp as a whole fails. A similar +behavior occurs with completely independent regexps: + + $x = "ab"; + $x =~ /a*/g; # matches, eats an 'a' + $x =~ /\Gab/g; # doesn't match, no 'a' available + +Here C<//g> and C<\G> create a 'tag team' handoff of the string from +one regexp to the other. Regexps with an independent subexpression are +much like this, with a handoff of the string to the independent +subexpression, and a handoff of the string back to the enclosing +regexp. + +The ability of an independent subexpression to prevent backtracking +can be quite useful. Suppose we want to match a non-empty string +enclosed in parentheses up to two levels deep. Then the following +regexp matches: + + $x = "abc(de(fg)h"; # unbalanced parentheses + $x =~ /\( ( [^()]+ | \([^()]*\) )+ \)/x; + +The regexp matches an open parenthesis, one or more copies of an +alternation, and a close parenthesis. The alternation is two-way, with +the first alternative C<[^()]+> matching a substring with no +parentheses and the second alternative C<\([^()]*\)> matching a +substring delimited by parentheses. 
The problem with this regexp is +that it is pathological: it has nested indeterminate quantifiers + of the form C<(a+|b)+>. We discussed in Part 1 how nested quantifiers +like this could take an exponentially long time to execute if there +was no match possible. To prevent the exponential blowup, we need to +prevent useless backtracking at some point. This can be done by +enclosing the inner quantifier as an independent subexpression: + + $x =~ /\( ( (?>[^()]+) | \([^()]*\) )+ \)/x; + +Here, C<< (?>[^()]+) >> breaks the degeneracy of string partitioning +by gobbling up as much of the string as possible and keeping it. Then +match failures fail much more quickly. + +=head2 Conditional expressions + +A S<B<conditional expression> > is a form of if-then-else statement +that allows one to choose which patterns are to be matched, based on +some condition. There are two types of conditional expression: +C<(?(condition)yes-regexp)> and +C<(?(condition)yes-regexp|no-regexp)>. C<(?(condition)yes-regexp)> is +like an S<C<'if () {}'> > statement in Perl. If the C<condition> is true, +the C<yes-regexp> will be matched. If the C<condition> is false, the +C<yes-regexp> will be skipped and perl will move onto the next regexp +element. The second form is like an S<C<'if () {} else {}'> > statement +in Perl. If the C<condition> is true, the C<yes-regexp> will be +matched, otherwise the C<no-regexp> will be matched. + +The C<condition> can have two forms. The first form is simply an +integer in parentheses C<(integer)>. It is true if the corresponding +backreference C<\integer> matched earlier in the regexp. The second +form is a bare zero width assertion C<(?...)>, either a +lookahead, a lookbehind, or a code assertion (discussed in the next +section). + +The integer form of the C<condition> allows us to choose, with more +flexibility, what to match based on what matched earlier in the +regexp. 
This searches for words of the form C<"$x$x"> or +C<"$x$y$y$x">: + + % simple_grep '^(\w+)(\w+)?(?(2)\2\1|\1)$' /usr/dict/words + beriberi + coco + couscous + deed + ... + toot + toto + tutu + +The lookbehind C<condition> allows, along with backreferences, +an earlier part of the match to influence a later part of the +match. For instance, + + /[ATGC]+(?(?<=AA)G|C)$/; + +matches a DNA sequence such that it either ends in C<AAG>, or some +other base pair combination and C<C>. Note that the form is +C<< (?(?<=AA)G|C) >> and not C<< (?((?<=AA))G|C) >>; for the +lookahead, lookbehind or code assertions, the parentheses around the +conditional are not needed. + +=head2 A bit of magic: executing Perl code in a regular expression + +Normally, regexps are a part of Perl expressions. +S<B<Code evaluation> > expressions turn that around by allowing +arbitrary Perl code to be a part of a regexp. A code evaluation +expression is denoted C<(?{code})>, with C<code> a string of Perl +statements. + +Code expressions are zero-width assertions, and the value they return +depends on their environment. There are two possibilities: either the +code expression is used as a conditional in a conditional expression +C<(?(condition)...)>, or it is not. If the code expression is a +conditional, the code is evaluated and the result (i.e., the result of +the last statement) is used to determine truth or falsehood. If the +code expression is not used as a conditional, the assertion always +evaluates true and the result is put into the special variable +C<$^R>. The variable C<$^R> can then be used in code expressions later +in the regexp. Here are some silly examples: + + $x = "abcdef"; + $x =~ /abc(?{print "Hi Mom!";})def/; # matches, + # prints 'Hi Mom!' + $x =~ /aaa(?{print "Hi Mom!";})def/; # doesn't match, + # no 'Hi Mom!' + +Pay careful attention to the next example: + + $x =~ /abc(?{print "Hi Mom!";})ddd/; # doesn't match, + # no 'Hi Mom!' + # but why not? 
+ +At first glance, you'd think that it shouldn't print, because obviously +the C<ddd> isn't going to match the target string. But look at this +example: + + $x =~ /abc(?{print "Hi Mom!";})[d]dd/; # doesn't match, + # but _does_ print + +Hmm. What happened here? If you've been following along, you know that +the above pattern should be effectively the same as the last one -- +enclosing the d in a character class isn't going to change what it +matches. So why does the first not print while the second one does? + +The answer lies in the optimizations the REx engine makes. In the first +case, all the engine sees are plain old characters (aside from the +C<?{}> construct). It's smart enough to realize that the string 'ddd' +doesn't occur in our target string before actually running the pattern +through. But in the second case, we've tricked it into thinking that our +pattern is more complicated than it is. It takes a look, sees our +character class, and decides that it will have to actually run the +pattern to determine whether or not it matches, and in the process of +running it hits the print statement before it discovers that we don't +have a match. + +To take a closer look at how the engine does optimizations, see the +section L<"Pragmas and debugging"> below. + +More fun with C<?{}>: + + $x =~ /(?{print "Hi Mom!";})/; # matches, + # prints 'Hi Mom!' + $x =~ /(?{$c = 1;})(?{print "$c";})/; # matches, + # prints '1' + $x =~ /(?{$c = 1;})(?{print "$^R";})/; # matches, + # prints '1' + +The bit of magic mentioned in the section title occurs when the regexp +backtracks in the process of searching for a match. If the regexp +backtracks over a code expression and if the variables used within are +localized using C<local>, the changes in the variables produced by the +code expression are undone! 
Thus, if we wanted to count how many times +a character got matched inside a group, we could use, e.g., + + $x = "aaaa"; + $count = 0; # initialize 'a' count + $c = "bob"; # test if $c gets clobbered + $x =~ /(?{local $c = 0;}) # initialize count + ( a # match 'a' + (?{local $c = $c + 1;}) # increment count + )* # do this any number of times, + aa # but match 'aa' at the end + (?{$count = $c;}) # copy local $c var into $count + /x; + print "'a' count is $count, \$c variable is '$c'\n"; + +This prints + + 'a' count is 2, $c variable is 'bob' + +If we replace the S<C< (?{local $c = $c + 1;})> > with +S<C< (?{$c = $c + 1;})> >, the variable changes are I<not> undone +during backtracking, and we get + + 'a' count is 4, $c variable is 'bob' + +Note that only localized variable changes are undone. Other side +effects of code expression execution are permanent. Thus + + $x = "aaaa"; + $x =~ /(a(?{print "Yow\n";}))*aa/; + +produces + + Yow + Yow + Yow + Yow + +The result C<$^R> is automatically localized, so that it will behave +properly in the presence of backtracking. + +This example uses a code expression in a conditional to match the +article 'the' in either English or German: + + $lang = 'DE'; # use German + ... + $text = "das"; + print "matched\n" + if $text =~ /(?(?{ + $lang eq 'EN'; # is the language English? + }) + the | # if so, then match 'the' + (die|das|der) # else, match 'die|das|der' + ) + /xi; + +Note that the syntax here is C<(?(?{...})yes-regexp|no-regexp)>, not +C<(?((?{...}))yes-regexp|no-regexp)>. In other words, in the case of a +code expression, we don't need the extra parentheses around the +conditional. + +If you try to use code expressions with interpolating variables, perl +may surprise you: + + $bar = 5; + $pat = '(?{ 1 })'; + /foo(?{ $bar })bar/; # compiles ok, $bar not interpolated + /foo(?{ 1 })$bar/; # compile error! + /foo${pat}bar/; # compile error! 
+ + $pat = qr/(?{ $foo = 1 })/; # precompile code regexp + /foo${pat}bar/; # compiles ok + +If a regexp has (1) code expressions and interpolating variables, or +(2) a variable that interpolates a code expression, perl treats the +regexp as an error. If the code expression is precompiled into a +variable, however, interpolating is ok. The question is, why is this +an error? + +The reason is that variable interpolation and code expressions +together pose a security risk. The combination is dangerous because +many programmers who write search engines often take user input and +plug it directly into a regexp: + + $regexp = <>; # read user-supplied regexp + chomp $regexp; # get rid of possible newline + $text =~ /$regexp/; # search $text for the $regexp + +If the C<$regexp> variable contains a code expression, the user could +then execute arbitrary Perl code. For instance, some joker could +search for S<C<system('rm -rf *');> > to erase your files. In this +sense, the combination of interpolation and code expressions B<taints> +your regexp. So by default, using both interpolation and code +expressions in the same regexp is not allowed. If you're not +concerned about malicious users, it is possible to bypass this +security check by invoking S<C<use re 'eval'> >: + + use re 'eval'; # throw caution out the door + $bar = 5; + $pat = '(?{ 1 })'; + /foo(?{ 1 })$bar/; # compiles ok + /foo${pat}bar/; # compiles ok + +Another form of code expression is the S<B<pattern code expression> >. +The pattern code expression is like a regular code expression, except +that the result of the code evaluation is treated as a regular +expression and matched immediately. A simple example is + + $length = 5; + $char = 'a'; + $x = 'aaaaabb'; + $x =~ /(??{$char x $length})/x; # matches, there are 5 of 'a' + + +This final example contains both ordinary and pattern code +expressions. It detects if a binary string C<1101010010001...> has a +Fibonacci spacing 0,1,1,2,3,5,... 
of the C<1>'s: + + $s0 = 0; $s1 = 1; # initial conditions + $x = "1101010010001000001"; + print "It is a Fibonacci sequence\n" + if $x =~ /^1 # match an initial '1' + ( + (??{'0' x $s0}) # match $s0 of '0' + 1 # and then a '1' + (?{ + $largest = $s0; # largest seq so far + $s2 = $s1 + $s0; # compute next term + $s0 = $s1; # in Fibonacci sequence + $s1 = $s2; + }) + )+ # repeat as needed + $ # that is all there is + /x; + print "Largest sequence matched was $largest\n"; + +This prints + + It is a Fibonacci sequence + Largest sequence matched was 5 + +Ha! Try that with your garden variety regexp package... + +Note that the variables C<$s0> and C<$s1> are not substituted when the +regexp is compiled, as happens for ordinary variables outside a code +expression. Rather, the code expressions are evaluated when perl +encounters them during the search for a match. + +The regexp without the C<//x> modifier is + + /^1((??{'0'x$s0})1(?{$largest=$s0;$s2=$s1+$s0;$s0=$s1;$s1=$s2;}))+$/; + +and is a great start on an Obfuscated Perl entry :-) When working with +code and conditional expressions, the extended form of regexps is +almost necessary in creating and debugging regexps. + +=head2 Pragmas and debugging + +Speaking of debugging, there are several pragmas available to control +and debug regexps in Perl. We have already encountered one pragma in +the previous section, S<C<use re 'eval';> >, that allows variable +interpolation and code expressions to coexist in a regexp. The other +pragmas are + + use re 'taint'; + $tainted = <>; + @parts = ($tainted =~ /(\w+)\s+(\w+)/); # @parts is now tainted + +The C<taint> pragma causes any substrings from a match with a tainted +variable to be tainted as well. This is not normally the case, as +regexps are often used to extract the safe bits from a tainted +variable. Use C<taint> when you are not extracting safe bits, but are +performing some other processing. 
Both C<taint> and C<eval> pragmas +are lexically scoped, which means they are in effect only until +the end of the block enclosing the pragmas. + + use re 'debug'; + /^(.*)$/s; # output debugging info + + use re 'debugcolor'; + /^(.*)$/s; # output debugging info in living color + +The global C<debug> and C<debugcolor> pragmas allow one to get +detailed debugging info about regexp compilation and +execution. C<debugcolor> is the same as debug, except the debugging +information is displayed in color on terminals that can display +termcap color sequences. Here is example output: + + % perl -e 'use re "debug"; "abc" =~ /a*b+c/;' + Compiling REx `a*b+c' + size 9 first at 1 + 1: STAR(4) + 2: EXACT <a>(0) + 4: PLUS(7) + 5: EXACT <b>(0) + 7: EXACT <c>(9) + 9: END(0) + floating `bc' at 0..2147483647 (checking floating) minlen 2 + Guessing start of match, REx `a*b+c' against `abc'... + Found floating substr `bc' at offset 1... + Guessed: match at offset 0 + Matching REx `a*b+c' against `abc' + Setting an EVAL scope, savestack=3 + 0 <> <abc> | 1: STAR + EXACT <a> can match 1 times out of 32767... + Setting an EVAL scope, savestack=3 + 1 <a> <bc> | 4: PLUS + EXACT <b> can match 1 times out of 32767... + Setting an EVAL scope, savestack=3 + 2 <ab> <c> | 7: EXACT <c> + 3 <abc> <> | 9: END + Match successful! + Freeing REx: `a*b+c' + +If you have gotten this far into the tutorial, you can probably guess +what the different parts of the debugging output tell you. The first +part + + Compiling REx `a*b+c' + size 9 first at 1 + 1: STAR(4) + 2: EXACT <a>(0) + 4: PLUS(7) + 5: EXACT <b>(0) + 7: EXACT <c>(9) + 9: END(0) + +describes the compilation stage. C<STAR(4)> means that there is a +starred object, in this case C<'a'>, and if it matches, goto line 4, +i.e., C<PLUS(7)>. The middle lines describe some heuristics and +optimizations performed before a match: + + floating `bc' at 0..2147483647 (checking floating) minlen 2 + Guessing start of match, REx `a*b+c' against `abc'... 
+ Found floating substr `bc' at offset 1... + Guessed: match at offset 0 + +Then the match is executed and the remaining lines describe the +process: + + Matching REx `a*b+c' against `abc' + Setting an EVAL scope, savestack=3 + 0 <> <abc> | 1: STAR + EXACT <a> can match 1 times out of 32767... + Setting an EVAL scope, savestack=3 + 1 <a> <bc> | 4: PLUS + EXACT <b> can match 1 times out of 32767... + Setting an EVAL scope, savestack=3 + 2 <ab> <c> | 7: EXACT <c> + 3 <abc> <> | 9: END + Match successful! + Freeing REx: `a*b+c' + +Each step is of the form S<C<< n <x> <y> >> >, with C<< <x> >> the +part of the string matched and C<< <y> >> the part not yet +matched. The S<C<< | 1: STAR >> > says that perl is at line number 1 +in the compilation list above. See +L<perldebguts/"Debugging regular expressions"> for much more detail. + +An alternative method of debugging regexps is to embed C<print> +statements within the regexp. This provides a blow-by-blow account of +the backtracking in an alternation: + + "that this" =~ m@(?{print "Start at position ", pos, "\n";}) + t(?{print "t1\n";}) + h(?{print "h1\n";}) + i(?{print "i1\n";}) + s(?{print "s1\n";}) + | + t(?{print "t2\n";}) + h(?{print "h2\n";}) + a(?{print "a2\n";}) + t(?{print "t2\n";}) + (?{print "Done at position ", pos, "\n";}) + @x; + +prints + + Start at position 0 + t1 + h1 + t2 + h2 + a2 + t2 + Done at position 4 + +=head1 BUGS + +Code expressions, conditional expressions, and independent expressions +are B<experimental>. Don't use them in production code. Yet. + +=head1 SEE ALSO + +This is just a tutorial. For the full story on perl regular +expressions, see the L<perlre> regular expressions reference page. + +For more information on the matching C<m//> and substitution C<s///> +operators, see L<perlop/"Regexp Quote-Like Operators">. For +information on the C<split> operation, see L<perlfunc/split>. 
+ +For an excellent all-around resource on the care and feeding of +regular expressions, see the book I<Mastering Regular Expressions> by +Jeffrey Friedl (published by O'Reilly, ISBN 1-56592-257-3). + +=head1 AUTHOR AND COPYRIGHT + +Copyright (c) 2000 Mark Kvale +All rights reserved. + +This document may be distributed under the same terms as Perl itself. + +=head2 Acknowledgments + +The inspiration for the stop codon DNA example came from the ZIP +code example in chapter 7 of I<Mastering Regular Expressions>. + +The author would like to thank Jeff Pinyan, Andrew Johnson, Peter +Haworth, Ronald J Kimball, and Joe Smith for all their helpful +comments. + +=cut + diff --git a/gnu/usr.bin/perl/pod/perltodo.pod b/gnu/usr.bin/perl/pod/perltodo.pod index f22d4737f81..f38ba88bf36 100644 --- a/gnu/usr.bin/perl/pod/perltodo.pod +++ b/gnu/usr.bin/perl/pod/perltodo.pod @@ -85,7 +85,7 @@ We need regression/sanity tests for suidperl This value may or may not be accurate, but it certainly is eye-catching. For some things perl5 is faster than perl4, but often -the reliability and extensability have come at a cost of speed. The +the reliability and extensibility have come at a cost of speed. The benchmark suite that Gisle released earlier has been hailed as both a fantastic solution and as a source of entirely meaningless figures. Do we need to test "real applications"? Can you do so? Anyone have @@ -111,10 +111,6 @@ problem for free. =head1 Perl Language -=head2 our ($var) - -Declare global variables (lexically or otherwise). - =head2 64-bit Perl Verify complete 64 bit support so that the value of sysseek, or C<-s>, or @@ -161,7 +157,7 @@ Sarathy, I believe, did the work. Here's what he has to say: Yeah, I hope to implement it someday too. 
The points that were raised in TPC2 were all to do with calling DESTROY() methods, but -I think we can accomodate that by extending bless() to stash +I think we can accommodate that by extending bless() to stash extra information for objects so we track their lifetime accurately for those that want their DESTROY() to be predictable (this will be a speed hit, naturally, and will therefore be optional, naturally. :) @@ -532,14 +528,6 @@ Kurt Starsinic is working on h2ph. mjd has fixed bugs in a2p in the past. a2p apparently doesn't work on nawk and gawk extensions. Graham Barr has an Include module that does h2ph work at runtime. -=head2 POD Converters - -Brad's PodParser code needs to become part of the core, and the Pod::* -and pod2* programs rewritten to use this standard parser. Currently -the converters take different options, some behave in different -fashions, and some are more picky than others in terms of the POD -files they accept. - =head2 pod2html A short-term fix: pod2html generates absolute HTML links. Make it @@ -863,7 +851,7 @@ See Time::HiRes. =head2 autocroak? -This is the Fatal.pm module, so any builtin that that does +This is the Fatal.pm module, so any builtin that does not return success automatically die()s. If you're feeling brave, tie this in with the unified exceptions scheme. diff --git a/gnu/usr.bin/perl/pod/perlunicode.pod b/gnu/usr.bin/perl/pod/perlunicode.pod index 5333ac495c0..5b0fe2faaf2 100644 --- a/gnu/usr.bin/perl/pod/perlunicode.pod +++ b/gnu/usr.bin/perl/pod/perlunicode.pod @@ -1,16 +1,18 @@ =head1 NAME -perlunicode - Unicode support in Perl +perlunicode - Unicode support in Perl (EXPERIMENTAL, subject to change) =head1 DESCRIPTION =head2 Important Caveat -WARNING: The implementation of Unicode support in Perl is incomplete. + WARNING: As of the 5.6.1 release, the implementation of Unicode + support in Perl is incomplete, and continues to be highly experimental. -The following areas need further work. 
+The following areas need further work. They are being rapidly addressed +in the 5.7.x development branch. -=over +=over 4 =item Input and Output Disciplines @@ -114,13 +116,7 @@ will typically occur directly within the literal strings as UTF-8 characters, but you can also specify a particular character with an extension of the C<\x> notation. UTF-8 characters are specified by putting the hexadecimal code within curlies after the C<\x>. For instance, -a Unicode smiley face is C<\x{263A}>. A character in the Latin-1 range -(128..255) should be written C<\x{ab}> rather than C<\xab>, since the -former will turn into a two-byte UTF-8 code, while the latter will -continue to be interpreted as generating a 8-bit byte rather than a -character. In fact, if the C<use warnings> pragma of the C<-w> switch -is turned on, it will produce a warning -that you might be generating invalid UTF-8. +a Unicode smiley face is C<\x{263A}>. =item * @@ -163,20 +159,10 @@ C<(?:\PM\pM*)>. =item * -The C<tr///> operator translates characters instead of bytes. It can also -be forced to translate between 8-bit codes and UTF-8. For instance, if you -know your input in Latin-1, you can say: - - while (<>) { - tr/\0-\xff//CU; # latin1 char to utf8 - ... - } - -Similarly you could translate your output with - - tr/\0-\x{ff}//UC; # utf8 to latin1 char - -No, C<s///> doesn't take /U or /C (yet?). +The C<tr///> operator translates characters instead of bytes. Note +that the C<tr///CU> functionality has been removed, as the interface +was a mistake. For similar functionality see pack('U0', ...) and +pack('C0', ...). =item * @@ -214,6 +200,18 @@ byte-oriented C<chr()> and C<ord()> under utf8. =item * +The bit string operators C<& | ^ ~> can operate on character data. +However, for backward compatibility reasons (bit string operations +when the characters all are less than 256 in ordinal value) one cannot +mix C<~> (the bit complement) and characters both less than 256 and +equal or greater than 256. 
Most importantly, the DeMorgan's laws +(C<~($x|$y) eq ~$x&~$y>, C<~($x&$y) eq ~$x|~$y>) won't hold. +Another way to look at this is that the complement cannot return +B<both> the 8-bit (byte) wide bit complement, and the full character +wide bit complement. + +=item * + And finally, C<scalar reverse()> reverses by character rather than by byte. =back diff --git a/gnu/usr.bin/perl/pod/perlutil.pod b/gnu/usr.bin/perl/pod/perlutil.pod new file mode 100644 index 00000000000..be7a345f796 --- /dev/null +++ b/gnu/usr.bin/perl/pod/perlutil.pod @@ -0,0 +1,185 @@ +=head1 NAME + +perlutil - utilities packaged with the Perl distribution + +=head1 DESCRIPTION + +Along with the Perl interpreter itself, the Perl distribution installs a +range of utilities on your system. There are also several utilities +which are used by the Perl distribution itself as part of the install +process. This document exists to list all of these utilities, explain +what they are for and provide pointers to each module's documentation, +if appropriate. + +=head2 DOCUMENTATION + +=over 3 + +=item L<perldoc|perldoc> + +The main interface to Perl's documentation is C<perldoc>, although +if you're reading this, it's more than likely that you've already found +it. F<perldoc> will extract and format the documentation from any file +in the current directory, any Perl module installed on the system, or +any of the standard documentation pages, such as this one. Use +C<perldoc E<lt>nameE<gt>> to get information on any of the utilities +described in this document. + +=item L<pod2man|pod2man> and L<pod2text|pod2text> + +If it's run from a terminal, F<perldoc> will usually call F<pod2man> to +translate POD (Plain Old Documentation - see L<perlpod> for an +explanation) into a man page, and then run F<man> to display it; if +F<man> isn't available, F<pod2text> will be used instead and the output +piped through your favourite pager. 
+ +=item L<pod2html|pod2html> and L<pod2latex|pod2latex> + +As well as these two, there are two other converters: F<pod2html> will +produce HTML pages from POD, and F<pod2latex>, which produces LaTeX +files. + +=item L<pod2usage|pod2usage> + +If you just want to know how to use the utilities described here, +F<pod2usage> will just extract the "USAGE" section; some of +the utilities will automatically call F<pod2usage> on themselves when +you call them with C<-help>. + +=item L<podselect|podselect> + +F<pod2usage> is a special case of F<podselect>, a utility to extract +named sections from documents written in POD. For instance, while +utilities have "USAGE" sections, Perl modules usually have "SYNOPSIS" +sections: C<podselect -s "SYNOPSIS" ...> will extract this section for +a given file. + +=item L<podchecker|podchecker> + +If you're writing your own documentation in POD, the F<podchecker> +utility will look for errors in your markup. + +=item L<splain|splain> + +F<splain> is an interface to L<perldiag> - paste in your error message +to it, and it'll explain it for you. + +=item L<roffitall|roffitall> + +The C<roffitall> utility is not installed on your system but lives in +the F<pod/> directory of your Perl source kit; it converts all the +documentation from the distribution to F<*roff> format, and produces a +typeset PostScript or text file of the whole lot. + +=back + +=head2 CONVERTORS + +To help you convert legacy programs to Perl, we've included three +conversion filters: + +=over 3 + +=item L<a2p|a2p> + +F<a2p> converts F<awk> scripts to Perl programs; for example, C<a2p -F:> +on the simple F<awk> script C<{print $2}> will produce a Perl program +based around this code: + + while (<>) { + ($Fld1,$Fld2) = split(/[:\n]/, $_, 9999); + print $Fld2; + } + +=item L<s2p|s2p> + +Similarly, F<s2p> converts F<sed> scripts to Perl programs. 
F<s2p> run +on C<s/foo/bar> will produce a Perl program based around this: + + while (<>) { + chomp; + s/foo/bar/g; + print if $printit; + } + +=item L<find2perl|find2perl> + +Finally, F<find2perl> translates C<find> commands to Perl equivalents which +use the L<File::Find|File::Find> module. As an example, +C<find2perl . -user root -perm 4000 -print> produces the following callback +subroutine for C<File::Find>: + + sub wanted { + my ($dev,$ino,$mode,$nlink,$uid,$gid); + (($dev,$ino,$mode,$nlink,$uid,$gid) = lstat($_)) && + ($uid == $uid{'root'}) && + (($mode & 0777) == 04000); + print("$name\n"); + } + +=back + +As well as these filters for converting other languages, the +L<pl2pm|pl2pm> utility will help you convert old-style Perl 4 libraries to +new-style Perl5 modules. + +=head2 Development + +There are a set of utilities which help you in developing Perl programs, +and in particular, extending Perl with C. + +=over 3 + +=item L<perlbug|perlbug> + +F<perlbug> is the recommended way to report bugs in the perl interpreter +itself or any of the standard library modules back to the developers; +please read through the documentation for F<perlbug> thoroughly before +using it to submit a bug report. + +=item L<h2ph|h2ph> + +Back before Perl had the XS system for connecting with C libraries, +programmers used to get library constants by reading through the C +header files. You may still see C<require 'syscall.ph'> or similar +around - the F<.ph> file should be created by running F<h2ph> on the +corresponding F<.h> file. See the F<h2ph> documentation for more on how +to convert a whole bunch of header files at once. + +=item L<c2ph|c2ph> and L<pstruct|pstruct> + +F<c2ph> and F<pstruct>, which are actually the same program but behave +differently depending on how they are called, provide another way of +getting at C with Perl - they'll convert C structures and union declarations +to Perl code. This is deprecated in favour of F<h2xs> these days. 
+ +=item L<h2xs|h2xs> + +F<h2xs> converts C header files into XS modules, and will try and write +as much glue between C libraries and Perl modules as it can. It's also +very useful for creating skeletons of pure Perl modules. + +=item L<dprofpp|dprofpp> + +Perl comes with a profiler, the F<Devel::DProf> module. The +F<dprofpp> utility analyzes the output of this profiler and tells you +which subroutines are taking up the most run time. See L<Devel::DProf> +for more information. + +=item L<perlcc|perlcc> + +F<perlcc> is the interface to the experimental Perl compiler suite. + +=back + +=head2 SEE ALSO + +L<perldoc|perldoc>, L<pod2man|pod2man>, L<perlpod>, +L<pod2html|pod2html>, L<pod2usage|pod2usage>, L<podselect|podselect>, +L<podchecker|podchecker>, L<splain|splain>, L<perldiag>, +L<roffitall|roffitall>, L<a2p|a2p>, L<s2p|s2p>, L<find2perl|find2perl>, +L<File::Find|File::Find>, L<pl2pm|pl2pm>, L<perlbug|perlbug>, +L<h2ph|h2ph>, L<c2ph|c2ph>, L<h2xs|h2xs>, L<dprofpp|dprofpp>, +L<Devel::DProf>, L<perlcc|perlcc> + +=cut diff --git a/gnu/usr.bin/perl/pod/podchecker.PL b/gnu/usr.bin/perl/pod/podchecker.PL index a7f96434ca6..20d5e94c2e0 100644 --- a/gnu/usr.bin/perl/pod/podchecker.PL +++ b/gnu/usr.bin/perl/pod/podchecker.PL @@ -39,7 +39,7 @@ print OUT <<'!NO!SUBS!'; ############################################################################# # podchecker -- command to invoke the podchecker function in Pod::Checker # -# Copyright (c) 1998-1999 by Bradford Appleton. All rights reserved. +# Copyright (c) 1998-2000 by Bradford Appleton. All rights reserved. # This file is part of "PodParser". PodParser is free software; # you can redistribute it and/or modify it under the same terms # as Perl itself. @@ -70,7 +70,9 @@ Print the manual page and exit. =item B<-warnings> B<-nowarnings> -Turn on/off printing of warnings. +Turn on/off printing of warnings. Repeating B<-warnings> increases the +warning level, i.e. more warnings are printed. 
Currently increasing to +level two causes flagging of unescaped "E<lt>,E<gt>" characters. =item I<file> @@ -85,6 +87,8 @@ syntax errors in the POD documentation and will print any errors it finds to STDERR. At the end, it will print a status message indicating the number of errors found. +Directories are ignored; an appropriate warning message is printed. + B<podchecker> invokes the B<podchecker()> function exported by B<Pod::Checker>. Please see L<Pod::Checker/podchecker()> for more details. @@ -124,24 +128,34 @@ use Pod::Usage; use Getopt::Long; ## Define options -my %options = ( - "help" => 0, - "man" => 0, - "warnings" => 1, -); +my %options; ## Parse options -GetOptions(\%options, "help", "man", "warnings!") || pod2usage(2); +GetOptions(\%options, qw(help man warnings+ nowarnings)) || pod2usage(2); pod2usage(1) if ($options{help}); pod2usage(-verbose => 2) if ($options{man}); +if($options{nowarnings}) { + $options{warnings} = 0; +} +elsif(!defined $options{warnings}) { + $options{warnings} = 1; # default is warnings on +} + ## Dont default to STDIN if connected to a terminal pod2usage(2) if ((@ARGV == 0) && (-t STDIN)); ## Invoke podchecker() my $status = 0; -@ARGV = ("<&STDIN") unless(@ARGV); +@ARGV = qw(-) unless(@ARGV); for (@ARGV) { + if($_ eq '-') { + $_ = "<&STDIN"; + } + elsif(-d) { + warn "podchecker: Warning: Ignoring directory '$_'\n"; + next; + } my $s = podchecker($_, undef, '-warnings' => $options{warnings}); if($s > 0) { # errors occurred diff --git a/gnu/usr.bin/perl/pod/podselect.PL b/gnu/usr.bin/perl/pod/podselect.PL index f2ba80a73b5..b6b8c9b9e43 100644 --- a/gnu/usr.bin/perl/pod/podselect.PL +++ b/gnu/usr.bin/perl/pod/podselect.PL @@ -39,7 +39,7 @@ print OUT <<'!NO!SUBS!'; ############################################################################# # podselect -- command to invoke the podselect function in Pod::Select # -# Copyright (c) 1996-1999 by Bradford Appleton. 
All rights reserved. # This file is part of "PodParser". PodParser is free software; # you can redistribute it and/or modify it under the same terms # as Perl itself. |