diff options
Diffstat (limited to 'gnu/usr.bin/perl/regen/HeaderParser.pm')
-rw-r--r-- | gnu/usr.bin/perl/regen/HeaderParser.pm | 1839 |
1 files changed, 1839 insertions, 0 deletions
diff --git a/gnu/usr.bin/perl/regen/HeaderParser.pm b/gnu/usr.bin/perl/regen/HeaderParser.pm new file mode 100644 index 00000000000..cc38f9ea9dc --- /dev/null +++ b/gnu/usr.bin/perl/regen/HeaderParser.pm @@ -0,0 +1,1839 @@ +package HeaderParser; +use strict; +use warnings; + +# these are required below in BEGIN statements, we cant have a +# hard dependency on them as they might not be available when +# we run as part of autodoc.pl +# +# use Data::Dumper; +# use Storable qw(dclone); +# +use Carp qw(confess); +use Text::Tabs qw(expand unexpand); +use Text::Wrap qw(wrap); + +# The style of this file is determined by: +# +# perltidy -w -ple -bbb -bbc -bbs -nolq -l=80 -noll -nola -nwls='=' \ +# -isbc -nolc -otr -kis -ci=4 -se -sot -sct -nsbl -pt=2 -fs \ +# -fsb='#start-no-tidy' -fse='#end-no-tidy' -cpb -bfvt=2 + +my ( + %unop, # unary operators and their precedence + %binop, # binary operators and their precedence + %is_right_assoc, # operators which are right associative + %precedence, # precedence of all operators. + %associative, # associative operators + %commutative, # commutative operators + %cmpop, # comparison operators + $unop_pat, # pattern to match unary operators + $binop_pat, # pattern to match binary operators + %op_names, # map of op to description, used in error messages + $tokenize_pat # a pattern which can tokenize an expression +); + +BEGIN { + # this is initialization for the operator precedence expression parser + # we use for handling preprocessor conditions. + %op_names= ( + '==' => 'equality', + '!=' => 'inequality', + '<<' => 'bit-shift-left', + '>>' => 'bit-shift-right', + '+' => 'addition', + '-' => 'subtraction', + '*' => 'multiplication', + '/' => 'division', + '%' => 'modulo', + '||' => 'logical-or', # Lowest precedence + '&&' => 'logical-and', + '|' => 'binary-or', + '^' => 'binary-xor', + '&' => 'binary-and', + '<' => 'less-than', # split on spaces, all with equal precedence + '>' => 'greater-than', + '<=' => 'less-than-or-equal', + '>=' => 'greater-than-or-equal', + ); + my @cmpop= ( + '== !=', # listed in lowest to highest precedence + '< > <= >=', # split on spaces, all with equal precedence + ); + my @binop= ( + '||', # Lowest precedence + '&&', + '|', + '^', + '&', + @cmpop, # include the numerical comparison operators. + '<< >>', + '+ -', + '* / %', # highest prcedence operators. + ); + + my @unop= qw( ! ~ + - ); + %unop= map { $_ => 1 } @unop; + %cmpop= map { $_ => 1 } map { split /\s+/, $_ } @cmpop; + %binop= map { $_ => 1 } map { split /\s+/, $_ } @binop; + + my $make_pat= sub { + my $pat= join "|", sort { length($b) <=> length($a) || $a cmp $b } + map quotemeta($_), @_; + return qr/$pat/; + }; + $unop_pat= $make_pat->(@unop); + foreach my $ix (0 .. $#binop) { + my $sym= $binop[$ix]; + $precedence{$_}= (1 + $ix) * 10 for split /\s+/, $sym; + } + $is_right_assoc{"?"}= 1; + $is_right_assoc{":"}= 1; + $precedence{"?"}= 1; + $precedence{":"}= 0; + + $associative{$_}++ + for qw( || && + *); # we leave '==' out so we don't reorder terms + $commutative{$_}++ for qw( || && + *); + + $binop_pat= $make_pat->(keys %precedence); + $tokenize_pat= qr/ + ^(?: + (?<comment> \/\*.*?\*\/ ) + | (?<ws> \s+ ) + | (?<term> + (?<literal> + (?<define> defined\(\w+\) ) + | (?<func> \w+\s*\(\s*\w+(?:\s*,\s*\w+)*\s*\) ) + | (?<const> (?:0x[a-fA-F0-9]+|\d+[LU]*|'.') ) + | (?<sym> \w+ ) + ) + | (?<op> $binop_pat | $unop_pat ) + | (?<paren> [\(\)] ) + ) + ) + /xs; +} + +# dump the arguments with dump. wraps loading Dumper +# as we are executed by miniperl where Dumper isnt available +sub dd { + my $self= shift; + local $self->{orig_content}; + my $ret= "(dump not available)"; + eval { + require Data::Dumper; + $ret= Data::Dumper->new(\@_)->Indent(1)->Sortkeys(1)->Useqq(1)->Dump(); + }; + return $ret; +} + +my $has_storable; + +# same story here, in miniperl we use slow perl code, +# in real perl we can use Storable and speed things up. +BEGIN { eval "use Storable; \$has_storable=1;" } + +# recursively copy an AoAoA... +sub copy_aoa { + my ($aoa)= @_; + if ($has_storable) { + return Storable::dclone($aoa); + } + else { + return _copy_aoa($aoa); + } +} + +sub _copy_aoa { + my ($thing)= @_; + if (ref $thing) { + return [ map { ref($_) ? _copy_aoa($_) : $_ } @$thing ]; + } + else { + return $thing; + } +} + +# return the number characters that should go in between a '#' and +# the name of a c preprocessor directive. Returns 0 spaces for level +# 0, and 2 * ($level - 1) + 1 spaces for the rest. (1,3,5, etc) +# This might sound weird, but consider these are tab *stops* and the +# '#' is included in the total. which means indents of 2, 4, 6 etc. +sub indent_chars { + my ($self, $level)= @_; + my $ind= ""; + $ind .= " " if $level; + $ind .= " " x ($level - 1) if $level > 1; + return $ind; +} + +# we use OO to store state, etc. +sub new { + my ($class, %args)= @_; + $args{add_commented_expr_after} //= 10; + $args{max_width} //= 78; + $args{min_break_width} //= 70; + return bless \%args,; +} + +# this parses the expression into an array of tokens +# this is somewhat crude, we could do this incrementally +# if we wanted and avoid the overhead. but it makes it +# easier to debug the tokenizer. +sub _tokenize_expr { + my ($self, $expr)= @_; + delete $self->{tokens}; + delete $self->{parse_tree}; + $self->{original_expr}= $expr; + + my @tokens; + while ($expr =~ s/$tokenize_pat//xs) { + push @tokens, {%+} if defined $+{'term'}; + } + $self->{tokens}= \@tokens; + warn $self->dd($self) if $self->{debug}; + if (length $expr) { + confess "Failed to tokenize_expr: $expr\n"; + } + return \@tokens; +} + +sub _count_ops { + my ($self, $term)= @_; + my $count= 0; + $count++ while $term =~ m/(?: \|\| | \&\& | \? )/gx; + return $count; +} + +# sort terms in an expression in a way that puts things +# in a sensible order. Anything starting with PERL_IN_ +# should be on the left in alphabetical order. Digits +# should be on the right (eg 0), and ties are resolved +# by stripping non-alpha-numerc, thus removing underbar +# parens, spaces, logical operators, etc, and then by +# lc comparison of the result. +sub _sort_terms { + my $self= shift; + my (@terms)= map { + [ + $_, # 0: raw + lc($_) =~ s/[^a-zA-Z0-9]//gr, # 1: "_" stripped and caseless + $_ =~ m/PERL_IN_/ ? 1 : 0, # 2: PERL_IN_ labeled define + $_ =~ m/^\d/ ? 1 : 0, # 3: digit + $_ =~ m/DEBUGGING/ ? 1 : 0, # 4: DEBUGGING? + $self->_count_ops($_), # 5: Number of ops (||, &&) + ] + } @_; + my %seen; + #start-no-tidy + @terms= map { $seen{ $_->[0] }++ ? () : $_->[0] } + sort { + $a->[5] <=> $b->[5] || # least number of ops + $b->[2] <=> $a->[2] || # PERL_IN before others + $a->[3] <=> $b->[3] || # digits after others + $a->[4] <=> $b->[4] || # DEBUGGING after all else + $a->[1] cmp $b->[1] || # stripped caseless cmp + lc($a->[0]) cmp lc($b->[0]) || # caseless cmp + $a->[0] cmp $b->[0] || # exact cmp + 0 + } @terms; + #end-no-tidy + return @terms; +} + +# normalize a condition expression by parsing it and then stringifying +# the parse tree. +sub tidy_cond { + my ($self, $expr)= @_; + my $ret= $self->{_tidy_cond_cache}{$expr} //= do { + $self->parse_expr($expr) if defined $expr; + my $text= $self->_pt_as_str(); + $text; + }; + $self->{last_expr}= $ret; + return $ret; +} + +# convert a parse tree structure to a string recursively. +# +# Parse trees are currently made up of arrays, with the count +# of items in the object determining the type of op it represents. +# 1 argument: literal value of some sort. +# 2 arguments: unary operator: 0 slot is the operator, 1 is a parse tree +# : ternary: 0 slot holds '?', 1 is an array holding three +# parse trees: cond, true, false +# 3 arguments or more: binary operator. 0 slot is the op. 1..n are parse trees +# : note, this is multigate for commutative operators like +# : "+", "*", "&&" and "||", so an expr +# : like "A && B && !C" would be represented as: +# : [ "&&", ["A"], ["B"], [ "!",["C"] ] ] +# +sub _pt_as_str { + my ($self, $node, $parent_op, $depth)= @_; + + $node ||= $self->{parse_tree} + or confess "No parse tree?"; + $depth ||= 0; + if (@$node == 1) { + + # its a literal + return $node->[0]; + } + elsif (@$node == 2) { + + # is this a ternary or an unop? + if ($node->[0] eq '?') { + + # ternary, the three "parts" are tucked away in + # an array in the payload slot + my $expr= + $self->_pt_as_str($node->[1][0], "?", $depth + 1) . " ? " + . $self->_pt_as_str($node->[1][1], "?", $depth + 1) . " : " + . $self->_pt_as_str($node->[1][2], "?", $depth + 1); + + # stick parens on if this is a subexpression + $expr= "( " . $expr . " )" if $depth; + return $expr; + } + else { + if ( $node->[0] eq "!" + and @{ $node->[1] } == 2 + and $node->[1][0] eq "!") + { + # normalize away !! in expressions. + return $self->_pt_as_str($node->[1][1], $parent_op, $depth); + } + + # unop - the payload is a optree + return $node->[0] + . $self->_pt_as_str($node->[1], $node->[0], $depth + 1); + } + } + + # if we get here we are dealing with a binary operator + # the nodes are not necessarily binary, as we "collect" + # the terms into a list, thus: A && B && C && D -> ['&&',A,B,C,D] + my ($op, @terms)= @$node; + + # convert the terms to strings + @terms= map { $self->_pt_as_str($_, $op, $depth + 1) } @terms; + + # sort them to normalize the subexpression + my $expr= + join " $op ", $associative{$op} + ? $self->_sort_terms(@terms) + : @terms; + + # stick parens on if this is a subexpression + $expr= "( " . $expr . " )" if $depth and !$cmpop{$op}; + + # and we are done. + return $expr; +} + +# Returns the precedence of an operator, returns 0 if there is no token +# or the next token is not an op, or confesss if it encounters an op it does not +# know. +sub _precedence { + my $self= shift; + my $token= shift // return 0; + + my $op= (ref $token ? $token->{op} : $token) // return 0; + + return $precedence{$op} // confess "Unknown op '$op'"; +} + +# entry point into parsing the tokens, checks that we actually parsed everything +# and didnt leave anything in the token stream (possible from a malformed expression) +# Performs some minor textual cleanups using regexes, but then does a proper parse +# of the expression. +sub parse_expr { + my ($self, $expr)= @_; + if (defined $expr) { + $expr =~ s/\s*\\\n\s*/ /g; + $expr =~ s/defined\s+(\w+)/defined($1)/g; + $self->_tokenize_expr($expr); + } + my $ret= $self->_parse_expr(); + if (@{ $self->{tokens} }) { + + # if all was well with parsing we should not get here. + confess "Unparsed tokens: ", $self->dd($self->{tokens}); + } + $self->{parse_tree}= $ret; + return $ret; +} + +# this is just a wrapper around _parse_expr_assoc() which handles +# parsing an arbitrary expression. +sub _parse_expr { + my ($self)= @_; + return $self->_parse_expr_assoc($self->_parse_expr_primary(), 1); +} + +# This handles extracting from the token stream +# - simple literals +# - unops (assumed to be right associative) +# - parens (which reset the precedence acceptable to the parser) +# +sub _parse_expr_primary { + my ($self)= @_; + my $tokens= $self->{tokens} + or confess "No tokens in _parse_expr_primary?"; + my $first= $tokens->[0] + or confess "No primary?"; + if ($first->{paren} and $first->{paren} eq "(") { + shift @$tokens; + my $expr= $self->_parse_expr(); + $first= $tokens->[0]; + if (!$first->{paren} or $first->{paren} ne ")") { + confess "Expecting close paren", $self->dd($tokens); + } + shift @$tokens; + return $expr; + } + elsif ($first->{op} and $unop{ $first->{op} }) { + my $op_token= shift @$tokens; + return [ $op_token->{op}, $self->_parse_expr_primary() ]; + } + elsif (defined $first->{literal}) { + shift @$tokens; + return [ $first->{literal} ]; + } + else { + die sprintf + "Unexpected token '%s', expecting literal, unary, or expression.\n", + $first->{term}; + } +} + +# This is the heart of the expression parser. It uses +# a pair of nested loops to avoid excessive recursion during parsing, +# which should be a bit faster than other strategies. It only should +# recurse when the precedence level changes. +sub _parse_expr_assoc { + my ($self, $lhs, $min_precedence)= @_; + my $tokens= $self->{tokens} + or confess "No tokens in _parse_expr_assoc"; + my $la= $tokens->[0]; # lookahead + my $la_pr= $self->_precedence($la); # lookahead precedence + while ($la && $la_pr >= $min_precedence) { + my $op_token= shift @$tokens; + my $op_pr= $la_pr; # op precedence + if ($op_token->{op} eq "?") { + my $mid= $self->_parse_expr(); + if (@$tokens and $tokens->[0]{op} and $tokens->[0]{op} eq ":") { + shift @$tokens; + my $tail= $self->_parse_expr(); + return [ '?', [ $lhs, $mid, $tail ] ]; + } + confess "Panic: expecting ':'", $self->dd($tokens); + } + my $rhs; + eval { $rhs= $self->_parse_expr_primary(); } + or die "Error in $op_names{$op_token->{op}} expression: $@"; + $la= $tokens->[0]; + $la_pr= $self->_precedence($la); + while ( + $la_pr > $op_pr || # any and larger + ( $is_right_assoc{ $op_token->{op} } + and $la_pr == $op_pr) # right and equal + ) { + my $new_precedence= $op_pr + ($la_pr > $op_pr ? 1 : 0); + $rhs= $self->_parse_expr_assoc($rhs, $new_precedence); + $la= $tokens->[0]; + $la_pr= $self->_precedence($la); + } + if ( @$lhs >= 3 + && $lhs->[0] eq $op_token->{op} + && $commutative{ $op_token->{op} }) + { + push @$lhs, $rhs; + } + else { + my @lt= ($lhs); + my @rt= ($rhs); + + # if we have '( a && b ) && ( c && d)' + # turn it into 'a && b && c && d' + if (@$lhs > 2 && $lhs->[0] eq $op_token->{op}) { + (undef,@lt)= @$lhs; # throw away op. + } + if (@$rhs > 2 && $rhs->[0] eq $op_token->{op}) { + (undef,@rt)= @$rhs; # throw away op. + } + $lhs= [ $op_token->{op}, @lt, @rt ]; + } + } + return $lhs; +} + +#entry point for normalizing and if/elif statements +#returns the line and condition in normalized form. +sub normalize_if_elif { + my ($self, $line, $line_info)= @_; + if (my $dat= $self->{cache_normalize_if_elif}{$line}) { + return $dat->{line}, $dat->{cond}; + } + my ($cond); + eval { + ($line, $cond)= $self->_normalize_if_elif($line); + 1; + } or die sprintf "Error at line %d\nLine %d: %s\n%s", + ($line_info->start_line_num()) x 2, $line, $@; + $self->{cache_normalize_if_elif}{$line}= { line => $line, cond => $cond }; + return ($line, $cond); +} + +#guts of the normalize_if_elif() - cleans up the line, extracts +#the condition, and then tidies it with tidy_cond(). +sub _normalize_if_elif { + my ($self, $line)= @_; + my $nl= ""; + $nl= $1 if $line =~ s/(\n+)\z//; + $line =~ s/\s+\z//; + my @comment; + push @comment, $1 while $line =~ s!\s*(/\*.*?\*/)\z!!; + $line =~ s/defined\s*\(\s*(\w+)\s*\)/defined($1)/g; + $line =~ s/!\s+defined/!defined/g; + + if ($line =~ /^#((?:el)?if)(n?)def\s+(\w+)/) { + my $if= $1; + my $not= $2 ? "!" : ""; + $line= "#$if ${not}defined($3)"; + } + $line =~ s/#((?:el)?if)\s+// + or confess "Bad cond: $line"; + my $if= $1; + $line =~ s/!\s+/!/g; + + my $old_cond= $line; + my $cond= $self->tidy_cond($old_cond); + + warn "cond - $old_cond\ncond + $cond\n" + if $old_cond ne $cond and $self->{debug}; + + $line= "#$if $cond"; + $line .= " " . join " ", reverse @comment if @comment; + + $line .= $nl; + return ($line, $cond); +} + +# parses a text buffer as though it was a file on disk +# calls parse_fh() +sub parse_text { + my ($self, $text)= @_; + local $self->{parse_source}= "(buffer)"; + open my $fh, "<", \$text + or die "Failed to open buffer for read: $!"; + return $self->parse_fh($fh); +} + +# takes a readable filehandle and parses whatever contents is +# returned by reading it. Returns an array of HeaderLine objects. +# this is the main routing for parsing a header file. +sub parse_fh { + my ($self, $fh)= @_; + my @lines; + my @cond; + my @cond_line; + my $last_cond; + local $self->{parse_source}= $self->{parse_source} || "(unknown)"; + my $cb= $self->{pre_process_content}; + $self->{orig_content}= ""; + my $line_num= 1; + + while (defined(my $line= readline($fh))) { + my $start_line_num= $line_num++; + $self->{orig_content} .= $line; + while ($line =~ /\\\n\z/ or $line =~ m</\*(?:(?!\*/).)*\s*\z>s) { + defined(my $read_line= readline($fh)) + or last; + $self->{orig_content} .= $read_line; + $line_num++; + $line .= $read_line; + } + while ($line =~ m!/\*(.*?)(\*/|\z)!gs) { + my ($inner, $tail)= ($1, $2); + if ($tail ne "*/") { + confess + "Unterminated comment starting at line $start_line_num\n"; + } + elsif ($inner =~ m!/\*!) { + confess + "Nested/broken comment starting at line $start_line_num\n"; + } + } + + my $raw= $line; + my $type= "content"; + my $sub_type= "text"; + my $level= @cond; + my $do_pop= 0; + my $flat= $line; + $flat =~ s/\s*\\\n\s*/ /g; + $flat =~ s!/\*.*?\*/! !gs; + $flat =~ s/\s+/ /g; + $flat =~ s/\s+\z//; + $flat =~ s/^\s*#\s*/#/g; + + my $line_info= + HeaderLine->new(start_line_num => $start_line_num, raw => $raw); + my $do_cond_line; + if ($flat =~ /^#/) { + if ($flat =~ m/^(#(?:el)?if)(n?)def\s+(\w+)/) { + my $if= $1; + my $not= $2 ? "!" : ""; + my $sym= $3; + $flat =~ + s/^(#(?:el)?if)(n?)def\s+(\w+)/$if ${not}defined($sym)/; + } + my $cond; # used in various expressions below + if ($flat =~ /^#endif/) { + if (!@cond) { + confess "Not expecting $flat"; + } + $do_pop= 1; + $level--; + $type= "cond"; + $sub_type= "#endif"; + } + elsif ($flat =~ /^#if\b/) { + ($flat, $cond)= $self->normalize_if_elif($flat, $line_info); + push @cond, [$cond]; + push @cond_line, $line_info; + $type= "cond"; + $sub_type= "#if"; + } + elsif ($flat =~ /^#elif\b/) { + if (!@cond) { + confess "No if for $flat"; + } + $level--; + ($flat, $cond)= $self->normalize_if_elif($flat, $line_info); + $cond[-1][-1]= $self->tidy_cond("!($cond[-1][-1])"); + $cond_line[-1]= $line_info; + push @{ $cond[-1] }, $cond; + $type= "cond"; + $sub_type= "#elif"; + } + elsif ($flat =~ /^#else\b/) { + if (!@cond) { + confess "No if for $flat"; + } + $level--; + $cond[-1][-1]= $self->tidy_cond("!($cond[-1][-1])"); + $cond_line[-1]= $line_info; + $type= "cond"; + $sub_type= "#else"; + } + elsif ($flat =~ /#undef/) { + $type= "content"; + $sub_type= "#undef"; + } + elsif ($flat =~ /#pragma\b/) { + $type= "content"; + $sub_type= "#pragma"; + } + elsif ($flat =~ /#include\b/) { + $type= "content"; + $sub_type= "#include"; + } + elsif ($flat =~ /#define\b/) { + $type= "content"; + $sub_type= "#define"; + } + elsif ($flat =~ /#error\b/) { + $type= "content"; + $sub_type= "#error"; + } + else { + confess "Do not know what to do with $line"; + } + if ($type eq "cond") { + + # normalize conditional lines + $line= $flat; + $last_cond= $line_info; + } + } + $line =~ s/\n?\z/\n/; + + %$line_info= ( + cond => copy_aoa(\@cond), + type => $type, + sub_type => $sub_type, + raw => $raw, + flat => $flat, + line => $line, + level => $level, + source => $self->{parse_source}, + start_line_num => $start_line_num, + n_lines => $line_num - $start_line_num, + ); + + push @lines, $line_info; + if ($do_pop) { + $line_info->{inner_lines}= + $line_info->start_line_num - $cond_line[-1]->start_line_num; + pop @cond; + pop @cond_line; + } + if ($type eq "content" and $cb) { + $cb->($self, $lines[-1]); + } + } + if (@cond_line) { + my $msg= "Unterminated conditional block starting line " + . $cond_line[-1]->start_line_num(); + $msg .= + " with last conditional operation at line " + . $last_cond->start_line_num() + if $cond_line[-1] != $last_cond; + confess $msg; + } + $self->{lines}= \@lines; + return \@lines; +} + +# returns the last lines we parsed. +sub lines { $_[0]->{lines} } + +# assuming a line looks like an embed.fnc entry parse it +# and normalize it, and create and EmbedLine object from it. +sub tidy_embed_fnc_entry { + my ($self, $line_data)= @_; + my $line= $line_data->{line}; + return $line if $line =~ /^\s*:/; + return $line unless $line_data->{type} eq "content"; + return $line unless $line =~ /\|/; + + $line =~ s/\s*\\\n/ /g; + $line =~ s/\s+\z//; + ($line)= expand($line); + my ($flags, $ret, $name, @args)= split /\s*\|\s*/, $line; + my %flag_seen; + $flags= join "", grep !$flag_seen{$_}++, sort split //, $flags; + if ($flags =~ s/^#//) { + $flags .= "#"; + } + if ($flags eq "#") { + die "Not allowed to use only '#' for flags" + . "in 'embed.fnc' at line $line_data->{start_line_num}"; + } + if (!$flags) { + die "Missing flags in function definition" + . " in 'embed.fnc' at line $line_data->{start_line_num}\n" + . "Did you a forget a line continuation on the previous line?\n"; + } + for ($ret, @args) { + s/(\w)\*/$1 */g; + s/\*\s+(\w)/*$1/g; + s/\*const/* const/g; + } + my $head= sprintf "%-8s|%-7s", $flags, $ret; + $head .= sprintf "|%*s", -(31 - length($head)), $name; + if (@args and length($head) > 32) { + $head .= "\\\n"; + $head .= " " x 32; + } + foreach my $ix (0 .. $#args) { + my $arg= $args[$ix]; + $head .= "|$arg"; + $head .= "\\\n" . (" " x 32) if $ix < $#args; + } + $line= $head . "\n"; + + if ($line =~ /\\\n/) { + my @lines= split /\s*\\\n/, $line; + my $len= length($lines[0]); + $len < length($_) and $len= length($_) for @lines; + $len= int(($len + 7) / 8) * 8; + $len= 72 if $len < 72; + $line= join("\\\n", + (map { sprintf "%*s", -$len, $_ } @lines[ 0 .. $#lines - 1 ]), + $lines[-1]); + } + ($line)= unexpand($line); + + $line_data->{embed}= EmbedLine->new( + flags => $flags, + return_type => $ret, + name => $name, + args => \@args, + ); + $line =~ s/\s+\z/\n/; + $line_data->{line}= $line; + return $line; +} + +# line up the text in a multiline string by a given $fragment +# of text, inserting whitespace in front or behind the $fragment +# to get the text to line up. Returns the text. This is wrapped +# by line_up() and is used to wrap long conditions and comments +# in the generated code. +sub _line_up_frag { + my ($self, $str, $fragment)= @_; + die "has tabs?!" if $str =~ /\t/; + my @lines= split /\n/, $str; + my $changed= 1; + while ($changed) { + $changed= 0; + foreach my $ix (0 .. $#lines - 1) { + my $f_index= 0; + my $n_index= 0; + while (1) { + $f_index= index($lines[$ix], $fragment, $f_index); + $n_index= index($lines[ $ix + 1 ], $fragment, $n_index); + if ($f_index == -1 or $n_index == -1) { + last; + } + if ($f_index < $n_index) { + my $f_idx= $f_index; + $f_idx-- while substr($lines[$ix], $f_idx, 1) ne " "; + substr($lines[$ix], $f_idx, 0, " " x ($n_index - $f_index)); + $changed++; + last; + } + elsif ($n_index < $f_index) { + my $n_idx= $n_index; + $n_idx-- while substr($lines[ $ix + 1 ], $n_idx, 1) ne " "; + substr($lines[ $ix + 1 ], + $n_idx, 0, " " x ($f_index - $n_index)); + $changed++; + last; + } + $f_index++; + $n_index++; + } + } + } + my $ret= join "", map { "$_\n" } @lines; + return $ret; +} + +sub _fixup_indent { + my ($self, $line)= @_; + my @lines= split /\n/, $line; + if ($lines[0]=~/^(#\s*\w+(?:\s*\/\*)?\s)(\s+)/) { + my $first_left_len = length $1; + + while (1) { + my $ok = 1; + for (@lines) { + /^.{$first_left_len} / + or do { $ok = 0; last; }; + } + if ($ok) { + s/^(.{$first_left_len}) /$1/ for @lines; + } else { + last; + } + } + } + + if ($lines[0]=~/^(#\s*\w+\s+)\(/) { + my $len = length($1); + for my $idx (1..$#lines) { + $lines[$idx]=~s/^([ ]{$len})(\s+)(\()/$1$3$2/; + } + } + my $ret= join "", map { "$_\n" } @lines; + return $ret; +} + +# this is the workhorse for _break_line_at_op(). +sub __break_line_at_op { + my ($self, $limit, $line, $blank_prefix)= @_; + my @lines= (""); + while (length $line) { + my $part; + if ($line =~ s/^(.*?(?:\|\||&&)\s+)//) { + $part= $1; + } + else { + $part= $line; + $line= ""; + } + if (length($lines[-1]) + length($part) < $limit) { + $lines[-1] .= $part; + } + else { + push @lines, $blank_prefix . $part; + } + } + return \@lines; +} + +# Break a condition line into parts, while trying to keep the last +# token on each line being an operator like || or && or ? or : We try +# to keep each line at $limit characters, however, we also try to +# ensure that each line has the same number of operators on it such +# that across all the lines there are only two counts of operators (eg, +# we either way each line to have two operators on it, or 0, or 1 or 0, +# or 2 or 1, and so on.) If we cannot meet this requirement we reduce +# the limit by 1 and try again, until we meet the objective, or the +# limit ends up at 70 chars or less. +sub _break_line_at_op { + my ($self, $limit, $line, $blank_prefix)= @_; + my $lines; + while (1) { + $lines= $self->__break_line_at_op($limit, $line, $blank_prefix); + my %op_counts; + foreach my $line_idx (0 .. $#$lines) { + my $line= $lines->[$line_idx]; + my $count= 0; + $count++ while $line =~ /(\|\||&&|\?|:)/g; + $op_counts{$count}++; + + } + if ($limit <= $self->{min_break_width} || keys(%op_counts) <= 2) { + last; + } + $limit--; + } + + s/\s*\z/\n/ for @$lines; + return join "", @$lines; +} + +sub _max { # cant use Scalar::Util so we roll our own + my $max= shift; + $max < $_ and $max= $_ for @_; + return $max; +} + +# take a condition, split into $type and $rest +# wrap it, and try to line up operators and defined() functions +# that it contains. This is rather horrible code, but it does a +# reasonable job applying the heuristics we need to lay our the +# conditions in a reasonable way. +sub _wrap_and_line_up_cond { + my ($self, $type, $rest)= @_; + + my $limit= $self->{max_width}; + + # extract the expression part of the line, and normalize it, we do + # this here even though it might be duplicative as it is possible + # that the caller code has munged the expression in some way, and we + # might want to simplify the expression first. Eg: + # 'defined(FOO) && (defined(BAR) && defined(BAZ))' should be turned into + # 'defined(FOO) && defined(BAR) && defined(BAZ)' if possible. + my $rest_head= ""; + my $rest_tail= ""; + if ($rest =~ s!(if\s+)!!) { + $rest_head= $1; + } + if ($rest =~ s!(\s*/\*.*?\*/)\s*\z!! || $rest =~ s!(\s*\*/\s*)\z!!) { + $rest_tail= $1; + } + if ($rest) { + $rest= $self->tidy_cond($rest); + $rest= $rest_head . $rest . $rest_tail; + } + + my $l= length($type); + my $line= $type; + $line .= $rest if length($rest); + my $blank_prefix= " " x $l; + + # at this point we have a single line with the entire expression on it + # if it fits on one line we are done, we can return it right away. + if (length($line) <= $limit) { + $line =~ s/\s*\z/\n/; + return $line; + } + my $rest_copy= $rest; + my @fragments; + my $op_pat= qr/(?:\|\||&&|[?:])/; + + # does the $rest contain a parenthesized group? If it does then + # there are a mixture of different ops being used, as if it was all + # the same opcode there would not be a parenthesized group. + # If it does then we handle it differently, and try to put the + # different parts of the expression on their own line. + if ($rest_copy =~ /$op_pat\s*\(/) { + my @parts; + while (length $rest_copy) { + if ($rest_copy =~ s/^(.*?$op_pat)(\s*!?\()/$2/) { + push @parts, $1; + } else { + #$rest_copy=~s/^\s+//; + push @parts, $rest_copy; + last; + } + } + $parts[0]= $type . $parts[0]; + $parts[$_]= $blank_prefix . $parts[$_] for 1 .. $#parts; + foreach my $line (@parts) { + if (length($line) > $limit) { + $line= $self->_break_line_at_op($limit, $line, $blank_prefix); + } + } + s/\s*\z/\n/ for @parts; + $line= join "", @parts; + @fragments= ("defined", "||"); + } + else { + # the expression consists of just one opcode type, so we can use + # simpler logic to break it apart with the objective of ensuring + # that the lines are similarly formed with trailing operators on + # each line but the last. + @fragments= ("||", "defined"); + $line= $self->_break_line_at_op($limit, $type . $rest, $blank_prefix); + } + + # try to line up the text on different lines. We stop after + # the first $fragment that modifies the text. The order + # of fragments we try is determined above based on the type + # of condition this is. + my $pre_line= $line; + foreach my $fragment (@fragments) { + $line= $self->_line_up_frag($line, $fragment); + last if $line ne $pre_line; + } + + # if we have lined up by "defined" in _line_up_frag() + # then we may have " || defined(...)" type expressions + # convert these to " || defined(...)" as it looks better. + $line =~ s/( )(\|\||&&|[()?:])([ ]{2,})(!?defined)/$3$2$1$4/g; + $line =~ s/(\|\||&&|[()?:])[ ]{10,}/$1 /g; + + # add back the line continuations. this is all pretty inefficient, + # but it works nicely. + my @lines= split /\n/, $line; + my $last= pop @lines; + my $max_len= _max(map { length $_ } @lines); + $_= sprintf "%*s \\\n", -$max_len, $_ for @lines; + $last .= "\n"; + + $line= join "", @lines, $last; + + # remove line continuations that are inside of a comment, + # we may have a variable number of lines of the expression + # or parts of lines of the expression in a comment, so + # we do this last. + $line =~ s!/\* (.*) \*/ + !"/*"._strip_line_cont("$1")."*/"!xsge; + + return $self->_fixup_indent($line); +} + +#remove line continuations from the argument. +sub _strip_line_cont { + my ($string)= @_; + $string =~ s/\s*\\\n/\n/g; + return $string; +} + +# Takes an array of HeaderLines objects produced by parse_fh() +# or by group_content(), and turn it into a string. +sub lines_as_str { + my ($self, $lines, $post_process_content)= @_; + $lines ||= $self->{lines}; + my $ret; + $post_process_content ||= $self->{post_process_content}; + my $filter= $self->{filter_content}; + my $last_line= ""; + + #warn $self->dd($lines); + foreach my $line_data (@$lines) { + my $line= $line_data->{line}; + if ($line_data->{type} ne "content" or $line_data->{sub_type} ne "text") + { + my $level= $line_data->{level}; + my $ind= $self->indent_chars($level); + $line =~ s/^#(\s*)/#$ind/; + } + if ($line_data->{type} eq "cond") { + my $add_commented_expr_after= $self->{add_commented_expr_after}; + if ($line_data->{sub_type} =~ /#(?:else|endif)/) { + my $joined= join " && ", + map { "($_)" } @{ $line_data->{cond}[-1] }; + my $cond_txt= $self->tidy_cond($joined); + $cond_txt= "if $cond_txt" if $line_data->{sub_type} eq "#else"; + $line =~ s!\s*\z! /* $cond_txt */\n! + if $line_data->{inner_lines} >= $add_commented_expr_after; + } + elsif ($line_data->{sub_type} eq "#elif") { + my $last_frame= $line_data->{cond}[-1]; + my $joined= join " && ", + map { "($_)" } @$last_frame[ 0 .. ($#$last_frame - 1) ]; + my $cond_txt= $self->tidy_cond($joined); + $line =~ s!\s*\z! /* && $cond_txt */\n! + if $line_data->{inner_lines} >= $add_commented_expr_after; + } + } + $line =~ s/\s*\z/\n/; + if ($last_line eq "\n" and $line eq "\n") { + next; + } + $last_line= $line; + if ($line_data->{type} eq "cond") { + $line =~ m!(^\s*#\s*\w+[ ]*)([^/].*?\s*)?(/\*.*)?\n\z! + or die "Failed to split cond line: $line"; + my ($type, $cond, $comment)= ($1, $2, $3); + $comment //= ""; + $cond //= ""; + my $new_line; + if (!length($cond) and $comment) { + $comment =~ s!^(/\*\s+)!! + and $type .= $1; + } + + $line= $self->_wrap_and_line_up_cond($type, $cond . $comment); + } + $line_data->{line}= $line; + if ($post_process_content and $line_data->{type} eq "content") { + $post_process_content->($self, $line_data); + } + if ($filter and $line_data->{type} eq "content") { + $filter->($self, $line_data) or next; + } + $ret .= $line_data->{line}; + } + return $ret; +} + +# Text::Wrap::wrap has an odd api, so hide it behind a wrapper +# sub which sets things up properly. +sub _my_wrap { + my ($head, $rest, $line)= @_; + local $Text::Wrap::unexpand= 0; + local $Text::Wrap::huge= "overflow"; + local $Text::Wrap::columns= 78; + unless (length $line) { return $head } + $line= wrap $head, $rest, $line; + return $line; +} + +# recursively extract the && expressions from a parse tree, +# returning the result as strings. +# if $node is not a '&&' op then it returns $node as a string, +# otherwise it returns the string form of the arguments to the +# '&&' op, recursively flattening any '&&' nodes that it might +# contain. +sub _and_clauses { + my ($self, $node)= @_; + + my @ret; + if (@$node < 3 or $node->[0] ne "&&") { + return $self->_pt_as_str($node); + } + foreach my $idx (1 .. $#$node) { + push @ret, $self->_and_clauses($node->[$idx]); + } + return @ret; +} + +# recursively walk the a parse tree, and return the literal +# terms it contains, ignoring any operators in the optree. +sub _terms { + my ($self, $node)= @_; + if (@$node == 1) { + return $self->_pt_as_str($node); + } + my @ret; + if (@$node == 2) { + if ($node->[0] eq "?") { + push @ret, map { $self->_terms($_) } @{ $node->[1] }; + } + else { + push @ret, $self->_terms($node->[1]); + } + } + else { + foreach my $i (1 .. $#$node) { + push @ret, $self->_terms($node->[$i]); + } + } + return @ret; +} + +# takes a HeaderLine "cond" AoA and flattens it into +# a single expression, and then extracts all the and clauses +# it contains. Thus [['defined(A)'],['defined(B)']] and +# [['defined(A) && defined(B)']], end up as ['defined(A)','defined(B)'] +sub _flatten_cond { + my ($self, $cond_ary)= @_; + + my $expr= join " && ", map { + map { "($_)" } + @$_ + } @$cond_ary; + return [] unless $expr; + my $tree= $self->parse_expr($expr); + my %seen; + my @and_clause= grep { !$seen{$_}++ } $self->_and_clauses($tree); + return \@and_clause; +} + +# Find the best path into a tree of conditions, such that +# we reuse the maximum number of existing branches. Returning +# two arrays, the first contain the parts of $cond_array that +# make up the best path, in the best path order, and a second array +# with the remaining items in the initial order they were provided. +# Thus if we have previously stored only the path "A", "B", "C" +# into the tree, and want to find the best path for +# ["E","D","C","B","A"] we should return: ["A","B","C"],["E","D"], +# +# This used to reduce the number of conditions in the grouped content, +# and is especially helpful with dealing with DEBUGGING related +# functionality. It is coupled with careful control over the order +# that we add paths and conditions to the tree. +sub _best_path { + my ($self, $tree_node, $cond_array, @path)= @_; + my $best= \@path; + my $rest= $cond_array; + foreach my $cond (@$cond_array) { + if ($tree_node->{$cond}) { + my ($new_best, $new_rest)= + $self->_best_path($tree_node->{$cond}, + [ grep $_ ne $cond, @$cond_array ], + @path, $cond); + if (@$new_best > @$best) { + ($best, $rest)= ($new_best, $new_rest); + } + } + } + if (@$best == @path) { + foreach my $cond (@$cond_array) { + my $not_cond= $self->tidy_cond("!($cond)"); + if ($tree_node->{$not_cond}) { + $best= [ @path, $cond ]; + $rest= [ grep $_ ne $cond, @$cond_array ]; + last; + } + } + } + return ($best, $rest); +} + +# This builds a group content tree from a set of lines. each content line in +# the original file is added to the file based on the conditions that apply to +# the content. +# +# The tree is made up of nested HoH's with keys in the HoH being normalized +# clauses from the {cond} data in the HeaderLine objects. +# +# Care is taken to minimize the number of pathways and to reorder clauses to +# reuse existing pathways and minimize the total number of conditions in the +# file. +# +# The '' key of a hash contains an array of the lines that are part of the +# condition that lead to that key. Thus lines with no conditions are in +# @{$tree{''}}, lines with the condition "defined(A) && defined(B)" would be +# in $tree{"defined(A)"}{"defined(B)"}{""}. +# +# The result of this sub is normally passed into __recurse_group_content_tree() +# which converts it back into a set of HeaderLine objects. +# +sub _build_group_content_tree { + my ($self, $lines)= @_; + $lines ||= $self->{lines}; + my $filter= $self->{filter_content}; + my %seen_normal; + foreach my $line_data (@$lines) { + next if $line_data->{type} ne "content"; + next if $filter and !$filter->($self, $line_data); + my $cond_frames= $line_data->{cond}; + my $cond_frame= $self->_flatten_cond($cond_frames); + my $flat_merged= join " && ", map "($_)", @$cond_frame; + my $normalized; + if (@$cond_frame) { + $normalized= $self->tidy_cond($flat_merged); + } + else { + $normalized= $flat_merged; # empty string + } + push @{ $seen_normal{$normalized} }, $line_data; + } + my @debugging; + my @non_debugging; + foreach my $key (keys %seen_normal) { + if ($key =~ /DEBUGGING/) { + push @debugging, $key; + } + else { + push @non_debugging, $key; + } + } + @non_debugging= + sort { length($a) <=> length($b) || $a cmp $b } @non_debugging; + @debugging= sort { length($b) <=> length($a) || $a cmp $b } @debugging; + my %tree; + foreach my $normal_expr (@non_debugging, @debugging) { + my $all_line_data= $seen_normal{$normal_expr}; + + my $cond_frame= + (length $normal_expr) + ? $self->_flatten_cond([ [$normal_expr] ]) + : []; + @$cond_frame= $self->_sort_terms(@$cond_frame); + my $node= \%tree; + my ($best, $rest)= $self->_best_path($node, $cond_frame); + die sprintf "Woah: %d %d %d", 0 + @$best, 0 + @$rest, 0 + @$cond_frame + unless @$best + @$rest == @$cond_frame; + + foreach my $cond (@$best, @$rest) { + $node= $node->{$cond} ||= {}; + } + push @{ $node->{''} }, @$all_line_data; + } + + warn $self->dd(\%tree) if $self->{debug}; + $self->{tree}= \%tree; + return \%tree; +} + +sub _recurse_group_content_tree { + my ($self, $node, @path)= @_; + + my @ret; + local $self->{rgct_ret}= \@ret; + local $self->{line_by_depth}= []; + + $self->__recurse_group_content_tree($node, @path); + return \@ret; +} + +# convert a tree of conditions constructed by _build_group_content_tree() +# and turn it into a set of HeaderLines that represents it. Performs the +# appropriate sets required to reconstitute an if/elif/elif/else sequence +# by calling _handle_else(). +sub __recurse_group_content_tree { + my ($self, $node, @path)= @_; + my $depth= 0 + @path; + my $ind= $self->indent_chars($depth); + my $ret= $self->{rgct_ret}; + if ($node->{''}) { + if (my $cb= $self->{post_process_grouped_content}) { + $cb->($self, $node->{''}, \@path); + } + if (my $cb= $self->{post_process_content}) { + $cb->($self, $_, \@path) for @{ $node->{''} }; + } + push @$ret, map { + HeaderLine->new( + %$_, + cond => [@path], + level => $depth, + start_line_num => 0 + @$ret + ) + } @{ $node->{''} }; + } + + my %skip; + foreach my $expr ( + map { $_->[0] } + sort { $a->[1] cmp $b->[1] || $a->[0] cmp $b->[0] } + map { [ $_, lc($_) =~ s/[^A-Za-z0-9]+//gr ] } keys %$node + ) { + next unless length $expr; # ignore payload + my $not= $self->tidy_cond("!($expr)"); + if ($skip{$expr} or ($not !~ /^!/ and $node->{$not})) { + next; + } + my $kid= $node->{$expr}; + while (!$node->{$not} and keys(%$kid) == 1 and !$kid->{''}) { + my ($kid_key)= keys(%$kid); + $expr= $self->tidy_cond("($expr) && ($kid_key)"); + $kid= $kid->{$kid_key}; + my $new_not= $self->tidy_cond("!($expr)"); + if ($node->{$new_not}) { + $not= $new_not; + $skip{$not}++; + } + } + my $raw= "#${ind}if $expr\n"; + my $hl= HeaderLine->new( + type => "cond", + sub_type => "#if", + raw => $raw, + line => $raw, + level => $depth, + cond => [ @path, [$expr] ], + start_line_num => 0 + @$ret, + ); + $self->{line_by_depth}[$depth]= 0 + @$ret; + push @$ret, $hl; + $self->__recurse_group_content_tree($kid, @path, [$expr]); + if ($node->{$not}) { + $skip{$not}++; + $self->_handle_else($not, $node->{$not}, $ind, $depth, @path, + [$not]); + } + + # and finally the #endif + + $raw= "#${ind}endif\n"; + + # we need to extract the condition information from the last line in @ret, + # as we don't know which condition we are ending here. It could be an elsif + # from deep in the parse tree for instance. + # So we need to extract the last frame from the cond structure in the last + # line-info in @ret. + # BUT if this last line is itself an #endif, then we need to take the second + # to last line instead, as the endif would have "popped" that frame off the + # condition stack. + my $last_ret= $ret->[-1]; + my $idx= + ($last_ret->{type} eq "cond" && $last_ret->{sub_type} eq "#endif") + ? -2 + : -1; + my $end_line= HeaderLine->new( + type => "cond", + sub_type => "#endif", + raw => $raw, + line => $raw, + level => $depth, + cond => [ @path, $last_ret->{cond}[$idx] ], + start_line_num => 0 + @$ret, + inner_lines => @$ret - $self->{line_by_depth}[$depth], + ); + undef $self->{line_by_depth}[$depth]; + push @$ret, $end_line; + } + return $ret; +} + +# this handles the specific case of an else clause, detecting +# when an elif can be constructed, may recursively call itself +# to deal with if/elif/elif/else chains. Calls back into +# __recurse_group_content_tree(). +sub _handle_else { + my ($self, $not, $kid, $ind, $depth, @path)= @_; + + # extract the first 3 keys - from this we can detect + # which of the three scenarios we have to handle. + my ($k1, $k2, $k3)= + sort { length($a) <=> length($b) || $a cmp $b } keys %$kid; + my $not_k1; + if (length($k1) and defined($k2) and !defined($k3)) { + + # if we do not have a payload (length($k1)) and we have exactly + # two keys (defined($k2) and !defined($k3)) we need to compute + # the inverse of $k1, which we will use later. + $not_k1= $self->tidy_cond("!($k1)"); + } + my $ret= $self->{rgct_ret}; + if (length($k1) and !defined($k2)) { + + # only one child, no payload -> elsif $k1 + my $sub_expr; + do { + $sub_expr= + !$sub_expr + ? $k1 + : $self->tidy_cond("($sub_expr) && ($k1)"); + $kid= $kid->{$k1}; + ($k1, $k2)= + sort { length($a) <=> length($b) || $a cmp $b } keys %$kid; + } while length($k1) and !defined $k2; + + my $raw= "#${ind}elif $sub_expr\n"; + push @{ $path[-1] }, $sub_expr; + my $hl= HeaderLine->new( + type => "cond", + sub_type => "#elif", + raw => $raw, + line => $raw, + level => $depth, + cond => [ map { [@$_] } @path ], + start_line_num => 0 + @$ret, + inner_lines => @$ret - $self->{line_by_depth}[$depth], + ); + $self->{line_by_depth}[$depth]= 0 + @$ret; + push @$ret, $hl; + $self->__recurse_group_content_tree($kid, @path); + } + elsif (defined($not_k1) and $not_k1 eq $k2) { + + # two children which are complementary, no payload -> elif $k1 else.. + my $raw= "#${ind}elif $k1\n"; + + push @{ $path[-1] }, $k1; + my $hl= HeaderLine->new( + type => "cond", + sub_type => "#elif", + raw => $raw, + line => $raw, + level => $depth, + cond => [ map { [@$_] } @path ], + start_line_num => 0 + @$ret, + inner_lines => @$ret - $self->{line_by_depth}[$depth], + ); + $self->{line_by_depth}[$depth]= 0 + @$ret; + push @$ret, $hl; + $self->__recurse_group_content_tree($kid->{$k1}, @path); + $path[-1][-1]= $k2; + $self->_handle_else($k2, $kid->{$k2}, $ind, $depth, @path); + } + else { + # payload, 3+ children, or 2 which are not complementary -> else + my $raw= "#${ind}else\n"; + my $hl= HeaderLine->new( + type => "cond", + sub_type => "#else", + raw => $raw, + line => $raw, + level => $depth, + cond => [ map { [@$_] } @path ], + start_line_num => 0 + @$ret, + inner_lines => @$ret - $self->{line_by_depth}[$depth], + ); + $self->{line_by_depth}[$depth]= 0 + @$ret; + push @$ret, $hl; + $self->__recurse_group_content_tree($kid, @path); + } + return $ret; +} + +# group the content in lines by the condition that apply to them +# returns a set of lines representing the new structure +sub group_content { + my ($self, $lines, $filter)= @_; + $lines ||= $self->{lines}; + local $self->{filter_content}= $filter || $self->{filter_content}; + my $tree= $self->_build_group_content_tree($lines); + return $self->_recurse_group_content_tree($tree); +} + +#read a file by name - opens the file and passes the fh into parse_fh(). +sub read_file { + my ($self, $file_name, $callback)= @_; + $self= $self->new() unless ref $self; + local $self->{parse_source}= $file_name; + open my $fh, "<", $file_name + or confess "Failed to open '$file_name' for read: $!"; + my $lines= $self->parse_fh($fh); + if ($callback) { + foreach my $line (@$lines) { + $callback->($self, $line); + } + } + return $self; +} + +# These are utility methods for the HeaderLine objects. +sub HeaderLine::new { + my ($class, %self)= @_; + return bless \%self, $class; +} +sub HeaderLine::cond { $_[0]->{cond} } # AoA +sub HeaderLine::type { $_[0]->{type} } +sub HeaderLine::type_is { return $_[0]->type eq $_[1] ? 1 : 0 } +sub HeaderLine::sub_type { $_[0]->{sub_type} } +sub HeaderLine::sub_type_is { return $_[0]->sub_type eq $_[1] ? 1 : 0 } +sub HeaderLine::raw { $_[0]->{raw} } +sub HeaderLine::flat { $_[0]->{flat} } +sub HeaderLine::line { $_[0]->{line} } +sub HeaderLine::level { $_[0]->{level} } +sub HeaderLine::is_content { return $_[0]->type_is("content") } +sub HeaderLine::is_cond { return $_[0]->type_is("cond") } +sub HeaderLine::is_define { return $_[0]->sub_type_is("#define") } +sub HeaderLine::line_num { $_[0]->{start_line_num} } +sub HeaderLine::inner_lines { $_[0]->{inner_lines} } +sub HeaderLine::n_lines { $_[0]->{n_lines} } +sub HeaderLine::embed { $_[0]->{embed} } +*HeaderLine::start_line_num= *HeaderLine::line_num; + +# these are methods for EmbedLine objects +*EmbedLine::new= *HeaderLine::new; +sub EmbedLine::flags { $_[0]->{flags} } +sub EmbedLine::return_type { $_[0]->{return_type} } +sub EmbedLine::name { $_[0]->{name} } +sub EmbedLine::args { $_[0]->{args} } # array ref + +1; + +__END__ + +=head1 NAME + +HeaderParser - A minimal header file parser that can be hooked by other porting +scripts. + +=head1 SYNOPSIS + + my $o= HeaderParser->new(); + my $lines= $o->parse_fh($fh); + +=head1 DESCRIPTION + +HeaderParser is a tool to parse C preprocessor header files. The tool +understands the syntax of preprocessor conditions, and is capable of creating +a parse tree of the expressions involved, and normalizing them as well. + +C preprocessor files are a bit tricky to parse properly, especially with a +"line by line" model. There are two issues that must be dealt with: + +=over 4 + +=item Line Continuations + +Any line ending in "\\\n" (that is backslash newline) is considered to be part +of a longer string which continues on the next line. Processors should replace +the "\\\n" typically with a space when converting to a "real" line. + +=item Comments Acting As A Line Continuation + +The rules for header files stipulates that C style comments are stripped +before processing other content, this means that comments can serve as a form +of line continuation: + + #if defined(foo) /* + */ && defined(bar) + +is the same as + + #if defined(foo) && defined(bar) + +This type of comment usage is often overlooked by people writing header file +parsers for the first time. + +=item Indented pre processor directives. + +It is easy to forget that there may be multiple spaces between the "#" +character and the directive. It also easy to forget that there may be spaces +in *front* of the "#" character. Both of these cases are often overlooked. + +=back + +The main idea of this module is to provide a single framework for correctly +parsing the content of our header files in a consistent manner. A secondary +purpose it to make various tasks we want to do easier, such as normalizing +content or preprocessor expressions, or just extracting the real "content" of +the file properly. + +=head2 parse_fh + +This function parses a filehandle into a set of lines. Each line is represented by a hash +based object which contains the following fields: + + bless { + cond => [['defined(a)'],['defined(b)']], + type => "content", + sub_type => undef, + raw => $raw_content_of_line, + line => $normalized_content_of_line, + level => $level, + source => $filename_or_string, + start_line_num => $line_num_for_first_line, + n_lines => $line_num - $line_num_for_first_line, + }, "HeaderLine" + +A "line" in this context is a logical line, and because of line continuations +and comments may contain more than one physical line, and thus more than +one newline, but will always include at least one and will always end with one +(unless there is no newline at the end of the file). Thus + + before /* + this is a comment + */ after \ + and continues + +will be treated as a single logical line even though the content is +spread over four lines. + +=over 4 + +=item cond + +An array of arrays containing the normalized expressions of any C preprocessor +conditional blocks which include the line. Each line has its own copy of the +conditions it was operated on currently, but that may change so dont alter +this data. The inner arrays may contain more than one element. If so then the +line is part of an "#else" or "#elsif" and the clauses should be considered to +be a conjuction when considering "when is this line included", however when +considered as part of an if/elsif/else, each added clause represents the most +recent condition. In the following you can see how: + + before /* cond => [ ] */ + #if A /* cond => [ ['A'] ] */ + do-a /* cond => [ ['A'] ] */ + #elif B /* cond => [ ['!A', 'B'] ] */ + do-b /* cond => [ ['!A', 'B'] ] */ + #else /* cond => [ ['!A', '!B'] ] */ + do-c /* cond => [ ['!A', '!B'] ] */ + # if D /* cond => [ ['!A', '!B'], ['D'] ] */ + do-d /* cond => [ ['!A', '!B'], ['D'] ] */ + # endif /* cond => [ ['!A', '!B'], ['D'] ] */ + #endif /* cond => [ ['!A', '!B'] ] */ + after /* cond => [ ] */ + +So in the above we can see how the three clauses of the if produce +a single "frame" in the cond array, but that frame "grows" and changes +as additional else clauses are added. When an entirely new if block +is started (D) it gets its own block. Each endif includes the clause +it terminates. + +=item type + +This value indicates the type of the line. This may be one of the following: +'content', 'cond', 'define', 'include' and 'error'. Several of the types +have a sub_type. + +=item sub_type + +This value gives more detail on the type of the line where necessary. +Not all types have a subtype. + + Type | Sub Type + --------+---------- + content | text + | include + | define + | error + cond | #if + | #elif + | #else + | #endif + +Note that there are no '#ifdef' or '#elifndef' or similar expressions. All +expressions of that form are normalized into the '#if defined' form to +simplify processing. + +=item raw + +This was the raw original text before HeaderParser performed any modifications +to it. + +=item line + +This is the normalized and modified text after HeaderParser or any callbacks +have processed it. + +=item level + +This is the "indent level" of a line and corresponds to the number of blocks +that the line is within, not including any blocks that might be created by +the line itself. + + before /* level => 0 */ + #if A /* level => 0 */ + do-a /* level => 1 */ + #elif B /* level => 0 */ + do-b /* level => 1 */ + #else /* level => 0 */ + do-c /* level => 1 */ + # if D /* level => 1 */ + do-d /* level => 2 */ + # endif /* level => 1 */ + #endif /* level => 0 */ + after /* level => 0 */ + +=back + +parse_fh() will throw an exception if it encounters a malformed expression +or input it cannot handle. + +=head2 lines_as_str + +This function will return a string representation of the lines it is provided. + +=head2 group_content + +This function will group the text in the file by the conditions which contain +it. This is only useful for files where the content is essentially a list and +where changing the order that lines are output in will not break the resulting +file. + +Each content line will be grouped into a structure of nested if/else blocks +(elif will produce a new nested block) such that the content under the control +of a given set of normalized condition clauses are grouped together in the order +the occurred in the file, such that each combined conditional clause is output +only once. + +This means a file like this: + + #if A + A + #elif K + AK + #else + ZA + #endif + #if B && Q + B + #endif + #if Q && B + BC + #endif + #if A + AD + #endif + #if !A + ZZ + #endif + +Will end up looking roughly like this: + + #if A + A + AD + #else + ZZ + # if K + AK + # else + ZA + # endif + #endif + #if B && Q + B + BC + #endif + +Content at a given block level always goes before conditional clauses +at the same nesting level. + +=head2 HOOKS + +There are severals hooks that are available, C<pre_process_content> and +C<post_process_content>, and C<post_process_grouped_content>. All of these +hooks will be called with the HeaderParser object as the first argument. +The "process_content" callbacks will be called with a line hash as the second +argument, and C<post_process_grouped_content> will be called with an +array of line hashes for the content in that group, so that the array may be +modified or sorted. Callbacks called from inside of C<group_content()> +(that is C<post_process_content> and C<post_process_grouped_content> will be +called with an additional argument containing and array specifying the actual +conditional "path" to the content (which may differ somewhat from the data in +a lines "cond" property). + +These hooks may do what they like, but generally they will modify the +"line" property of the line hash to change the final output returned +by C<lines_as_str()> or C<group_content()>. + +=head2 FORMATTING AND INDENTING + +Header parser tries hard to produce neat and readable output with a consistent +style and form. For example: + + #if defined(FOO) + # define HAS_FOO + # if defined(BAR) + # define HAS_FOO_AND_BAR + # else /* !defined(BAR) */ + # define HAS_FOO_NO_BAR + # endif /* !defined(BAR) */ + #endif /* defined(FOO) */ + +HeaderParser uses two space tab stops for indenting C pre-processor +directives. It puts the spaces between the "#" and the directive. The "#" is +considered "part" of the indent, even though the space comes after it. This +means the first indent level "looks" like one space, and following indents +look like 2. This should match what a sensible editor would do with two space +tab stops. The C<indent_chars()> method can be used to convert an indent level +into a string that contains the appropriate number of spaces to go in between +the "#" and the directive. + +When emitting "#endif", "#elif" and "#else" directives comments will be +emitted also to show the conditions that apply. These comments may be wrapped +to cover multiple lines. Some effort is made to get these comments to line up +visually, but it uses heuristics which may not always produce the best result. + +=cut |