#!/usr/bin/perl -w use strict; # TODO # - {-1} last column. # # - {50} # - {50-100} # - {50-100:f} array join string by key # - {50-100|f} array bracket string by key # - {50.100.f} scalar concatenate by key, same as {50}.f.{100} # - {(arithmetic-expression)} # - {(a1)-(a2):f} ranges with arithmetic expressions # - fprintf-style padding: # - {X,50,0} # - ,50-100, range # - 1.2 concatenate 1 and 2 # - increment/change/lowercase/uppercase file entries (no, featurecreep) # - regular expression option to set @F. # ? do sth (e.g. bracket) to each in a range of columns? # - warn if selection exceeds column count # - warn if different rows have different column counts. # - enable arbitrary column counts. # - customize splits my %insert_g = (); my %prefix_g = (); my %postfix_g = (); my %counter_g = (); my %increment_g = (); my %column_map_g = (); # e.g. "liver" => 3 my %column_handle_g = (); # e.g. 'a' => "liver" my $arg = ""; my $sep_g = "\t"; my $strip_g = '#'; my $join_g = "\t"; my $count = 0; my $header_num_g = 0; my $header_keep_g = 0; if (!@ARGV || $ARGV[0] eq '--help') { print < + recol --count= [options] Key \\w is shorthand for any character in A-Za-z1-9 [] indicate character classes + indicates a literal '+' = ^= _= := %= all are literal strings Options [a-z]= creates a handle to a constant string, e.g. d=shn :[a-z]= create handle to column header e.g. l=liver +[a-z]= creates a handle that is autoincremented: +c=20 +[a-z]=, same, specify increment: +c=20,5 \\w^= prepend to column, e.g. '2^=>>' \\w_= append to column, e.g. '2_=!' \\w:= surround column with \\w%= surround column with smart - maps {[(< to >)]} --space split input on whitespace, not tabs --sep= split input on --join= join output on --count= iterate times (implies no file/use + specifiers) --header= discard lines, take column names from line . --leader= take column names from line . --strip= strip lines with ^\\s* 1-9A-Z index columns a-z index insert columns . concatenate surrounding columns * all columns after and including - all columns - EOH exit 0; } my %close_bracket = ( '{' => '}' , '(' => ')' , '[' => ']' , '<' => '>' , '}' => '{' , ')' => '(' , ']' => '[' , '>' => '<' ) ; sub closit { my $rev = ""; for my $s (reverse split "", $_[0]) { my $t = $close_bracket{$s}; $t = $s unless defined($t); $rev .= $t; } return $rev; } while ($arg = shift @ARGV) { if ($arg =~ /^([a-z])=(.*)/) { $insert_g{$1} = $2; } elsif ($arg =~ /\:([a-z])=(\S+)/) { $column_handle_g{$1} = $2; } elsif ($arg =~ /^([1-9A-Z])\^=(.*)/) { $prefix_g{$1} = $2; } elsif ($arg =~ /^([1-9a-zA-Z])_=(.*)/) { $postfix_g{$1} = $2; } elsif ($arg =~ /^([1-9a-zA-Z])%=(.*)/) { $prefix_g{$1} = $2; $postfix_g{$1} = closit($2); } elsif ($arg =~ /^([1-9a-zA-Z]):=(.*)/) { $prefix_g{$1} = $2; $postfix_g{$1} = $2; } elsif ($arg =~ /\+([a-z])=(\d+),(\d+)/) { $counter_g{$1} = $2; $increment_g{$1} = $3; } elsif ($arg =~ /\+([a-z])=(\d+)/) { $counter_g{$1} = $2; } elsif ($arg =~ /--space/) { $sep_g = undef; } elsif ($arg =~ /--sep=(.*)/) { $sep_g = $1; } elsif ($arg =~ /--header=(\d+)/) { $header_num_g = $1; } elsif ($arg =~ /--leader=(\d+)/) { $header_num_g = $1; $header_keep_g = 1; } elsif ($arg =~ /--strip=(.*)/) { $strip_g = $1; } elsif ($arg =~ /--join=(.*)/) { $join_g = $1; } elsif ($arg =~ /--count=(\d+)/) { $count=$1; } else { last; } $arg = ""; } if (keys %column_handle_g && !$header_num_g) { die "cannot infer handles without header"; } sub thumbit { my $s = shift; if (length($s) != 1) { } elsif ($s =~ /[A-Z]/) { return ord($s) - ord('A') + 10; } elsif ($s =~ /[1-9]/) { return +$s; } return 0; } my @spec = split "", $arg; my @map_g = (); my $append = 0; my $n_mapped = 0; while (my $s = shift @spec) { if ($s =~ /[a-z]/) { push @map_g, { TYPE => 'KEY', LEFT => undef }; } elsif ($s =~ /[1-9A-Z]/) { my $i = thumbit($s); push @map_g, { TYPE => 'INDEX', LEFT => $i-1 }; } elsif ($s eq '*' && @map_g) { push @map_g, { TYPE => 'TAIL', LEFT => $map_g[-1]{LEFT}+1 }; } elsif ($s eq '-') { my $s2 = shift @spec; if ($s2 && @map_g && $map_g[-1]{TYPE} eq 'INDEX') { } else { die "error in range specification (left)"; } my $s3 = thumbit($s2); if (!$s3 || $s3 < $map_g[-1]{LEFT}+1) { die "error in range specification (right)"; } push @map_g, { TYPE => 'RANGE', LEFT => $map_g[-1]{LEFT}+1, RIGHT => $s3 -1 }; } if ($append && @map_g > 1) { $map_g[-1]{APPEND} = 1; $append = 0; } elsif ($s eq '.') { # nothing pushed. $append = 1; } # fixdoc document this! if ($n_mapped < @map_g) { $map_g[-1]{SHORTHAND} = $s; $n_mapped = @map_g; } } local $, = $join_g; sub mappit { my @F; if (!defined($sep_g)) { @F = split /\s+/, $_[0]; } else { @F = split $sep_g, $_[0]; } my $F_n = @F; my @out = (); my $mm = undef; for my $m (@map_g) { my ($o, $pre, $post) = ("", "", ""); my ($sh) = map { $m->{$_} } qw ( SHORTHAND ); my $scalar = 0; $pre = $prefix_g{$sh} if defined($prefix_g{$sh}); $post = $postfix_g{$sh} if defined($postfix_g{$sh}); if ($m->{TYPE} eq 'KEY') { if (defined($counter_g{$sh})) { $o = $counter_g{$sh}; my $inc = $increment_g{$sh}; $inc = 1 unless defined($inc); $counter_g{$sh} += $inc; } elsif (defined($column_handle_g{$sh})) { my $i = $column_map_g{$column_handle_g{$sh}}; die "no column for handle $sh => $column_handle_g{$sh}" unless defined($i); if ($i > @F) { warn "$i exceeds column count $F_n"; } else { $o = $F[$i]; } } else { $o = $insert_g{$sh}; } $scalar = 1; } elsif ($m->{TYPE} eq 'INDEX') { my $i = $m->{LEFT}; if ($i > @F) { warn "$i exceeds column count $F_n"; } else { $o = $F[$i]; } $scalar = 1; } # ^ # fixme: scalars above array below. # _ elsif ($m->{TYPE} eq 'RANGE') { my ($left, $right) = ($m->{LEFT}, $m->{RIGHT}); $right = $#F if $right > $#F; $left = 0 if $left < 0; push @out, @F[$left..$right] if $left <= $right; } elsif ($m->{TYPE} eq 'TAIL' && $m->{LEFT} < @F) { push @out, @F[$m->{LEFT}..@F-1]; } if ($scalar) { if (defined($m->{APPEND})) { $out[-1] .= "$pre$o$post"; } else { push @out, "$pre$o$post"; } } } print @out; print "\n"; } if ($count) { while (--$count >= 0) { mappit(""); } } else { my @header = (); while ($header_num_g) { my $line = <>; push @header, $line if $header_keep_g; chomp $line; if ($header_num_g == 1) { my @fields; if ($line =~ /\t/) { @fields = split "\t", $line; } else { @fields = split /\s+/, $line; } for (my $i=0;$i<@fields;$i++) { $column_map_g{$fields[$i]} = $i; } } $header_num_g--; } while ($_ = shift @header) { chomp; next if $strip_g && /^\s*$strip_g/; mappit($_); } while (<>) { chomp; next if $strip_g && /^\s*$strip_g/; mappit($_); } }