#!/usr/bin/perl -w
#
# Clean text files of stealth whitespace.
# WARNING: this can be a highly destructive operation.  Use with caution.
#
# Usage: cleanfile [-width #] files...
#
# Each named file is rewritten in place (only if something changed) with:
#   - trailing spaces, tabs and carriage returns removed from every line;
#   - spaces that precede a tab removed or folded into tabs;
#   - blank lines at the end of the file dropped.
# Lines wider than the -width limit (default 79, 0 disables the check) are
# reported on STDERR but not modified.  Arguments that are not plain files,
# and files containing a zero byte, are reported and skipped.
#

use strict;
use warnings;
use bytes;
use File::Basename;

# Default options
my $max_width = 79;             # 0 disables the line-width check

# Clean up space-tab sequences, either by removing spaces or
# replacing them with tabs.
#
# Takes one string and returns the cleaned copy.  Runs of spaces that are
# not followed by a tab (including trailing ones) are kept unchanged.
sub clean_space_tabs($)
{
    no bytes;                   # Tab alignment depends on characters

    my ($li) = @_;
    my $lo  = '';
    my $pos = 0;                # column reached by what has been emitted
    my $nsp = 0;                # spaces seen but not yet emitted

    for my $c (split //, $li) {
        if ($c eq "\t") {
            # Advance to the tab stop the original text would have reached
            # (pending spaces included) and emit only the tabs needed to
            # get there; the pending spaces are dropped.
            my $npos = ($pos + $nsp + 8) & ~7;
            my $ntab = ($npos >> 3) - ($pos >> 3);
            $lo .= "\t" x $ntab;
            $pos = $npos;
            $nsp = 0;
        } elsif ($c eq "\n" || $c eq "\r") {
            # End of line: flush pending spaces and restart the column count.
            $lo .= ' ' x $nsp;
            $nsp = 0;
            $lo .= $c;
            $pos = 0;
        } elsif ($c eq ' ') {
            $nsp++;             # defer: a following tab may absorb it
        } else {
            $lo .= ' ' x $nsp;
            $pos += $nsp + 1;
            $nsp = 0;
            $lo .= $c;
        }
    }
    $lo .= ' ' x $nsp;
    return $lo;
}

# Compute the visual width of a string.
#
# Tabs advance to the next multiple of 8, a newline starts a new line, and
# every other character counts as one column.  Returns the width of the
# widest line in the string.
sub strwidth($)
{
    no bytes;                   # Tab alignment depends on characters

    my ($li) = @_;
    my $pos  = 0;
    my $mlen = 0;

    for my $c (split //, $li) {
        if ($c eq "\t") {
            $pos = ($pos + 8) & ~7;
        } elsif ($c eq "\n") {
            $mlen = $pos if ($pos > $mlen);
            $pos = 0;
        } else {
            $pos++;
        }
    }

    $mlen = $pos if ($pos > $mlen);
    return $mlen;
}

my $name = basename($0);

my @files = ();

while (defined(my $arg = shift(@ARGV))) {
    if ($arg =~ /^-/) {
        if ($arg eq '-width' || $arg eq '-w') {
            my $width = shift(@ARGV);
            # A missing or non-numeric value used to turn silently into 0,
            # which disables the check (and could swallow a file name).
            if (!defined($width) || $width !~ /^\d+$/) {
                print STDERR "Usage: $name [-width #] files...\n";
                exit 1;
            }
            $max_width = $width + 0;
        } else {
            print STDERR "Usage: $name [-width #] files...\n";
            exit 1;
        }
    } else {
        push(@files, $arg);
    }
}

FILE: foreach my $f (@files) {
    print STDERR "$name: $f\n";

    if (! -f $f) {
        print STDERR "$f: not a file\n";
        next FILE;
    }

    my $fh;
    if (!open($fh, '+<', $f)) {
        print STDERR "$name: Cannot open file: $f: $!\n";
        next FILE;
    }

    binmode $fh;

    # First, verify that it is not a binary file; consider any file
    # with a zero byte to be a binary file.  Is there any better, or
    # additional, heuristic that should be applied?
    my $is_binary = 0;
    my $data;

    # read() yields 0 at end of file and undef on error; both end the scan.
    while (read($fh, $data, 65536)) {
        if ($data =~ /\0/) {
            $is_binary = 1;
            last;
        }
    }

    if ($is_binary) {
        print STDERR "$name: $f: binary file\n";
        close($fh);
        next FILE;
    }

    if (!seek($fh, 0, 0)) {
        print STDERR "$name: Cannot rewind file: $f: $!\n";
        close($fh);
        next FILE;
    }

    my $in_bytes    = 0;
    my $out_bytes   = 0;
    my $blank_bytes = 0;

    my @blanks = ();            # blank lines not yet known to be interior
    my @lines  = ();            # cleaned output
    my $lineno = 0;

    while (defined(my $line = <$fh>)) {
        $lineno++;
        $in_bytes += length($line);
        $line =~ s/[ \t\r]*$//;         # Remove trailing spaces
        $line = clean_space_tabs($line);

        # An empty string can only be a final line that held nothing but
        # whitespace and no newline; treat it as blank so that it does not
        # keep the blank lines before it alive.
        if ($line eq "\n" || $line eq '') {
            push(@blanks, $line);
            $blank_bytes += length($line);
        } else {
            push(@lines, @blanks);
            $out_bytes += $blank_bytes;
            push(@lines, $line);
            $out_bytes += length($line);
            @blanks = ();
            $blank_bytes = 0;
        }

        my $l_width = strwidth($line);
        if ($max_width && $l_width > $max_width) {
            print STDERR
                "$f:$lineno: line exceeds $max_width characters ($l_width)\n";
        }
    }

    # Any blanks at the end of the file are discarded

    if ($in_bytes != $out_bytes) {
        # Only write to the file if changed.  Every cleanup step can only
        # remove bytes, so an unchanged byte count means unchanged content.
        seek($fh, 0, 0)
            or die "$name: Failed to rewind modified file: $f: $!\n";
        print {$fh} @lines
            or die "$name: Failed to write modified file: $f: $!\n";

        # tell() reports failure as -1, not undef.
        my $where = tell($fh);
        if (!defined($where) || $where < 0 || !truncate($fh, $where)) {
            die "$name: Failed to truncate modified file: $f: $!\n";
        }
    }

    # Buffered write errors surface at close time.
    close($fh)
        or die "$name: Failed to close file: $f: $!\n";
}