#!/usr/bin/env perl
#
# Clean one or more text files of stealth whitespace.
# WARNING: this can be a highly destructive operation.  Use with caution.
#
# Usage: <program> [-width #] files...
#
# For every file named on the command line:
#   - trailing spaces, tabs and carriage returns are removed from each line;
#   - spaces that precede a tab are removed or replaced by tabs;
#   - blank lines at the end of the file are discarded;
#   - lines wider than the -width limit (default 79, 0 disables the check)
#     are reported on STDERR.
# Arguments that are not plain files, and files containing a NUL byte, are
# reported on STDERR and skipped.  Files are rewritten in place.
#

use strict;
use warnings;
use bytes;
use File::Basename;

# Clean up space-tab sequences, either by removing spaces or
# replacing them with tabs.
#
# Takes one string (normally a single line) and returns the cleaned copy.
# Spaces are held back until the next character is known: in front of a tab
# they are folded into the tab stops, in front of anything else (or at the
# end of the string) they are emitted unchanged.
sub clean_space_tabs($)
{
    no bytes;                   # Tab alignment depends on characters

    my ($li) = @_;
    my $lo   = '';
    my $pos  = 0;               # Column reached by what is already in $lo
    my $nsp  = 0;               # Spaces seen but not yet emitted

    for my $c (split(//, $li)) {
        if ($c eq "\t") {
            # Column of the tab stop the original spaces + tab reached,
            # and the number of tabs needed to get there from $pos.
            my $npos = ($pos + $nsp + 8) & ~7;
            my $ntab = ($npos >> 3) - ($pos >> 3);
            $lo .= "\t" x $ntab;
            $pos = $npos;
            $nsp = 0;
        } elsif ($c eq " ") {
            $nsp++;
        } else {
            $lo .= " " x $nsp;
            $lo .= $c;
            if ($c eq "\n" || $c eq "\r") {
                $pos = 0;       # A line terminator restarts the column count
            } else {
                $pos += $nsp + 1;
            }
            $nsp = 0;
        }
    }

    $lo .= " " x $nsp;
    return $lo;
}

# Compute the visual width of a string
#
# Tabs advance to the next multiple of 8 columns; every other character
# except "\n" counts as one column.  For a string holding several lines the
# width of the widest line is returned.
sub strwidth($)
{
    no bytes;                   # Tab alignment depends on characters

    my ($li) = @_;
    my $pos  = 0;               # Column on the current line
    my $mlen = 0;               # Widest line seen so far

    for my $c (split(//, $li)) {
        if ($c eq "\t") {
            $pos = ($pos + 8) & ~7;
        } elsif ($c eq "\n") {
            $mlen = $pos if ($pos > $mlen);
            $pos = 0;
        } else {
            $pos++;
        }
    }

    $mlen = $pos if ($pos > $mlen);
    return $mlen;
}

# Print the usage message for program $name on STDERR and exit with status 1.
sub usage
{
    my ($name) = @_;

    print STDERR "Usage: $name [-width #] files...\n";
    exit 1;
}

# Clean a single file in place.
#
#   $f         - path of the file to clean
#   $max_width - report lines wider than this many columns (0: no check)
#   $name      - program name used as a prefix in messages
#
# Returns 1 when the file was rewritten and 0 when it was left untouched
# (already clean, not a plain file, unreadable, or binary).  Dies when the
# modified content cannot be written back completely.
sub process_file
{
    my ($f, $max_width, $name) = @_;

    if (! -f $f) {
        print STDERR "$f: not a file\n";
        return 0;
    }

    my $fh;
    if (!open($fh, '+<', $f)) {
        print STDERR "$name: Cannot open file: $f: $!\n";
        return 0;
    }

    binmode $fh;

    # First, verify that it is not a binary file; consider any file
    # with a zero byte to be a binary file.  Is there any better, or
    # additional, heuristic that should be applied?
    my $is_binary = 0;
    my $data;

    while (read($fh, $data, 65536)) {
        if ($data =~ /\0/) {
            $is_binary = 1;
            last;
        }
    }

    if ($is_binary) {
        print STDERR "$name: $f: binary file\n";
        close($fh);
        return 0;
    }

    if (!seek($fh, 0, 0)) {
        print STDERR "$name: Cannot rewind file: $f: $!\n";
        close($fh);
        return 0;
    }

    my @lines   = ();           # Cleaned lines that will be kept
    my @blanks  = ();           # Blank lines not yet known to be kept
    my $changed = 0;            # True once the output differs from the input
    my $lineno  = 0;

    while (defined(my $line = <$fh>)) {
        $lineno++;

        my $orig = $line;
        $line =~ s/[ \t\r]*$//; # Remove trailing spaces
        $line = clean_space_tabs($line);

        # Compare content, not size: turning " \t" into "\t\t" keeps the
        # byte count but still changes the line.
        $changed = 1 if ($line ne $orig);

        if ($line eq "\n") {
            # Hold blank lines back until a non-blank line follows them
            push(@blanks, $line);
        } else {
            push(@lines, @blanks, $line);
            @blanks = ();
        }

        my $l_width = strwidth($line);
        if ($max_width && $l_width > $max_width) {
            print STDERR
                "$f:$lineno: line exceeds $max_width characters ($l_width)\n";
        }
    }

    # Any blanks at the end of the file are discarded
    $changed = 1 if (@blanks);

    if ($changed) {
        # Only write to the file if changed
        seek($fh, 0, 0)
            or die "$name: Failed to rewind modified file: $f: $!\n";
        print {$fh} @lines
            or die "$name: Failed to write modified file: $f: $!\n";

        # tell() reports failure as -1, not as undef
        my $where = tell($fh);
        if ($where < 0 || !truncate($fh, $where)) {
            die "$name: Failed to truncate modified file: $f: $!\n";
        }
    }

    # Buffered write errors only surface when the handle is closed
    close($fh)
        or die "$name: Failed to close file: $f: $!\n";

    return $changed;
}

# Parse the command line given in @_ and clean every file named on it.
sub main
{
    my @args = @_;
    my $name = basename($0);

    # Default options
    my $max_width = 79;

    my @files = ();

    while (defined(my $arg = shift(@args))) {
        if ($arg =~ /^-/) {
            if ($arg eq '-width' || $arg eq '-w') {
                my $width = shift(@args);
                # A missing or non-numeric width is a usage error
                if (!defined($width) || $width !~ /\A\d+\z/) {
                    usage($name);
                }
                $max_width = $width + 0;
            } else {
                usage($name);
            }
        } else {
            push(@files, $arg);
        }
    }

    foreach my $f (@files) {
        print STDERR "$name: $f\n";
        process_file($f, $max_width, $name);
    }

    return;
}

# Run only when executed as a script, so the subroutines above can be loaded
# from another file without processing that file's @ARGV.
main(@ARGV) unless caller;