Generated: Tue Feb 2 17:54:58 2010 from tidyword.pl 2007/05/06 5.7 KB.
#!/perl -w
# NAME: tidyword.pl
# AIM: Take a tidied, Word-filtered HTML file and rewrite its attributed
# paragraphs (<p class=MsoNormal>, <p class=MsoPlainText>, ...):
#   - a paragraph with no visible text (a 'blank') is removed and acts as a
#     separator,
#   - consecutive paragraphs with text are merged into one plain <p> ... </p>
#     with <br> between the original paragraphs.
# Every line that is not part of such a paragraph is copied through unchanged.
#
# USAGE: tidyword.pl [input.htm [output.htm]]
#   Both arguments are optional; without them the built-in defaults are used.
#
# KNOWN LIMITATIONS (unchanged from the original design):
#   - Only the first attributed paragraph on a line is handled, and any text
#     on the same line(s) outside the <p ...> ... </p> span is dropped.
#   - A tag that is split across two lines is not recognised.
#
# 06/05/2007 - geoff mclane - geoffmclane.com
use strict;
use warnings;

# logfile.pl is expected to provide open_log(), prt(), mydie() and close_log().
require 'logfile.pl' or die "Unable to load logfile.pl ...\n";

# log file stuff: name the log after the script (base name only when $0 is a
# full DOS path such as C:\dir\tidyword.pl).
my $outfile = 'temp.'.$0.'.txt';
if ($0 =~ /\w{1}:\\.*/) {
    my @tmpsp = split(/\\/, $0);
    $outfile = 'temp.'.($tmpsp[-1]).'.txt';
}
open_log($outfile);
prt( "$0 ... Hello, World ...\n" );

my $def_file = "C:\\Documents and Settings\\Geoff McLane\\My Documents\\Louis\\tempout.htm";
my $in_file  = defined $ARGV[0] ? $ARGV[0] : $def_file;
my $out_file = defined $ARGV[1] ? $ARGV[1] : "tempnew.htm";

# debug switches
my $dbg1 = 0;   # show line collection

# Read the whole input.  A failure here is fatal: carrying on would only
# overwrite the output file with an empty document.
open my $in_fh, '<', $in_file
    or mydie( "ERROR: Unable to open $in_file ... $! ...\n" );
my @lines = <$in_fh>;
close $in_fh;

my $lncnt = scalar @lines;
prt( "Processing $lncnt lines from $in_file ...\n" );

# ---------------------------------------------------------------------------
# Pass 1: locate every <p attr...> ... </p> span.
# Each record in @paras is
#   [ begin_line, begin_offset, end_line, end_offset, content_length, content ]
# where the offsets are character positions of the '<' of the opening tag and
# the '>' of the closing tag.  Slots 4 and 5 are placeholders until pass 2.
# ---------------------------------------------------------------------------
my @paras  = ();
my $inpara = 0;     # true while an attributed <p ...> is open
my $bpln   = 0;     # line of the currently open paragraph
my $bpoff  = 0;     # offset of its '<'

for my $i (0 .. $#lines) {
    my $line  = $lines[$i];
    my $lnlen = length($line);
    for (my $j = 0; $j < $lnlen; $j++) {
        next if (substr($line, $j, 1) ne '<');

        my $boff  = $j;
        my $close = index($line, '>', $j + 1);
        last if ($close < 0);   # tag not closed on this line: ignore the rest

        my $tag = substr($line, $boff, $close - $boff + 1);
        $j = $close;            # resume scanning after the '>'

        if ($tag =~ /^<p\s+(.*)>/i) {
            prt( "$tag [$1] line $i:$boff\n" ) if ($dbg1);
            $bpoff = $boff;
            $bpln  = $i;
            prt( "WARNING: Already in paragraph!\n" ) if ($inpara);
            $inpara = 1;
        } elsif ($tag =~ /^<\/p>/i) {
            prt( "$tag CLOSED line $i:$j para: $bpln:$bpoff to $i:$j\n" ) if ($dbg1);
            # Only record a span when a matching attributed <p ...> was seen;
            # a </p> that closes a plain <p> must not reuse stale coordinates.
            if ($inpara) {
                push(@paras, [$bpln, $bpoff, $i, $j, 1, "content"] );
            }
            $inpara = 0;
        }
    }
}

# ---------------------------------------------------------------------------
# Pass 2: extract the content of each paragraph and decide whether it is blank.
# ---------------------------------------------------------------------------
my $pcnt = scalar @paras;
prt( "Looking at $pcnt paragraphs ...\n" );
for my $para (@paras) {
    my ($bl, $bo, $el, $eo) = @{$para}[0 .. 3];
    prt( "Paragraph: $bl:$bo to $el:$eo\n" ) if ($dbg1);
    my $raw  = getpara( $bl, $bo, $el, $eo, @lines );
    my $text = getcontent( $raw );

    # Blank means: no visible text once inline markup is removed.
    (my $visible = $text) =~ s/<[^>]*>//g;
    if ($visible =~ /\S/) {
        prt( "content: $text [$bl, $bo, $el, $eo]\n" );
        $para->[4] = length($text);
        $para->[5] = $text;
    } else {
        prt( "$raw (BLANK) [$bl, $bo, $el, $eo]\n" );
        $para->[4] = 0;
        $para->[5] = "";
    }
}

# ---------------------------------------------------------------------------
# Pass 3: write the output, substituting the recorded paragraphs.
# ---------------------------------------------------------------------------
open my $out_fh, '>', $out_file
    or mydie( "ERROR: Unable to create $out_file ... $! ...\n" );

my $lastbr = 0;     # true when the previous paragraph ended with <br>
for (my $i = 0; $i < $lncnt; $i++) {
    my $i2 = lineinparas($i);

    if ($i2 >= $pcnt) {
        # Ordinary line: copy through untouched.
        my $line = $lines[$i];
        print {$out_fh} $line;
        chomp $line;
        prt( "$line ($i)\n" );
        next;
    }

    my ($bl, $bo, $el, $eo, $flg, $text) = @{ $paras[$i2] };

    # Does the following paragraph carry text?  If so this one is joined to
    # it with <br>; otherwise the merged paragraph is closed here.
    my $flg2 = 0;
    if (($i2 + 1) < $pcnt) {
        $flg2 = $paras[$i2+1][4];
    }

    my $endp = 'KILL';
    if ($flg) {
        $endp = $flg2 ? '<br>' : '</p>';
    }

    if ($flg) {
        if ($bl == $el) {
            prt( "DEAL WITH LINE $i ...[$bl, $bo, $el, $eo]\n<p>$text$endp ($lastbr)\n" );
        } else {
            prt( "DEAL WITH LINES $i-$el ...[$bl, $bo, $el, $eo]\n$text$endp\n ($lastbr)" );
        }
        # Open a new <p> unless we are continuing after a <br>.
        print {$out_fh} "<p>" if ($lastbr == 0);
        print {$out_fh} $text.$endp;
        $lastbr = ($endp =~ /<br>/) ? 1 : 0;
    } else {
        prt( "KILL LINE $i ...[$bl, $bo, $el, $eo] ($text) ($lastbr)\n" );
    }

    # Skip the remaining source lines of a multi-line paragraph.
    $i = $el if ($el > $bl);
}
close $out_fh
    or mydie( "ERROR: Unable to close $out_file ... $! ...\n" );

close_log($outfile,1);
exit(0);

# lineinparas( $line_index )
# Return the index into @paras of the paragraph that starts on, or spans,
# the given source line.  Returns a value greater than $pcnt when the line
# belongs to no recorded paragraph (callers test "result < $pcnt").
# Relies on @paras being ordered by begin line.
sub lineinparas {
    my ($il) = shift;
    for (my $j1 = 0; $j1 < $pcnt; $j1++) {
        my $pl1 = $paras[$j1][0];
        my $pl2 = $paras[$j1][2];
        return $j1 if ($pl1 == $il);    # paragraph starts on this line
        last       if ($pl1 > $il);     # all later paragraphs start after it
        return $j1 if ($pl2 >= $il);    # here $pl1 < $il: line is inside the span
    }
    return $pcnt + 1;
}

# getcontent( $paragraph_text )
# Given the text of one paragraph, from the '<' of "<p attr...>" through the
# '>' of "</p>", return what lies between the two tags, with every &nbsp;
# entity replaced by a plain space.  Inline markup is kept.
# If the text does not have that shape, fall back to the text between the
# first '>' and the next '<'; if that is empty the input is returned as is.
sub getcontent {
    my ($ln) = shift;
    # [^>]* keeps the match inside the opening tag (a greedy .* would swallow
    # inline markup and lose the content); /s lets the content span lines and
    # /i accepts upper-case tags, matching the scanner.
    if ($ln =~ /^<p\s+[^>]*>(.*)<\/p>/is) {
        $ln = $1;
    } else {
        my $gt = index($ln, '>');
        if ($gt >= 0) {
            my $lt = index($ln, '<', $gt + 1);
            my $inner = ($lt < 0)
                ? substr($ln, $gt + 1)
                : substr($ln, $gt + 1, $lt - $gt - 1);
            $ln = $inner if length($inner);
        }
    }
    # NOTE(review): the published listing showed a space-for-space
    # substitution here, which does nothing; it is taken to be a mangled
    # &nbsp; entity, which is what Word emits for an empty paragraph.
    $ln =~ s/&nbsp;/ /g;
    return $ln;
}

# getpara( $begin_line, $begin_off, $end_line, $end_off, @lines )
# Return the text from character $begin_off of line $begin_line through
# character $end_off (inclusive) of line $end_line, joining any lines in
# between exactly as they appear in @lines.
sub getpara {
    my ( $bpl, $bpo, $epl, $epo, @lns ) = @_;

    if ($bpl == $epl) {
        return substr($lns[$bpl], $bpo, $epo - $bpo + 1);
    }

    my $ln = substr($lns[$bpl], $bpo);
    for my $n ($bpl + 1 .. $epl) {
        $ln .= ($n == $epl) ? substr($lns[$n], 0, $epo + 1) : $lns[$n];
    }
    return $ln;
}

# eof