#!/perl -w
# NAME: chkdupes.pl
# AIM: Read a folder, and subfolders, and check for any duplicate file names
# This is so they can all be put in one folder, if possible
# 07/07/2013 - Tried to reduce the 'duplications' but seem to have failed
# 01/02/2013 - Hopefully fix to run in linux
# 19/11/2011 - Allow first item of two to be a single file
# 18/11/2011 - Fix bug if two folder given - src and dest
# 15/08/2011 - Update...
# 22/07/2008 geoff mclane http://geoffair.net/mperl
use strict;
use warnings;
use File::stat;
use File::Basename; # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] )
use Cwd;

my $os = $^O;
my $perl_dir = '/home/geoff/bin';
my $PATH_SEP = '/';
my $temp_dir = '/tmp';
# A bare /win/i test also matches 'darwin' (macOS), so exclude that explicitly;
# MSWin32 and cygwin still select the Windows settings.
if (($os =~ /win/i) && ($os !~ /darwin/i)) {
    $perl_dir = 'C:\GTools\perl';
    $temp_dir = $perl_dir;
    $PATH_SEP = "\\";
}
unshift(@INC, $perl_dir);
require 'lib_utils.pl' or die "Unable to load 'lib_utils.pl' Check paths in \@INC...\n";

# log file stuff
our ($LF);  # NOTE(review): presumably the log handle shared with lib_utils.pl - confirm
my $pgmname = $0;
$pgmname =~ s{^.*[\\/]}{};  # keep only the script name, whichever separator was used
# Build the log path with the platform separator, in the temp directory.
# On Windows $temp_dir is $perl_dir, so the path is the same as it always was.
my $outfile = $temp_dir.$PATH_SEP."temp.$pgmname.txt";
open_log($outfile);

# user variables
my $VERS = "0.0.4 2013-07-07";
#my $VERS = "0.0.3 2011-11-19";
my $load_log = 0;
my $in_file = '';
my $show_ext = 0;
my $max_name_wid = 65;
my $debug_on = 0;
my $def_file = 'def_file';
my $repos = ".git;.svn;.hg;CVS";
my $do_all_ext = 0;
my $headers_only = 1;
my $def_folder = "C:\\Users\\Public\\SAVES\\peru\\My Pictures\\Carla"; # DEFAULT, if NO command input
my $in_folder = "";
my @folder_list = ();
my @file_list = ();
my %exthash = ();
my @dup_list = ();
my @exl_dirs = ();
my @excludes_files = ();
my $verbose = 0;

### program variables
my @warnings = ();
my $cwd = cwd();

### forward
sub process_folder($);
sub scan_dir($$$);

# Verbosity predicates: true when $verbose is at least the given level.
sub VERB1() { return ($verbose >= 1); }
sub VERB2() { return ($verbose >= 2); }
sub VERB5() { return ($verbose >= 5); }
sub VERB9() { return ($verbose >= 9); }

# Print every message collected by prtw(), if there are any.
# $val - the exit value passed by pgm_exit(); not used here.
sub show_warnings($) {
    my ($val) = @_;
    if (@warnings) {
        prt( "\nGot ".scalar @warnings." WARNINGS...\n" );
        foreach my $itm (@warnings) {
            prt("$itm\n");
        }
        prt("\n");
    } else {
        ###prt( "\nNo warnings issued.\n\n" );
    }
}

# Print an optional final message, show collected warnings, close the log
# and exit the program with $val.
sub pgm_exit($$) {
    my ($val,$msg) = @_;
    if (length($msg)) {
        $msg .= "\n" if (!($msg =~ /\n$/));
        prt($msg);
    }
    show_warnings($val);
    close_log($outfile,$load_log);
    exit($val);
}

# Print a warning now, and remember it for the summary in show_warnings().
sub prtw($) {
    my ($tx) = shift;
    $tx =~ s/\n$//;
    prt("$tx\n");
    push(@warnings,$tx);
}

# Return 1 if $dir exactly matches (case-sensitive) an entry of @exl_dirs.
sub in_exclude_dirs($) {
    my $dir = shift;
    foreach my $xcl (@exl_dirs) {
        return 1 if ($dir eq $xcl);
    }
    return 0;
}

# Return 1 if $file matches, ignoring case, an entry of @excludes_files.
sub is_excluded_file($) {
    my $file = shift;
    my $lcfl = lc($file);
    foreach my $tfl (@excludes_files) {
        return 1 if ($lcfl eq lc($tfl));
    }
    return 0;
}

# Decide whether a file takes part in the single folder check.
# Everything passes when $do_all_ext is set; otherwise header files pass,
# and C source files pass only when $headers_only is off.
# is_h_source() and is_c_source() are not defined in this file
# (presumably they come from lib_utils.pl - confirm).
sub is_right_type($) {
    my $file = shift;
    return 1 if ($do_all_ext);
    return 1 if (is_h_source($file));
    return 1 if (is_c_source($file) && !$headers_only);
    return 0;
}

# Recursively collect the files under $inf into @file_list, skipping
# excluded directories, excluded file names and unwanted file types.
# Each record is [full path, lower-cased name, 0, 0, \stat object, 0].
# Also counts each lower-cased extension in %exthash.
# A directory that can not be opened is reported and skipped.
sub process_folder($) {
    my ($inf) = shift;
    my $dh;
    if (!opendir($dh, $inf)) {
        prt( "ERROR: Can NOT open $inf ... $! ... \n" );
        return;
    }
    my @files = readdir($dh);
    closedir($dh);
    my @subdirs = ();
    foreach my $file (@files) {
        next if (($file eq '.')||($file eq '..'));
        my $ff = $inf . $PATH_SEP . $file;
        if (-d $ff) {
            push(@subdirs,$ff) if (!in_exclude_dirs($file));
            next;
        }
        next if (is_excluded_file($file));
        next if (!is_right_type($file));
        my ($nm,$dir,$ext) = fileparse( $ff, qr/\.[^.]*/ );
        my $sb = stat($ff);
        $ext = lc($ext);
        my $name = lc($nm).$ext;
        #                   0    1      2  3  4     5
        push( @file_list, [$ff, $name, 0, 0, \$sb, 0]);
        $exthash{$ext}++;
    }
    # descend only after this directory's handle has been closed
    foreach my $fil (@subdirs) {
        process_folder($fil);
    }
}

# Return $text padded on the right with spaces to at least $wid characters.
sub _pad_right {
    my ($text,$wid) = @_;
    return sprintf("%-*s", $wid, $text);
}

# Build one "path size time" report line for $path, the path padded to $wid.
# If the file can not be stat'ed a warning is recorded and '?' is shown
# for the size and the time.
sub _dupe_detail_line {
    my ($path,$wid) = @_;
    my ($nn,$tm) = ('?','?');
    my $sb = stat($path);
    if ($sb) {
        $nn = get_nn($sb->size);
        $tm = lu_get_YYYYMMDD_hhmmss($sb->mtime);
    } else {
        prtw("WARNING: Unable to stat [$path] ... $!\n");
    }
    return _pad_right($path,$wid).' '.sprintf("%12s",$nn).' '.$tm."\n";
}

# Scan the single folder in $folder_list[0] and report files that share a
# (lower-cased) name.  Files are grouped by name, so a name is reported once
# however many files carry it, and the counts are of distinct names.
#  -v1 prints 'first == other' for each other file of a group,
#  -v2 prints the first file of each group with size and time,
#  -v5 prints every file of each group with size and time.
sub do_one_folder() {
    # get all the files...
    process_folder($folder_list[0]);
    my $incnt = scalar @file_list;
    prt( "Got $incnt file items to check ...\n" );

    # group the record indexes by name, remembering first-seen order
    my %members_of = ();
    my @name_order = ();
    for my $i (0 .. $incnt - 1) {
        my $rec = $file_list[$i];
        $rec->[2] = 0; # clear all counts
        $rec->[3] = 0; # clear all matches
        $rec->[5] = 0; # clear all SHOWN
        my $name = $rec->[1];
        push(@name_order,$name) if (!exists $members_of{$name});
        push(@{ $members_of{$name} }, $i);
    }
    my @dupe_names = grep { scalar(@{ $members_of{$_} }) > 1 } @name_order;
    my $dup_cnt = scalar @dupe_names;
    prt( "Got $dup_cnt duplicate names...\n" );
    if (!VERB1()) {
        prt("Add -v1 to produces a list. v5 to show details.\n");
    }

    # column width = longest duplicated path, limited to $max_name_wid
    my $min = 0;
    foreach my $name (@dupe_names) {
        foreach my $i (@{ $members_of{$name} }) {
            my $len = length($file_list[$i][0]);
            $min = $len if ($len > $min);
        }
    }
    $min = $max_name_wid if ($min > $max_name_wid);

    foreach my $name (@dupe_names) {
        my @idx = @{ $members_of{$name} };
        my $first = $idx[0];
        foreach my $i (@idx) {
            $file_list[$i][2] = scalar(@idx) - 1;                  # other files with this name
            $file_list[$i][3] = ($i == $first) ? $idx[1] : $first; # index of a matching file
            $file_list[$i][5] = 1;                                 # set SHOWN
        }
        my @paths = map { $file_list[$_][0] } @idx;
        if (VERB5()) {
            prt( _dupe_detail_line($_,$min) ) foreach (@paths);
        } elsif (VERB2()) {
            prt( _dupe_detail_line($paths[0],$min) );
        } elsif (VERB1()) {
            my $ff1 = _pad_right($paths[0],$min);
            foreach my $other (@paths[1 .. $#paths]) {
                my $ff2 = _pad_right($other,$min);
                prt( "$ff1 == $ff2\n" );
            }
        }
    }
    prt( "Done $dup_cnt duplicate names...\n" ) if (VERB1());
}

# Recursively collect every file under $inf into the array referenced by $ra.
# Each record is [file name, full path, lower-cased name, 0].
# $lev is the recursion depth; the 'Scanning' message is shown at depth 0 only.
# Also counts each lower-cased extension in %exthash.
# Unlike process_folder() no exclusions or type filters are applied, and a
# directory that can not be opened ends the program.
sub scan_dir($$$) {
    my ($ra,$inf,$lev) = @_;
    pgm_exit(1,"ERROR: scan_dir: Passed null value!\n") if (!defined($inf) || (length($inf) == 0));
    prt("Scanning [$inf]...\n") if ($lev == 0);
    my $dh;
    if (!opendir($dh, $inf)) {
        pgm_exit(1,"ERROR: Can NOT open [$inf] ... $! ... \n" );
    }
    my @files = readdir($dh);
    closedir($dh);
    # use the platform separator - a hard coded backslash breaks paths on unix
    $inf .= $PATH_SEP if ( !($inf =~ /(\\|\/)$/) );
    my @subdirs = ();
    foreach my $file (@files) {
        next if (($file eq '.')||($file eq '..'));
        my $ff = $inf.$file;
        if (-d $ff) {
            push(@subdirs,$ff);
            next;
        }
        my ($nm,$dir,$ext) = fileparse( $ff, qr/\.[^.]*/ );
        $ext = lc($ext);
        my $name = lc($nm).$ext;
        #               0      1    2      3
        push( @{$ra}, [$file, $ff, $name, 0] );
        $exthash{$ext}++;
    }
    foreach my $fil (@subdirs) {
        scan_dir($ra,$fil,$lev+1);
    }
}

# For each record of list 1 report whether its lower-cased name also occurs
# in list 2.  Both arguments are array refs of scan_dir() style records.
# A name found in list 2 is reported through prtw(), so it is repeated in
# the warnings summary; a name not found is just printed.
sub compare_lists($$) {
    my ($ra1,$ra2) = @_; # = \@arr1,\@arr2
    my $cnt1 = scalar @{$ra1};
    my $cnt2 = scalar @{$ra2};
    prt("Comparing list 1 = $cnt1, with list 2 = $cnt2...\n");
    # index the names of list 2 once, instead of rescanning it for every file
    my %in_list2 = ();
    $in_list2{ $_->[2] } = 1 foreach (@{$ra2});
    # column width = longest file name in list 1
    my $min = 0;
    foreach my $rec (@{$ra1}) {
        my $len = length($rec->[0]);
        $min = $len if ($len > $min);
    }
    foreach my $rec (@{$ra1}) {
        my $fil1 = _pad_right($rec->[0],$min);
        if (exists $in_list2{ $rec->[2] }) {
            prtw("File $fil1 is DUPLICATED in list 2!\n");
        } else {
            prt("File $fil1 NOT found in list 2!\n");
        }
    }
}

# Print how many files were seen for each extension, in sorted order.
sub show_extensions() {
    my $cnt = scalar keys(%exthash);
    prt("Got list of $cnt entensions...\n");
    foreach my $key (sort keys %exthash) {
        prt( "Extension $key occurs ".$exthash{$key}." times ...\n" );
    }
    prt("Done list of $cnt entensions...\n");
}

#########################################
### MAIN ###
parse_args(@ARGV);
###prt( "$0 ... Processing $in_folder ...\n" );
if (scalar @folder_list == 1) {
    do_one_folder(); # check folder for duplicate names???
} elsif (scalar @folder_list == 2) {
    my (@arr1,@arr2);
    my $f1 = $folder_list[0];
    my $f2 = $folder_list[1];
    if (-f $f1) {
        # a single file: build the one record scan_dir() would have made
        #               0      1    2      3
        #push( @{$ra}, [$file, $ff, $name, 0] );
        my ($nm,$dr) = fileparse($f1);
        push(@arr1, [$nm, $f1, lc($nm),0]);
    } elsif (-d $f1) {
        scan_dir(\@arr1,$f1,0);
    } else {
        pgm_exit(1,"First item is neither file, nor folder [$f1]!\n");
    }
    scan_dir(\@arr2,$f2,0);
    compare_lists(\@arr1,\@arr2);
}
show_extensions() if ($show_ext);
pgm_exit(0,"");
################################

# Print the usage text.
sub give_help {
    prt("$pgmname: version $VERS\n");
    prt("Usage: $pgmname [options] in-folder/in-file [in-folder2]\n");
    prt("Options:\n");
    prt(" --help (-h or -?) = This help, and exit 0.\n");
    prt(" --load (-l) = Load log at end.\n");
    prt(" --show (-s) = Also show extension list.\n");
    prt(" --verb (-v) = Bump verbosity.\n");
    prt(" --dir (-d) = Exclude this directory.\n");
    prt(" --xclude (-x) = Exclude duplicates of these names.\n");
    prt(" --XCLUDE (-X) = Exclude repos folders (def=$repos)\n");
    prt("If just ONE directory given, then it will be checked for duplicate files.\n");
    prt("If TWO folders given, they will be compared, and duplicate files reported.\n");
}

# Exit with an error unless something follows the option.
# Called with the remaining argument list, so the first item is the option
# itself and the rest are the arguments after it.
sub need_arg {
    my ($arg,@av) = @_;
    pgm_exit(1,"ERROR: [$arg] must have following argument!\n") if (!@av);
}

# Parse the command line.  Options start with one or more '-'; anything else
# must be an existing file or folder and is added to @folder_list (2 maximum).
# Exits with an error on an unknown option, a missing input, or no input.
sub parse_args {
    my (@av) = @_;
    my ($arg,$sarg,$cnt);
    while (@av) {
        $arg = $av[0];
        if ($arg =~ /^-/) {
            # strip all the leading dashes
            $sarg = substr($arg,1);
            $sarg = substr($sarg,1) while ($sarg =~ /^-/);
            if (($sarg =~ /^h/i)||($sarg eq '?')) {
                give_help();
                pgm_exit(0,"Help exit(0)");
            } elsif ($sarg =~ /^l/) {
                if ($sarg =~ /^ll/) {
                    $load_log = 2;
                } else {
                    $load_log = 1;
                }
                prt("Set to load log $load_log\n") if (VERB1());
            } elsif ($sarg =~ /^s/) {
                $show_ext = 1;
            } elsif ($sarg =~ /^v/i) {
                if ($sarg =~ /^v(\d+)$/) {
                    $verbose = $1;  # -vN sets the level
                } else {
                    # each leading 'v' bumps the level by one
                    while ($sarg =~ /^v/) {
                        $verbose++;
                        $sarg = substr($sarg,1);
                    }
                }
                prt("Set verbosity to $verbose\n") if (VERB1());
            } elsif ($sarg =~ /^d/) {
                need_arg(@av);
                shift @av;
                $sarg = $av[0];
                push(@exl_dirs,split(";",$sarg));
                prt("Exclude directory $sarg\n") if (VERB1());
            } elsif ($sarg =~ /^x/) {
                need_arg(@av);
                shift @av;
                $sarg = $av[0];
                push(@excludes_files,split(';',$sarg));
                prt("Exclude files $sarg\n") if (VERB1());
            } elsif ($sarg =~ /^X/) {
                push(@exl_dirs,split(";",$repos));
                prt("Exclude repo directories $repos\n") if (VERB1());
            } else {
                pgm_exit(1,"ERROR: Invalid argument [$arg]! Try -?\n");
            }
        } else {
            $in_file = $arg;
            if ((-d $in_file)||(-f $in_file)) {
                push(@folder_list,$in_file);
                $cnt = scalar @folder_list;
                if ($cnt > 2) {
                    pgm_exit(1,"ERROR: Can only give two folders. folder [$in_file] is 3rd!\n");
                }
                prt("Set input $cnt to [$in_file]\n");
            } else {
                pgm_exit(1,"ERROR: Can NOT locate folder [$in_file]!\n");
            }
        }
        shift @av;
    }

    if ((length($in_file) == 0) && $debug_on) {
        $in_file = $def_file;
    }
    if (length($in_file) == 0) {
        pgm_exit(1,"ERROR: No input directory found in command!\n");
    }
    #if (! -f $in_file) {
    #    pgm_exit(1,"ERROR: Unable to find in file [$in_file]! Check name, location...\n");
    #}
}

# eof - chkdupes.pl