Perl Script to Find and Delete Duplicate Files for Backup Purposes

A useful script to remove duplicate files, especially designed for saving space for backup purposes.

What the program does: it finds all files in a Source Directory and puts them into an array, then finds all files in a Target Directory and puts those into another array (edit the two directories accordingly). The arrays are compared for matching file names. Files whose names match are then compared for identical content (which implies identical size). Matches in the Target are deleted. Only files with exactly matching names and content will be deleted.

Special note: This is NOT a good script for simple duplicate file detection. Since the goal is to save space for backup purposes, only exact matches are deleted in the Target (we want to keep any edited files since the last backup). Therefore, as an example, images whose EXIF (tags) have been edited will evade detection as duplicates.

#!/usr/bin/perl

# Copyright 2014 - Troy Hartenstine http://www.hartenstine.com/
#
# http://www.seleads.com/perl-script-find-delete-duplicate-files/
#
# This script is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation; either version 3 of the License, or (at your option) any later
# version.
#
# This script is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details at http://www.gnu.org/licenses/.

use warnings;
	no warnings 'uninitialized';
use strict;
use Term::ANSIColor;
use Scalar::MoreUtils qw(empty);
use File::Compare;
use File::Find;
use File::stat;

$|=1; #Autoflush ON!

# SOURCE (edit)
my $folder_source = '/media/';

# TARGET (edit)
my $folder_target = '/home/';

# Collect every eligible file below $folder_source.
# Each element of @source_array is an array ref: [ bare file name, full path ].
my @source_array;

# Number of source files collected. Starts at 0 so the summary prints "0"
# instead of an empty string when the source tree has no eligible files.
my $count1 = 0;

find ( \&wanted, $folder_source );

# File::Find callback for the source tree.
# For each visited entry File::Find sets $_ to the bare name and
# $File::Find::name to the full path. Directories and entries whose name
# starts with a dot are skipped; everything else is recorded.
sub wanted {
	return if -d $_;
	return if $_ =~ m{^\.};
	push @source_array, [ $_, $File::Find::name ];
	$count1++;
	return;
}

# Report, in green, how many minutes have elapsed since the script started
# ($^T) and how many source files were collected.
my $runtime = (time - $^T) / 60;
print color('green'),
	"Source Directory took $runtime minutes to complete\n",
	"$count1 :: Source Files",
	color('reset'),
	"\n";

####### UNWANTED ############
# Collect every eligible file below $folder_target.
# Each element of @target_array is an array ref: [ bare file name, full path ].

# Number of target files collected. Starts at 0 so the summary prints "0"
# instead of an empty string when the target tree has no eligible files.
my $count2 = 0;
my @target_array;
use File::Find;
find ( \&unwanted, $folder_target);

# File::Find callback for the target tree.
# For each visited entry File::Find sets $_ to the bare name and
# $File::Find::name to the full path. Directories and entries whose name
# starts with a dot are skipped; everything else is recorded as a deletion
# candidate. Uncomment/adapt one of the example filters below to narrow the
# set of candidates.
sub unwanted {
	return if -d $_;
	return if $_ =~ m{^\.};
	my $target_directory_file_name = $File::Find::name;
########EXAMPLE REGEX#########
	#return if ($target_directory_file_name =~ m{^\.});
	#return unless ($target_directory_file_name =~ m{Programs});
	#return if ($target_directory_file_name =~ m{Downloads});
	#return unless ($_ =~ m{\.jpg|\.png|\.bmp|\.mpg|\.mp4|\.avi|\.mov|\.mvg}i);
	#return unless ($_ !~ m{\.jpg|\.png|\.bmp|\.mpg|\.mp4}i);
	#return if ($_ =~ m{\.jpg|\.png|\.bmp|\.mpg|\.mp4}i);
	#return if ($_ !~ m{\.jpg|\.png|\.bmp|\.mpg|\.mp4}i);
##############################
	push @target_array, [ $_, $target_directory_file_name ];
	$count2++;
	return;
}
# Report, in yellow, how many minutes have elapsed since the script started
# ($^T) and how many target files were collected.
# The timing line is printed to STDOUT like the other two timing reports, so
# it stays inside the colour codes and is not lost when STDOUT is redirected.
$runtime = (time - $^T)/60;
print color 'yellow';
print "Target Directory took ", $runtime, " minutes to complete\n";
print "$count2 :: Target Files";
print color 'reset';
print "\n";

# Compare every source file with the target files that share its bare file
# name, and delete the target copy when File::Compare reports identical
# content.
#
# Counters read by the final summary:
#   $count3 - source files examined
#   $count4 - same-name source/target pairs examined
#   $count5 - target files actually deleted
#   $Msize  - space reclaimed, in units of 1,000,000 bytes
my $Msize = 0;
my $count3 = 0;
my $count4 = 0;
my $count5 = 0;

# Index the target paths by bare file name so each source file only looks at
# targets that can possibly match, instead of walking the whole target list
# for every source file. Paths keep their @target_array order per name.
my %target_paths_for;
foreach my $target_entry (@target_array) {
	next unless ref $target_entry eq 'ARRAY';
	my ($target_file_name, $target_directory_file_name) = @{$target_entry};
	next unless defined $target_file_name && length $target_file_name;
	next unless defined $target_directory_file_name && length $target_directory_file_name;
	push @{ $target_paths_for{$target_file_name} }, $target_directory_file_name;
}

SOURCE:
foreach my $source_entry (@source_array) {
	next SOURCE unless ref $source_entry eq 'ARRAY';
	my ($source_file_name, $source_directory_file_name) = @{$source_entry};
	next SOURCE unless defined $source_file_name && length $source_file_name;
	next SOURCE unless defined $source_directory_file_name && length $source_directory_file_name;

	# Test the full path only: the bare name would be resolved against the
	# current working directory and could wrongly skip a file. Requiring a
	# regular file also keeps compare() from opening FIFOs or devices.
	next SOURCE unless -f $source_directory_file_name;
	$count3++;

	my $candidate_paths = $target_paths_for{$source_file_name} or next SOURCE;
	my $source_stat = stat($source_directory_file_name) or next SOURCE;

	TARGET:
	foreach my $target_directory_file_name (@{$candidate_paths}) {
		$count4++;
		next TARGET if $target_directory_file_name eq $source_directory_file_name;

		# Skip targets already deleted for an earlier source file, and
		# anything that is not a regular file.
		next TARGET unless -f $target_directory_file_name;
		my $target_stat = stat($target_directory_file_name) or next TARGET;

		# Never delete the source itself when it is reachable under a
		# different path (hard link, symlink, bind mount, overlapping trees).
		next TARGET
			if $target_stat->dev == $source_stat->dev
			&& $target_stat->ino == $source_stat->ino;

		print color 'white on_blue';
		print "$source_file_name ::: matched ::: $target_directory_file_name";
		print color 'reset';
		print "\n";

		# compare() returns 0 only for identical content; 1 (different) and
		# -1 (error) both mean the target must be kept.
		if (compare( $source_directory_file_name , $target_directory_file_name ) != 0) {
			print color 'white on_green';
			print "$count1 :: $count2 :: $count3 :: $count4 :: $count5 :: NOT :: $target_directory_file_name";
			print color 'reset';
			print "\n\n";
			next TARGET;
		}

		# Only count and report the file as deleted when unlink succeeded.
		unless (unlink $target_directory_file_name) {
			warn "Could not delete $target_directory_file_name: $!\n";
			next TARGET;
		}
		$count5++;
		$Msize += $target_stat->size / 1000000;

		# Fraction (0..1) of the source files examined so far.
		my $percent = $count3/$count1;
		print color 'white on_red';
		print "$count1 :: $count3 :: $percent :: $count4 :: $count5 :: $Msize Mb";
		print color 'reset';
		print "\n";
		print color 'white on_red';
		print "DELETED :: $target_directory_file_name";
		print color 'reset';
		print "\n\n";
	}
}

# Post-run cleanup of the target tree:
#   1. remove zero-length regular files,
#   2. remove empty directories.
#
# The list form of system() bypasses the shell. In the previous string form
# the "\;" inside double quotes became a bare ";", which the shell consumed
# as a command separator, so find aborted with "missing argument to -exec";
# an unquoted target path containing spaces would also have been split.
# The first pass is limited to regular files (-type f) so that "rm -R" can
# never be applied to a directory that merely reports a size of 0.
#
# NOTE(review): pass 1 removes EVERY empty file below $folder_target
# (placeholders, lock files, ...), not only duplicates - confirm this is
# really wanted before pointing the script at a home directory.
my @cleanup_passes = (
	[ '-type', 'f', '-size', '0', '-exec', 'rm', '-fv',  '{}', ';' ],
	[ '-type', 'd', '-empty',     '-exec', 'rm', '-Rfv', '{}', ';' ],
);
foreach my $find_arguments (@cleanup_passes) {
	system('find', $folder_target, @{$find_arguments});
	# $? is -1 only when the command could not be started at all.
	warn "Could not run find: $!\n" if $? == -1;
}

# Final summary, printed in red: total minutes since script start ($^T),
# the processed counter, the number of deleted files and the reclaimed
# megabytes. The script then exits with status 0.
$runtime = (time - $^T) / 60;
print color('red'),
	"Total runtime $runtime minutes \n",
	"$count4 :: Processed Files\n",
	"$count5 :: Deleted Files\n",
	"$Msize :: Megabytes Deleted",
	color('reset'),
	"\n";

exit 0;

__END__

You may also like...