1123 lines
		
	
	
		
			27 KiB
		
	
	
	
		
			Perl
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			1123 lines
		
	
	
		
			27 KiB
		
	
	
	
		
			Perl
		
	
	
		
			Executable File
		
	
	
	
	
| #!/usr/bin/perl -w
 | |
| my $Id = '';
 | |
| #
 | |
| # findimagedupes - Finds visually similar or duplicate images
 | |
| #
 | |
| # Copyright © 2006-2022 by Jonathan H N Chin <code@jhnc.org>.
 | |
| #
 | |
| # This program is free software; you may redistribute it and/or modify
 | |
| # it under the terms of the GNU General Public License as published by
 | |
| # the Free Software Foundation; either version 3 of the License, or
 | |
| # (at your option) any later version.
 | |
| #
 | |
| # This program is distributed in the hope that it will be useful,
 | |
| # but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | |
| # GNU General Public License for more details.
 | |
| #
 | |
| # You should have received a copy of the GNU General Public License
 | |
| # along with this program; if not, write to the Free Software
 | |
| # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 | |
| #
 | |
| 
 | |
| use strict;
 | |
| 
 | |
| require 5.006_001;
 | |
| 
 | |
| use Cwd qw(realpath);
 | |
| use DB_File;
 | |
| use Digest::MD5 qw(md5_hex);
 | |
| use Getopt::Long qw(:config no_ignore_case require_order);
 | |
| use File::MimeInfo::Magic;
 | |
| use File::Temp qw(tempdir tempfile);
 | |
| use Graphics::Magick;
 | |
| use MIME::Base64;
 | |
| use Pod::Usage;
 | |
| 
 | |
| use Inline
 | |
| 	C => 'DATA',
 | |
| 	NAME => 'findimagedupes',
 | |
| 	DIRECTORY => '/tmp';
 | |
| 
 | |
| # ----------------------------------------------------------------------
 | |
| #
 | |
| # option parsing
 | |
| #
 | |
| 
 | |
| use vars qw(
 | |
| 	$null
 | |
| 	$add
 | |
| 	$collection
 | |
| 	@debug %debug
 | |
| 	@fpdb
 | |
| 	$merge
 | |
| 	$mergeFile
 | |
| 	$nocompare
 | |
| 	$program
 | |
| 	$quiet
 | |
| 	$rescan
 | |
| 	$recurse
 | |
| 	$script
 | |
| 	@scriptCode
 | |
| 	$scriptFile
 | |
| 	$threshold
 | |
| 	@verbosity
 | |
| 	$prune
 | |
| );
 | |
| 
 | |
| $add = 0;
 | |
| $quiet = 0;
 | |
| $threshold = '90%';
 | |
| 
 | |
| my %opt;
 | |
| GetOptions(
 | |
| 	'0|null'                 => \$null,
 | |
| 	'a|add'                  => \$add,
 | |
| 	'c|collection=s'         => sub { ($collection = $_[1]) =~ s/([.]gqv)?$/.gqv/ },
 | |
| 	'd|debug=s'              => \@debug,
 | |
| 	'f|fingerprints|fp|db=s' => \@fpdb,
 | |
| 	'h|?|help'               => sub { pod2usage(-verbose => 1) },
 | |
| 	'man'                    => sub { pod2usage(-verbose => 2) },
 | |
| 	'M|merge=s'              => sub { $merge = 1; $mergeFile = $_[1]; },
 | |
| 	'n|no-compare'           => \$nocompare,
 | |
| 	'P|prune'                => \$prune,
 | |
| 	'p|program=s'            => \$program,
 | |
| 	'q|quiet+'               => \$quiet,
 | |
| 	'R|recurse'              => \$recurse,
 | |
| 	'r|rescan'               => \$rescan,
 | |
| 	's|script=s'             => \$script,
 | |
| 	't|threshold=s'          => \$threshold,
 | |
| 	'v|verbosity=s'          => \@verbosity,
 | |
| 	'version'                => sub { print "findimagedupes $Id\n"; exit(0); },
 | |
| 
 | |
| 	'i|include=s'            => \@scriptCode,
 | |
| 	'I|include-file=s'       => \$scriptFile,
 | |
| ) or pod2usage(-verbose => 0);
 | |
| 
 | |
| # ----------------------------------------------------------------------
 | |
| 
 | |
| my @errors = ();
 | |
| my @warnings = ();
 | |
| 
 | |
| sub exitvalue {
 | |
| 	return(2) if @errors;
 | |
| 	return(1) if @warnings;
 | |
| 	return(0);
 | |
| }
 | |
| 
 | |
| sub mkerr { push @errors, join("", @_); }
 | |
| sub mkwarn { push @warnings, join("", @_); }
 | |
| 
 | |
| sub nqprint { print(@_) unless $quiet; }
 | |
| sub nqwarn { warn("Warning: ", @_) unless $quiet; }
 | |
| sub nqdie { nqwarn(@_); die; }
 | |
| sub nqdie2 { warn("Error: ", @_) if $quiet<2; die; }
 | |
| sub nqexit { warn("Error: ", @_) if $quiet<2; exit(3); }
 | |
| 
 | |
| my $inFP = 0;
 | |
| $SIG{SEGV} = sub { die $inFP ? "caught segfault" : ()};
 | |
| 
 | |
| sub printScriptFile {
 | |
| 	print("# BEGIN USER FILE INCLUDE\n");
 | |
| 	open(USERSCRIPT, "< $scriptFile") or die("$!\n");
 | |
| 	while (<USERSCRIPT>) {
 | |
| 		chomp;
 | |
| 		print "$_\n";
 | |
| 	}
 | |
| 	close(USERSCRIPT);
 | |
| 	print("# END USER FILE INCLUDE\n\n");
 | |
| }
 | |
| 
 | |
| # ----------------------------------------------------------------------
 | |
| #
 | |
| # setup
 | |
| #
 | |
| 
 | |
| my ($verb_fp, $verb_md5);
 | |
| 
 | |
| my $read_input = grep(/^-$/, @ARGV);
 | |
| 
 | |
| # XXX: can we tie these to save memory without breaking hv_iterinit() ?
 | |
| my (%fpcache, %filelist);
 | |
| 
 | |
| my $image = Graphics::Magick->new;
 | |
| 
 | |
| for (@debug) { $debug{$_} = 1 }
 | |
| 
 | |
| for (@scriptCode) { chomp; $_ .= "\n" }
 | |
| 
 | |
| if (@scriptCode) {
 | |
| 	unshift(@scriptCode, "# BEGIN USER CODE INCLUDE\n" );
 | |
| 	push(@scriptCode, "# END USER CODE INCLUDE\n\n" );
 | |
| }
 | |
| 
 | |
| my $scriptHeader = <<'EOD';
 | |
| #!/bin/sh
 | |
| 
 | |
| VIEW(){
 | |
| 	echo "$@"
 | |
| }
 | |
| END(){
 | |
| 	:;
 | |
| }
 | |
| 
 | |
| EOD
 | |
| 
 | |
| my $collectionHeader = <<'EOD';
 | |
| #GQview collection
 | |
| #created with findimagedupes
 | |
| # sort: manual
 | |
| EOD
 | |
| 
 | |
| my $collectionFooter = <<'EOD';
 | |
| #end
 | |
| EOD
 | |
| 
 | |
| # ----------------------------------------------------------------------
 | |
| #
 | |
| # sanity checks
 | |
| #
 | |
| 
 | |
| # +----------+
 | |
| # | warnings |
 | |
| # +----------+
 | |
| 
 | |
| if ($read_input>1) {
 | |
| 	mkwarn("extra occurrences of \"-\" will be ignored");
 | |
| }
 | |
| 
 | |
| if ($null and !$read_input) {
 | |
| 	mkwarn("--null has no effect in this context");
 | |
| }
 | |
| 
 | |
| if ($prune and !@fpdb) {
 | |
| 	mkwarn("--prune has no effect in this context");
 | |
| }
 | |
| 
 | |
| if ($nocompare) {
 | |
| 	mkwarn("--program ignored because --no-compare given") if $program;
 | |
| 	mkwarn("--script ignored because --no-compare given") if $script;
 | |
| }
 | |
| 
 | |
| if ($merge and $mergeFile eq '/dev/null') {
 | |
| 	$mergeFile = undef; # use DB_File memory database
 | |
| 	mkwarn("merge database is temporary");
 | |
| }
 | |
| 
 | |
| if ($prune and $merge and !defined($mergeFile)) {
 | |
| 	mkwarn("--prune is pointless when --merge database is temporary");
 | |
| }
 | |
| 
 | |
| if (@warnings and !$quiet) {
 | |
| 	warn( join("\n", map {"Warning: $_"} @warnings), "\n" );
 | |
| }
 | |
| 
 | |
| # +--------+
 | |
| # | errors |
 | |
| # +--------+
 | |
| 
 | |
| if ($collection and -e($collection)) {
 | |
| 	mkerr("Output file for --collection exists: $collection");
 | |
| }
 | |
| 
 | |
| for (@fpdb) {
 | |
| 	mkerr("File for --fingerprints does not exist: $_")
 | |
| 		unless (-f($_) or (@fpdb==1 and !$merge));
 | |
| }
 | |
| 
 | |
| if (@fpdb>1 and !$merge) {
 | |
| 	mkerr("Require --merge if using multiple fingerprint databases");
 | |
| }
 | |
| 
 | |
| if ($merge and defined($mergeFile) and -e($mergeFile)) {
 | |
| 	mkerr("Output file for --merge exists: $mergeFile");
 | |
| }
 | |
| 
 | |
| if ($program) {
 | |
| 	if (! -e($program)) {
 | |
| 		mkerr("File for --program does not exist: $program");
 | |
| 	}
 | |
| 	elsif (! -x($program)) {
 | |
| 		mkerr("File for --program not executable: $program");
 | |
| 	}
 | |
| }
 | |
| 
 | |
| if ($script and -e($script)) {
 | |
| 	mkerr("Output file for --script exists: $script");
 | |
| }
 | |
| 
 | |
| if ($scriptFile) {
 | |
| 	if (! -f($scriptFile)) {
 | |
| 		mkerr("File for --include-file does not exist: $scriptFile");
 | |
| 	}
 | |
| 	elsif (! -r($scriptFile)) {
 | |
| 		mkerr("File for --include-file is not readable: $scriptFile");
 | |
| 	}
 | |
| }
 | |
| 
 | |
| if (my ($thres_val, $thres_unit) = $threshold =~ m/^\s*(\d+(?:[.]\d+)?)\s*([%b]?)\s*$/) {
 | |
| 	if ($thres_unit eq '' or $thres_unit eq '%') {
 | |
| 		# percentage to bits
 | |
| 		$threshold = int(2.56 * (100 - $thres_val));
 | |
| 	}
 | |
| 	elsif ($thres_unit =~ m/^b/i) {
 | |
| 		# already in bits
 | |
| 		$threshold = int($thres_val);
 | |
| 	}
 | |
| }
 | |
| else {
 | |
| 	# error
 | |
| 	$threshold = -1;
 | |
| }
 | |
| if ($threshold>256 or $threshold<0) {
 | |
| 	mkerr("--threshold takes values between 0.0% .. 100.0% or 0b .. 256b");
 | |
| }
 | |
| 
 | |
| for (split(",", join(",", @verbosity))) {
 | |
| 	/^(fingerprint|fp)$/ && do { $verb_fp = 1;  next; };
 | |
| 	/^md5$/              && do { $verb_md5 = 1; next; };
 | |
| 	mkerr("unknown option to --verbosity: $_");
 | |
| }
 | |
| 
 | |
| if (@errors) {
 | |
| 	exit(exitvalue()) unless $quiet<2;
 | |
| 	pod2usage(
 | |
| 		-verbose => 0,
 | |
| 		-exitval => exitvalue(),
 | |
| 		-msg => join("\n", map {"Error: $_"} @errors),
 | |
| 	);
 | |
| }
 | |
| 
 | |
| # +-------------------------------------------------+
 | |
| # | last chance to abort without altering any files |
 | |
| # +-------------------------------------------------+
 | |
| 
 | |
| unless (@ARGV>0 or @fpdb or $merge) {
 | |
| 	exit(exitvalue()) unless $quiet<2;
 | |
| 	warn("Nothing to do!\n") unless @warnings or $quiet;
 | |
| 	pod2usage(
 | |
| 		-verbose => 0,
 | |
| 		-exitval => exitvalue(),
 | |
| 	);
 | |
| }
 | |
| 
 | |
| # ----------------------------------------------------------------------
 | |
| #
 | |
| # load fingerprint cache
 | |
| #
 | |
| 
 | |
| my @regen = ();
 | |
| 
 | |
| for my $db (@fpdb) {
 | |
| 	my %data;
 | |
| 	tie(%data, 'DB_File', $db) or nqexit("tie($db): $!\n");
 | |
| 	while (my ($file, $fp) = each %data) {
 | |
| 		next if ($prune && !-f($file));
 | |
| 
 | |
| 		if (exists $fpcache{$file}) {
 | |
| 			if ($fpcache{$file} ne $fp) {
 | |
| 				# fingerprint mismatch, force regeneration
 | |
| 				push @regen, $file;
 | |
| 				delete $fpcache{$file};
 | |
| 			}
 | |
| 		}
 | |
| 		else {
 | |
| 			$fpcache{$file} = $fp;
 | |
| 		}
 | |
| 	}
 | |
| 	untie(%data);
 | |
| }
 | |
| 
 | |
| # ----------------------------------------------------------------------
 | |
| #
 | |
| # build file list
 | |
| #
 | |
| 
 | |
| my %mergelist;
 | |
| my $rw = 0;
 | |
| 
 | |
| if ($merge) {
 | |
| 	tie(%mergelist, 'DB_File', $mergeFile) or nqexit("tie($merge): $!\n");
 | |
| 	%mergelist = %fpcache;
 | |
| 	$rw = 1;
 | |
| }
 | |
| elsif (@fpdb==1) {
 | |
| 	tie(%mergelist, 'DB_File', $fpdb[0]) or nqexit("tie($fpdb[0]): $!\n");
 | |
| 	%mergelist = %fpcache if $prune;
 | |
| 	$rw = 1;
 | |
| }
 | |
| 
 | |
| $| = 1;
 | |
| $/ = "\0" if $null;
 | |
| 
 | |
| for (@ARGV ? @ARGV : @regen) {
 | |
| 	classify($_);
 | |
| }
 | |
| 
 | |
| untie(%mergelist);
 | |
| 
 | |
| finddupes() unless $nocompare;
 | |
| 
 | |
| exit 0;
 | |
| 
 | |
| # ----------------------------------------------------------------------
 | |
| 
 | |
| sub process_file {
 | |
| 	my ($path) = @_;
 | |
| 	my $file = realpath($path); # normalize to absolute canonical path
 | |
| 	if (!$file) {
 | |
| 		nqwarn("skipping bogus file: $path\n");
 | |
| 	}
 | |
| 	else {
 | |
| 		my $fp;
 | |
| 		if ($rescan or !exists $fpcache{$file}) {
 | |
| 			$fp = fingerprint($file);
 | |
| 		}
 | |
| 		elsif ($add and exists $fpcache{$file}) {
 | |
| 			$fp = $fpcache{$file};
 | |
| 		}
 | |
| 		if ($fp) {
 | |
| 			$filelist{$file} = $fp;
 | |
| 			delete $fpcache{$file};
 | |
| 			$mergelist{$file} = $filelist{$file} if $rw;
 | |
| 		}
 | |
| 
 | |
| 		if ($verb_fp) {
 | |
| 			my $fp = ( $filelist{$file} || $fpcache{$file} );
 | |
| 			if ($fp) {
 | |
| 				print(encode_base64($fp, ""), "  $file\n");
 | |
| 			}
 | |
| 			else {
 | |
| 				nqwarn("can't get fingerprint: $file\n");
 | |
| 			}
 | |
| 		}
 | |
| 		if ($verb_md5) {
 | |
| 			open(FILE, $file) or nqdie2("open($file): $!\n");
 | |
| 			binmode(FILE);
 | |
| 			my $digest = Digest::MD5->new->addfile(*FILE)->hexdigest;
 | |
| 			if ($digest) {
 | |
| 				print("$digest  $file\n");
 | |
| 			}
 | |
| 			else {
 | |
| 				nqwarn("can't get md5sum: $file\n");
 | |
| 			}
 | |
| 			close(FILE);
 | |
| 		}
 | |
| 	}
 | |
| }
 | |
| 
 | |
| # XXX: This function is complicated by two things:
 | |
| #	- Historically we didn't recurse and it would be nice to
 | |
| #		retain nonrecursion as an option.
 | |
| #	- We need to process "." and ".." when they are given
 | |
| #		explicitly on the command line.
 | |
| #	Perhaps we could cleanup the code by rewriting using
 | |
| #		something like the "-" code?
 | |
| sub classify {
 | |
| 	my ($file) = @_;
 | |
| 
 | |
| 	if ($file eq "-") {
 | |
| 		if ($read_input) {
 | |
| 			$read_input = 0;
 | |
| 			while (<STDIN>) {
 | |
| 				chomp;
 | |
| 				classify($_);
 | |
| 			}
 | |
| 		}
 | |
| 		else {
 | |
| 			# silently ignore any extra occurrences of "-"
 | |
| 			# (we already reported them at startup)
 | |
| 		}
 | |
| 	}
 | |
| 	elsif (-d($file) and !-l($file)) {
 | |
| 		# don't follow directory symlinks, to prevent looping
 | |
| 		if (opendir(DIR, $file)) {
 | |
| 			for (readdir(DIR)) {
 | |
| 				my $path = "$file/$_";
 | |
| 				if (-d($path) and !-l($path)) {
 | |
| 					next if m/^\.\.?$/;	# skip . and ..
 | |
| 					classify($path) if $recurse;
 | |
| 				}
 | |
| 				elsif (-f($path)) {
 | |
| 					process_file($path);
 | |
| 				}
 | |
| 			}
 | |
| 		}
 | |
| 		else {
 | |
| 			nqwarn("can't process directory $file: $@\n");
 | |
| 		}
 | |
| 	}
 | |
| 	elsif (-f($file)) {
 | |
| 		# symlinks are okay for normal files
 | |
| 		process_file($file);
 | |
| 	}
 | |
| 	else {
 | |
| 		# skip anything else (devices, etc)
 | |
| 		nqwarn("skipping file: $file\n");
 | |
| 	}
 | |
| }
 | |
| 
 | |
| # ----------------------------------------------------------------------
 | |
| 
 | |
| sub try {
 | |
| 	my ($err) = @_;
 | |
| 	if ($err and $err !~ /Warning (315|330):/) {
 | |
| 		die("imagemagick problem: $err\n");
 | |
| 	}
 | |
| }
 | |
| 
 | |
| sub fingerprint {
 | |
| 	my ($file) = @_;
 | |
| 	my $blob;
 | |
| 
 | |
| 	# imagemagick doesn't always catch output from the programs
 | |
| 	# it spawns, so we have to clean up for it...
 | |
| 	open(SAVED_OUT, ">&", \*STDOUT) or nqdie2("open(/dev/null): $!");
 | |
| 	open(SAVED_ERR, ">&", \*STDERR) or nqdie2("open(/dev/null): $!");
 | |
| 	open(STDOUT, ">/dev/null");
 | |
| 	open(STDERR, ">/dev/null");
 | |
| 
 | |
| 	$inFP = 1;
 | |
| 	my $result = eval {
 | |
| 		if ((mimetype($file)||'') =~ /^(audio|video)/) {
 | |
| 			die("not fingerprinting A/V file: $file\n");
 | |
| 		}
 | |
| 
 | |
| 		if (!$image->Ping($file)) {
 | |
| 			die("not fingerprinting unknown-type file: $file\n");
 | |
| 		}
 | |
| 
 | |
| 		try $image->Read($file);
 | |
| 
 | |
| 		if ($#$image<0) {
 | |
| 			die("fingerprint: not enough image data for $file");
 | |
| 		}
 | |
| 		else {
 | |
| 			$#$image = 0;
 | |
| 		}
 | |
| 		try $image->Sample("160x160!");
 | |
| 		try $image->Modulate(saturation=>-100);
 | |
| 		try $image->Blur(radius=>3,sigma=>99);
 | |
| 		try $image->Normalize();
 | |
| 		try $image->Equalize();
 | |
| 		try $image->Sample("16x16");
 | |
| 		try $image->Threshold();
 | |
| 		try $image->Set(magick=>'mono');
 | |
| 
 | |
| 		($blob) = $image->ImageToBlob();
 | |
| 		if (!defined($blob)) {
 | |
| 			die("This can't happen! undefined blob for: $file\n");
 | |
| 		}
 | |
| 	};
 | |
| 
 | |
| 	$inFP = 0;
 | |
| 	@$image = ();
 | |
| 
 | |
| 	open(STDOUT, ">&", \*SAVED_OUT) or nqdie2("open(/dev/null): $!");
 | |
| 	open(STDERR, ">&", \*SAVED_ERR) or nqdie2("open(/dev/null): $!");
 | |
| 	close(SAVED_OUT);
 | |
| 	close(SAVED_ERR);
 | |
| 
 | |
| 	if (defined $result) {
 | |
| 		return $blob;
 | |
| 	}
 | |
| 	else {
 | |
| 		nqwarn($@);
 | |
| 		return undef;
 | |
| 	}
 | |
| }
 | |
| 
 | |
| sub finddupes {
 | |
| 	my @matches = diffbits(\%fpcache, \%filelist, $threshold, $add);
 | |
| 
 | |
| 	my (%set, %ptr, %val);
 | |
| 
 | |
| 	while (@matches) {
 | |
| 		my $a = shift(@matches);
 | |
| 		my $b = shift(@matches);
 | |
| 		my $c = shift(@matches);
 | |
| 		$set{$a} = 1;
 | |
| 		$set{$b} = 1;
 | |
| 
 | |
| 		# cf. debian bug #87013
 | |
| 
 | |
| 		if (!defined($ptr{$a}) and !defined($ptr{$b})) {
 | |
| 			$ptr{$a} = $a;
 | |
| 			push @{$val{$a}}, $a, $b;
 | |
| 			$ptr{$b} = $a;
 | |
| 			$#{$val{$b}} = 0;
 | |
| 		}
 | |
| 		elsif (defined($ptr{$a}) and !defined($ptr{$b})) {
 | |
| 			push @{$val{$ptr{$a}}}, $b;
 | |
| 			$ptr{$b} = $ptr{$a};
 | |
| 			$#{$val{$b}} = 0;
 | |
| 		}
 | |
| 		elsif (!defined($ptr{$a}) and defined($ptr{$b})) {
 | |
| 			push @{$val{$ptr{$b}}}, $a;
 | |
| 			$ptr{$a} = $ptr{$b};
 | |
| 			$#{$val{$a}} = 0;
 | |
| 		}
 | |
| 		elsif ($ptr{$a} ne $ptr{$b}) {
 | |
| 			my $valptrb = $val{$ptr{$b}};
 | |
| 			push @{$val{$ptr{$a}}}, @{$valptrb};
 | |
| 			for my $bkey (@{$valptrb}) {
 | |
| 				$ptr{$bkey} = $ptr{$a};
 | |
| 			}
 | |
| 			$#$valptrb = 0;
 | |
| 			# else $val{$a} is $val{$b} already
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	my $cnt = 0;
 | |
| 	for my $k (keys %filelist, keys %fpcache) {
 | |
| 		$set{$cnt} = $k if defined $set{$cnt};
 | |
| 		$cnt++;
 | |
| 	}
 | |
| 
 | |
| 	# FIXME: What is the proper format of collection files?
 | |
| 	#	It seems to be poorly defined, and gthumb parses them
 | |
| 	#	differently from gqview itself. In particular, gthumb
 | |
| 	#	seems to misparse comment lines, and the sort pragma
 | |
| 	#	it defines seems to be fragile wrt whitespace.
 | |
| 	if ($collection) {
 | |
| 		open(COLLECTION, "> $collection") or nqdie2("open(> $collection): $!\n");
 | |
| 		select(COLLECTION);
 | |
| 
 | |
| 		print $collectionHeader;
 | |
| 		for my $k (keys %ptr) {
 | |
| 			next unless $ptr{$k} eq $k;
 | |
| 			for ( @{$val{$ptr{$k}}} ) {
 | |
| 				my $name = $set{$_};
 | |
| 
 | |
| 				if ( $name =~ /[\n"]/s ) {
 | |
| 					nqwarn("excluded from $collection: $name\n");
 | |
| 				}
 | |
| 				else {
 | |
| 					print qq{"$name"\n};
 | |
| 				}
 | |
| 			}
 | |
| 		}
 | |
| 		print $collectionFooter;
 | |
| 		close(COLLECTION);
 | |
| 	}
 | |
| 
 | |
| 	if ($script) {
 | |
| 		open(COMMANDS, "> $script") or nqdie2("open(> $script): $!\n");
 | |
| 	}
 | |
| 	else {
 | |
| 		open(COMMANDS, "| /bin/sh") or nqdie2("open(| /bin/sh): $!\n");
 | |
| 	}
 | |
| 	select(COMMANDS);
 | |
| 
 | |
| 	if ($script or $program or !$collection) {
 | |
| 		$program = 'VIEW' unless $program;
 | |
| 		print $scriptHeader;
 | |
| 		printScriptFile if $scriptFile;
 | |
| 		print @scriptCode if @scriptCode;
 | |
| 		for my $k (keys %ptr) {
 | |
| 			next unless $ptr{$k} eq $k;
 | |
| 			print join(" \\\n\t",
 | |
| 				$program,
 | |
| 				( map { quotemeta $set{$_} } @{$val{$ptr{$k}}} ),
 | |
| 				";\n"
 | |
| 			);
 | |
| 		}
 | |
| 		print "\nEND;\n\n";
 | |
| 	}
 | |
| 	close(COMMANDS);
 | |
| }
 | |
| 
 | |
| # ======================================================================
 | |
| 
 | |
| __DATA__
 | |
| 
 | |
| =encoding UTF-8
 | |
| 
 | |
| =head1 NAME
 | |
| 
 | |
| findimagedupes - Finds visually similar or duplicate images
 | |
| 
 | |
| =head1 SYNOPSIS
 | |
| 
 | |
| findimagedupes [option ...] [--] [ - | [file ...] ]
 | |
| 
 | |
|    Options:
 | |
|       -f, --fingerprints=FILE    -c, --collection=FILE
 | |
|       -M, --merge=FILE           -p, --program=PROGRAM
 | |
|       -P, --prune                -s, --script=FILE
 | |
|       -a, --add                  -i, --include=TEXT
 | |
|       -r, --rescan               -I, --include-file=FILE
 | |
|       -n, --no-compare
 | |
|                                  -q, --quiet
 | |
|       -t, --threshold=AMOUNT     -v, --verbosity=LIST
 | |
| 
 | |
|       -0, --null                 -h, --help
 | |
|       -R, --recurse                  --man
 | |
|                                      --version
 | |
| 
 | |
| With no options, compares the specified files and does not use nor
 | |
| update any fingerprint database.
 | |
| 
 | |
| Directories of images may be specified instead of individual files;
 | |
| Sub-directories of these are not searched unless --recurse is used.
 | |
| 
 | |
| =head1 INSTALLATION
 | |
| 
 | |
| If you use linux, your distribution may include a prepackaged version.
 | |
| For example, Debian and Ubuntu do.
 | |
| 
 | |
| Otherwise, at a minimum you'll need Perl with the modules listed at the
 | |
| top of the findimagedupes script. Also the GraphicksMagick package.
 | |
| 
 | |
| You may need to change Inline's C<DIRECTORY> to point somewhere else.
 | |
| Read the Inline module documentation for details.
 | |
| 
 | |
| =head1 OPTIONS
 | |
| 
 | |
| =over 8
 | |
| 
 | |
| =item B<-0>, B<--null>
 | |
| 
 | |
| If a file C<-> is given, a list of files is read from stdin.
 | |
| 
 | |
| Without B<-0>, the list is specified one file per line, such as
 | |
| produced by find(1) with its C<-print> option.
 | |
| 
 | |
| With B<-0>, the list is expected to be null-delimited, such as
 | |
| produced by find(1) with its C<-print0> option.
 | |
| 
 | |
| =item B<-a>, B<--add>
 | |
| 
 | |
| Only look for duplicates of files specified on the commandline.
 | |
| 
 | |
| Matches are also sought in any fingerprint databases specified.
 | |
| 
 | |
| =item B<-c>, B<--collection>=I<FILE>
 | |
| 
 | |
| Create GQView collection I<FILE>.gqv of duplicates.
 | |
| 
 | |
| The program attempts to produce well-formed collections.
 | |
| In particular, it will print a warning and exclude any file
 | |
| whose name contains newline or doublequote. (In this situation,
 | |
| gqview(1) seems to create a .gqv collection file that it
 | |
| silently fails to read back in properly.)
 | |
| 
 | |
| =item B<-d>, B<--debug>=I<OPTS>
 | |
| 
 | |
| Enable debugging output. Options I<OPTS> are subject to change.
 | |
| See the program source for details.
 | |
| 
 | |
| =item B<-f>, B<--fingerprints>=I<FILE>
 | |
| 
 | |
| Use I<FILE> as fingerprint database.
 | |
| 
 | |
| May be abbreviated as B<--fp> or B<--db>.
 | |
| 
 | |
| This option may be given multiple times when B<--merge> is used.
 | |
| (Note: I<FILE> could contain commas, so multiple databases may
 | |
| not be specified as a single comma-delimited list.)
 | |
| 
 | |
| =item B<-h>, B<--help>
 | |
| 
 | |
| Print usage and option sections of this manual, then exit.
 | |
| 
 | |
| =item B<-i>, B<--include>=I<TEXT>
 | |
| 
 | |
| I<TEXT> is Bourne-shell code to customise B<--script>.
 | |
| 
 | |
| It is executed after any code included using B<--include-file>.
 | |
| 
 | |
| May be given multiple times. Code will be concatenated.
 | |
| 
 | |
| =item B<-I>, B<--include-file>=I<FILE>
 | |
| 
 | |
| I<FILE> is a file containing Bourne-shell code to customise 
 | |
| B<--script>. 
 | |
| 
 | |
| It is executed before any code included using B<--include>.
 | |
| 
 | |
| =item B<--man>
 | |
| 
 | |
| Display the full documentation, using default pager, then exit.
 | |
| 
 | |
| =item B<-M>, B<--merge>=I<FILE>
 | |
| 
 | |
| Takes any databases specified with B<--fingerprints>
 | |
| and merges them into a new database called I<FILE>.
 | |
| Conflicting fingerprints for an image will cause one of two actions to occur:
 | |
| 
 | |
| =over 4
 | |
| 
 | |
| =item 1.
 | |
| 
 | |
| If the image does not exist, then the entry is elided.
 | |
| 
 | |
| =item 2.
 | |
| 
 | |
| If the image does exist, then the old information is ignored
 | |
| and a new fingerprint is generated from scratch.
 | |
| 
 | |
| =back
 | |
| 
 | |
| By default, image existence is not checked unless there is a conflict.
 | |
| To force removal of defunct data, use B<--prune> as well.
 | |
| 
 | |
| A list of image files is not required if this option is used.
 | |
| However, if a list is provided, fingerprint data for the files
 | |
| will be copied or (re)generated as appropriate.
 | |
| 
 | |
| When B<--merge> is used, the original fingerprint databases are not modified,
 | |
| even if B<--prune> is used.
 | |
| 
 | |
| If multiple fingerprint databases are to be used but the merge output is
 | |
| not required, specify: B<--merge>=I</dev/null>
 | |
| 
 | |
| See also: B<--rescan>
 | |
| 
 | |
| =item B<-n>, B<--no-compare>
 | |
| 
 | |
| Don't look for duplicates.
 | |
| 
 | |
| =item B<-p>, B<--program>=I<PROGRAM>
 | |
| 
 | |
| Launch I<PROGRAM> (in foreground) to view each set of dupes.
 | |
| 
 | |
| I<PROGRAM> must be the full path to an existing executable file.
 | |
| For more flexibility, see the B<--include> and B<--include-file>
 | |
| options.
 | |
| 
 | |
| See also: B<--script>
 | |
| 
 | |
| =item B<-P>, B<--prune>
 | |
| 
 | |
| Remove fingerprint data for images that do not exist any more.
 | |
| Has no effect unless B<--fingerprints> or B<--merge> is also used.
 | |
| 
 | |
| Databases specified by B<--fingerprints> are only modified if
 | |
| B<--merge> is not used.
 | |
| 
 | |
| =item B<-q>, B<--quiet>
 | |
| 
 | |
| This option may be given multiple times.
 | |
| 
 | |
| Usually, progress, warning and error messages are printed on stderr.
 | |
| If this option is given, warnings are not displayed.
 | |
| If it is given twice or more, errors are not displayed either.
 | |
| 
 | |
| Information requested with B<--verbosity> is still displayed.
 | |
| 
 | |
| =item B<-R>, B<--recurse>
 | |
| 
 | |
| Use B<--recurse> to search recursively for images inside
 | |
| subdirectories. For historical reasons, the default is to not do so.
 | |
| To avoid looping, symbolic links to directories are never followed.
 | |
| 
 | |
| =item B<-r>, B<--rescan>
 | |
| 
 | |
| (Re)generate all fingerprints, not just any that are unknown.
 | |
| 
 | |
| If used with B<--add>, only the fingerprints of files specified
 | |
| on the commandline are (re)generated.
 | |
| 
 | |
| Implies B<--prune>.
 | |
| 
 | |
| =item B<-s>, B<--script>=I<FILE>
 | |
| 
 | |
| When used with B<--program>, I<PROGRAM> is not launched immediately.
 | |
| Instead sh(1)-style commands are saved to I<FILE>.
 | |
| This script may be edited (if desired) and then executed manually.
 | |
| 
 | |
| When used without B<--program>, two skeletal shell functions
 | |
| are generated: C<VIEW> simply echo(1)s its arguments; 
 | |
| the empty function C<END> runs after files-processing is finished.
 | |
| 
 | |
| To display to terminal (or feed into a pipe), use C<-> as I<FILE>.
 | |
| 
 | |
| If B<--script> is not given, the script is still created in memory and
 | |
| is executed immediately. So, with the default VIEW and END functions,
 | |
| lines containing sets of duplicates are displayed. See: B<EXAMPLES>
 | |
| 
 | |
| See also: B<--include>, B<--include-file>
 | |
| 
 | |
| =item B<-t>, B<--threshold>=I<AMOUNT>
 | |
| 
 | |
| Use I<AMOUNT> as threshold of similarity.
 | |
| Append C<%> to give a percentage or C<b> for bits.
 | |
| For backwards compatibility, a number with no unit is treated as
 | |
| a percentage. Percentage is the minimum required for a match;
 | |
| bits is the maximum that may differ: bits=floor(2.56(100-percent))
 | |
| 
 | |
| A fractional part may be given but it is only accurate to 100/256
 | |
| (0.390625) for percentage and it is meaningless for C<bits>.
 | |
| Default is C<90%> (C<25b>) if not specified.
 | |
| 
 | |
| =item B<-v>, B<--verbosity>=I<LIST>
 | |
| 
 | |
| Enable display of informational messages to stdout,
 | |
| where I<LIST> is a comma-delimited list of:
 | |
| 
 | |
| =over 8
 | |
| 
 | |
| =item B<md5>
 | |
| 
 | |
| Display the checksum for each file, as per md5sum(1).
 | |
| 
 | |
| =item B<fingerprint> | B<fp>
 | |
| 
 | |
| Display the base64-encoded fingerprint of each file.
 | |
| 
 | |
| =back
 | |
| 
 | |
| Alternatively, B<--verbosity> may be given multiple times, and accumulates.
 | |
| Note that this may not be sensible. For example, to be useful,
 | |
| B<md5> output probably should not be merged with B<fingerprint> data.
 | |
| 
 | |
| =item B<--version>
 | |
| 
 | |
| Display the program version, then exit.
 | |
| 
 | |
| =back
 | |
| 
 | |
| =head1 DESCRIPTION
 | |
| 
 | |
| B<findimagedupes> compares a list of files for visual similarity.
 | |
| 
 | |
| =over 1
 | |
| 
 | |
| =item To calculate an image fingerprint:
 | |
| 
 | |
|  1) Read image.
 | |
|  2) Resample to 160x160 to standardize size.
 | |
|  3) Grayscale by reducing saturation.
 | |
|  4) Blur a lot to get rid of noise.
 | |
|  5) Normalize to spread out intensity as much as possible.
 | |
|  6) Equalize to make image as contrasty as possible.
 | |
|  7) Resample again down to 16x16.
 | |
|  8) Reduce to 1bpp.
 | |
|  9) The fingerprint is this raw image data.
 | |
| 
 | |
| =item To compare two images for similarity:
 | |
| 
 | |
|  1) Take fingerprint pairs and xor them.
 | |
|  2) Compute the percentage of 1 bits in the result.
 | |
|  3) If percentage exceeds threshold, declare files to be similar.
 | |
| 
 | |
| =back
 | |
| 
 | |
| 
 | |
| =head1 RETURN VALUE
 | |
| 
 | |
| =over 4
 | |
| 
 | |
| =item B<0>
 | |
| 
 | |
| Success.
 | |
| 
 | |
| =item B<1>
 | |
| 
 | |
| Usage information was requested (B<--help> or B<--man>), or there
 | |
| were warnings.
 | |
| 
 | |
| =item B<2>
 | |
| 
 | |
| Invalid options or arguments were provided.
 | |
| 
 | |
| =item B<3>
 | |
| 
 | |
| Runtime error.
 | |
| 
 | |
| =back
 | |
| 
 | |
| Any other return values indicate an internal error of some sort.
 | |
| 
 | |
| =head1 DIAGNOSTICS
 | |
| 
 | |
| To be written.
 | |
| 
 | |
| =head1 EXAMPLES
 | |
| 
 | |
| =over 4
 | |
| 
 | |
| =item C<<<< findimagedupes -R -- . >>>>
 | |
| 
 | |
| Look for and compare images in all subdirectories of the current directory.
 | |
| 
 | |
| =item C<<<< find . -type f -print0 | findimagedupes -0 -- - >>>>
 | |
| 
 | |
| Same as above.
 | |
| 
 | |
| =item C<<<< findimagedupes -i 'echo "# sort: manual"' -i 'VIEW(){ for f in "$@"; do echo \"file://$f\"; done; }' -- *.jpg > dupes.gqv >>>>
 | |
| 
 | |
| Use script hooks to produce collection-style output
 | |
| suitable for use with gthumb(1).
 | |
| 
 | |
| =back
 | |
| 
 | |
| =head1 FILES
 | |
| 
 | |
| To be written.
 | |
| 
 | |
| =head1 BUGS
 | |
| 
 | |
| There is a memory leak somewhere.
 | |
| 
 | |
| Killing the program may corrupt the fingerprint database(s).
 | |
| 
 | |
| The program does not lock the fingerprint database although concurrent
 | |
| write access to it is unsafe.
 | |
| 
 | |
| GraphicsMagick does not expose its auto-orient functionality to Perl.
 | |
| 
 | |
| Changing version of GraphicsMagick invalidates fingerprint databases.
 | |
| 
 | |
| 
 | |
| =head1 NOTES
 | |
| 
 | |
| Directory recursion is deliberately not implemented:
 | |
| Composing a file-list and using it with C<-> is a more flexible approach.
 | |
| 
 | |
| Repetitions are culled before comparisons take place, so a commandline
 | |
| like C<findimagedupes a.jpg a.jpg> will not produce a match.
 | |
| 
 | |
| The program needs a lot of memory. Probably not an issue, unless your
 | |
| machine has less than 128MB of free RAM and you try to compare more than
 | |
| a hundred-thousand files at once (and the program will run quite slowly
 | |
| with that many files anyway---about eight hours initially to generate
 | |
| fingerprints and another ten minutes to do the actual comparing).
 | |
| 
 | |
| Fingerprinting images is a bottleneck but unfortunately the program was
 | |
| not written with parallel processing in mind. For a workaround, see:
 | |
| https://github.com/jhnc/findimagedupes/issues/9
 | |
| 
 | |
| =head1 SEE ALSO
 | |
| 
 | |
| find(1), md5sum(1)
 | |
| 
 | |
| B<gqview> - GTK based multiformat image viewer
 | |
| 
 | |
| B<gthumb> - an image viewer and browser for GNOME
 | |
| 
 | |
| =head1 AUTHOR
 | |
| 
 | |
| Jonathan H N Chin <code@jhnc.org>
 | |
| 
 | |
| =head1 COPYRIGHT AND LICENSE
 | |
| 
 | |
|  Copyright © 2006-2022 by Jonathan H N Chin <code@jhnc.org>.
 | |
| 
 | |
|  This program is free software; you may redistribute it and/or modify
 | |
|  it under the terms of the GNU General Public License as published by
 | |
|  the Free Software Foundation; either version 3 of the License, or
 | |
|  (at your option) any later version.
 | |
| 
 | |
|  This program is distributed in the hope that it will be useful,
 | |
|  but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
|  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | |
|  GNU General Public License for more details.
 | |
| 
 | |
|  You should have received a copy of the GNU General Public License
 | |
|  along with this program; if not, write to the Free Software
 | |
|  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 | |
| 
 | |
| =head1 HISTORY
 | |
| 
 | |
| This code has been written from scratch. However it owes its existence
 | |
| to B<findimagedupes> by Rob Kudla and uses the same duplicate-detection
 | |
| algorithm.
 | |
| 
 | |
| =cut
 | |
| 
 | |
| __C__
 | |
| 
 | |
| /* efficient bit-comparison */
 | |
| 
 | |
| #include <stdint.h>
 | |
| #include <string.h>
 | |
| 
 | |
| #define LOOKUP_SIZE 65536
 | |
| #define FP_CHUNKS 16
 | |
| 
 | |
| typedef uint16_t FP[FP_CHUNKS];
 | |
| 
 | |
| void diffbits (SV* oldfiles, SV* newfiles, unsigned int threshold, unsigned limit) {
 | |
| 	FP *the_data, *a, *b;
 | |
| 	unsigned int lookup[LOOKUP_SIZE];
 | |
| 	unsigned int i, j, k, m, bits, old, new;
 | |
| 	HV *oldhash;
 | |
| 	HE *oldhash_entry;
 | |
| 	HV *newhash;
 | |
| 	HE *newhash_entry;
 | |
| 	unsigned int numkeys = 0;
 | |
| 	SV *sv_val;
 | |
| 	Inline_Stack_Vars;
 | |
| 
 | |
| 	if ((threshold<0) || (threshold>256)) {
 | |
| 		croak("ridiculous threshold specified");
 | |
| 	}
 | |
| 
 | |
| 	/* pack fingerprints into C array */
 | |
| 	/* partly lifted from Inline::C-Cookbook */
 | |
| 
 | |
| 	if (! SvROK(newfiles)) {
 | |
| 		croak("newfiles is not a reference");
 | |
| 	}
 | |
| 	newhash = (HV *)SvRV(newfiles);
 | |
| 	new = hv_iterinit(newhash);
 | |
| 
 | |
| 	if (! SvROK(oldfiles)) {
 | |
| 		croak("oldfiles is not a reference");
 | |
| 	}
 | |
| 	oldhash = (HV *)SvRV(oldfiles);
 | |
| 	old = hv_iterinit(oldhash);
 | |
| 
 | |
| 	numkeys = new+old;
 | |
| 	if (numkeys<2) {
 | |
| 		/* minor optimization: return without doing anything */
 | |
| 		/* malloc(0) could be bad... */
 | |
| 		Inline_Stack_Void;
 | |
| 	}
 | |
| 	the_data = (FP *)malloc(numkeys*sizeof(FP));
 | |
| 	if (!the_data) {
 | |
| 		croak("malloc failed");
 | |
| 	}
 | |
| 
 | |
| 	for (i = 0; i<new; i++) {
 | |
| 		newhash_entry = hv_iternext(newhash);
 | |
| 		sv_val = hv_iterval(newhash, newhash_entry);
 | |
| 		memcpy(the_data+i, SvPV(sv_val, PL_na), sizeof(FP));
 | |
| 	}
 | |
| 	for (i = new; i<numkeys; i++) {
 | |
| 		oldhash_entry = hv_iternext(oldhash);
 | |
| 		sv_val = hv_iterval(oldhash, oldhash_entry);
 | |
| 		memcpy(the_data+i, SvPV(sv_val, PL_na), sizeof(FP));
 | |
| 	}
 | |
| 
 | |
| 	/* initialise lookup table */
 | |
| 	/* cf. https://graphics.stanford.edu/~seander/bithacks.html */
 | |
| 	for (i=0; i<LOOKUP_SIZE; i++) {
 | |
| 		lookup[i] = lookup[i/2] + (i&1);
 | |
| 	}
 | |
| 
 | |
| 	/* look for matches */
 | |
| 	Inline_Stack_Reset;
 | |
| 	for (a=the_data, i=0, m=(limit>0 ? new : numkeys-1); i<m; a++, i++) {
 | |
| 		for (b=a+1, j=i+1; j<numkeys; b++, j++) {
 | |
| 			for (bits=0, k=0; k<FP_CHUNKS; k++) {
 | |
| 				bits += lookup[(*a)[k]^(*b)[k]];
 | |
| 				if (bits > threshold) goto abortmatch;
 | |
| 			}
 | |
| 			/* if (bits <= threshold) */ {
 | |
| 				Inline_Stack_Push(sv_2mortal(newSViv(i)));
 | |
| 				Inline_Stack_Push(sv_2mortal(newSViv(j)));
 | |
| 				Inline_Stack_Push(sv_2mortal(newSViv(bits)));
 | |
| 			}
 | |
| abortmatch:;
 | |
| 		}
 | |
| 	}
 | |
| 	Inline_Stack_Done;
 | |
| 
 | |
| 	/* clean up */
 | |
| 	free(the_data);
 | |
| }
 | |
| 
 | 
