1123 lines
27 KiB
Perl
Executable File
1123 lines
27 KiB
Perl
Executable File
#!/usr/bin/perl -w
|
|
my $Id = '';
|
|
#
|
|
# findimagedupes - Finds visually similar or duplicate images
|
|
#
|
|
# Copyright © 2006-2022 by Jonathan H N Chin <code@jhnc.org>.
|
|
#
|
|
# This program is free software; you may redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation; either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program; if not, write to the Free Software
|
|
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
#
|
|
|
|
use strict;
|
|
|
|
require 5.006_001;
|
|
|
|
use Cwd qw(realpath);
|
|
use DB_File;
|
|
use Digest::MD5 qw(md5_hex);
|
|
use Getopt::Long qw(:config no_ignore_case require_order);
|
|
use File::MimeInfo::Magic;
|
|
use File::Temp qw(tempdir tempfile);
|
|
use Graphics::Magick;
|
|
use MIME::Base64;
|
|
use Pod::Usage;
|
|
|
|
use Inline
|
|
C => 'DATA',
|
|
NAME => 'findimagedupes',
|
|
DIRECTORY => '/tmp';
|
|
|
|
# ----------------------------------------------------------------------
|
|
#
|
|
# option parsing
|
|
#
|
|
|
|
# Command-line state.  These are package globals so that every sub in the
# file can read them.  Declared with "our" because the "vars" pragma is
# obsolete; the script already requires perl 5.6, which provides "our".
our (
    $null,
    $add,
    $collection,
    @debug, %debug,
    @fpdb,
    $merge,
    $mergeFile,
    $nocompare,
    $program,
    $quiet,
    $rescan,
    $recurse,
    $script,
    @scriptCode,
    $scriptFile,
    $threshold,
    @verbosity,
    $prune,
);

# Defaults for options that must be defined even when not given.
$add       = 0;
$quiet     = 0;
$threshold = '90%';

my %opt;

GetOptions(
    '0|null'                 => \$null,
    'a|add'                  => \$add,
    # Force a ".gqv" suffix on the collection name, without doubling it.
    'c|collection=s'         => sub { ($collection = $_[1]) =~ s/([.]gqv)?$/.gqv/ },
    'd|debug=s'              => \@debug,
    'f|fingerprints|fp|db=s' => \@fpdb,
    'h|?|help'               => sub { pod2usage(-verbose => 1) },
    'man'                    => sub { pod2usage(-verbose => 2) },
    # --merge records both the flag and the output database name.
    'M|merge=s'              => sub { $merge = 1; $mergeFile = $_[1]; },
    'n|no-compare'           => \$nocompare,
    'P|prune'                => \$prune,
    'p|program=s'            => \$program,
    # Each repetition of -q raises the quietness level by one.
    'q|quiet+'               => \$quiet,
    'R|recurse'              => \$recurse,
    'r|rescan'               => \$rescan,
    's|script=s'             => \$script,
    't|threshold=s'          => \$threshold,
    'v|verbosity=s'          => \@verbosity,
    'version'                => sub { print "findimagedupes $Id\n"; exit(0); },

    'i|include=s'            => \@scriptCode,
    'I|include-file=s'       => \$scriptFile,
) or pod2usage(-verbose => 0);
|
|
|
|
# ----------------------------------------------------------------------
|
|
|
|
# Diagnostics collected during startup checks.
my @errors   = ();
my @warnings = ();

# Exit status implied by the collected diagnostics:
# 2 if any error was recorded, else 1 if any warning was, else 0.
sub exitvalue {
    my $status = 0;
    $status = 1 if @warnings;
    $status = 2 if @errors;
    return($status);
}

# Record an error; the arguments are concatenated into one message.
sub mkerr {
    my $message = join("", @_);
    push(@errors, $message);
}

# Record a warning; the arguments are concatenated into one message.
sub mkwarn {
    my $message = join("", @_);
    push(@warnings, $message);
}
|
|
|
|
# Quietness-aware output helpers.  $quiet is the number of times -q was
# given: 1 or more silences warnings, 2 or more silences errors too.

# Print the arguments unless any quietness was requested.
sub nqprint {
    if (!$quiet) {
        print(@_);
    }
}

# Emit a "Warning: " message on stderr unless any quietness was requested.
sub nqwarn {
    if (!$quiet) {
        warn("Warning: ", @_);
    }
}

# Warn (subject to quietness), then die with an empty message list.
sub nqdie {
    nqwarn(@_);
    die;
}

# Emit an "Error: " message unless -q was given at least twice, then die.
sub nqdie2 {
    if ($quiet < 2) {
        warn("Error: ", @_);
    }
    die;
}

# Emit an "Error: " message unless -q was given at least twice, then
# terminate the program with status 3.
sub nqexit {
    if ($quiet < 2) {
        warn("Error: ", @_);
    }
    exit(3);
}
|
|
|
|
# True only while fingerprint() is running its image-processing eval.
my $inFP = 0;

# Turn a segfault into a Perl exception.  Inside fingerprint() it carries
# an explicit message; elsewhere die is called with an empty list.
$SIG{SEGV} = sub {
    die("caught segfault") if $inFP;
    die();
};
|
|
|
|
# Copy the --include-file script to the currently selected output handle,
# wrapped in BEGIN/END marker comments.  Every line is written with exactly
# one trailing newline, so a file lacking a final newline is still
# terminated.  Dies if the file cannot be opened.
sub printScriptFile {
    print("# BEGIN USER FILE INCLUDE\n");

    # Three-arg open: the file name is never parsed for mode characters
    # and surrounding whitespace in it is preserved.
    open(my $userscript, '<', $scriptFile)
        or die("open($scriptFile): $!\n");
    {
        # --null sets $/ to "\0" globally; the include file is always
        # line-oriented, so read it with the normal separator.
        local $/ = "\n";
        while (defined(my $line = <$userscript>)) {
            chomp($line);
            print("$line\n");
        }
    }
    close($userscript);

    print("# END USER FILE INCLUDE\n\n");
}
|
|
|
|
# ----------------------------------------------------------------------
|
|
#
|
|
# setup
|
|
#
|
|
|
|
# Flags set later from --verbosity.
my ($verb_fp, $verb_md5);

# Number of "-" arguments; non-zero means a file list is read from stdin.
my $read_input = scalar(grep { m/^-$/ } @ARGV);

# XXX: can we tie these to save memory without breaking hv_iterinit() ?
# %fpcache holds fingerprints loaded from databases; %filelist holds
# fingerprints of files processed in this run.
my (%fpcache, %filelist);

# Single image object, reused for every fingerprint.
my $image = Graphics::Magick->new;

# Turn the list of --debug options into a lookup table.
@debug{@debug} = (1) x @debug;

# Make every --include snippet end in exactly one newline.
for my $snippet (@scriptCode) {
    chomp($snippet);
    $snippet .= "\n";
}

# Bracket the user snippets with marker comments.
if (@scriptCode) {
    @scriptCode = (
        "# BEGIN USER CODE INCLUDE\n",
        @scriptCode,
        "# END USER CODE INCLUDE\n\n",
    );
}
|
|
|
|
my $scriptHeader = <<'EOD';
|
|
#!/bin/sh
|
|
|
|
VIEW(){
|
|
echo "$@"
|
|
}
|
|
END(){
|
|
:;
|
|
}
|
|
|
|
EOD
|
|
|
|
my $collectionHeader = <<'EOD';
|
|
#GQview collection
|
|
#created with findimagedupes
|
|
# sort: manual
|
|
EOD
|
|
|
|
my $collectionFooter = <<'EOD';
|
|
#end
|
|
EOD
|
|
|
|
# ----------------------------------------------------------------------
|
|
#
|
|
# sanity checks
|
|
#
|
|
|
|
# +----------+
|
|
# | warnings |
|
|
# +----------+
|
|
|
|
# Option combinations that are accepted but have no effect are recorded as
# warnings; they are reported once below.

mkwarn('extra occurrences of "-" will be ignored')
    if $read_input > 1;

mkwarn('--null has no effect in this context')
    if $null and !$read_input;

mkwarn('--prune has no effect in this context')
    if $prune and !@fpdb;

if ($nocompare) {
    if ($program) {
        mkwarn('--program ignored because --no-compare given');
    }
    if ($script) {
        mkwarn('--script ignored because --no-compare given');
    }
}

if ($merge and $mergeFile eq '/dev/null') {
    # use DB_File memory database
    $mergeFile = undef;
    mkwarn('merge database is temporary');
}

mkwarn('--prune is pointless when --merge database is temporary')
    if $prune and $merge and !defined($mergeFile);

# Report all warnings in a single call unless -q was given.
warn(map({ "Warning: $_\n" } @warnings))
    if @warnings and !$quiet;
|
|
|
|
# +--------+
|
|
# | errors |
|
|
# +--------+
|
|
|
|
# Fatal option problems.  Each is recorded with mkerr(); all of them are
# reported together at the end of this section, before any file is touched.

# Output files are never overwritten.
if ($collection and -e($collection)) {
    mkerr("Output file for --collection exists: $collection");
}

# A lone database without --merge may be created on demand; in every other
# case each database must already exist.
for (@fpdb) {
    mkerr("File for --fingerprints does not exist: $_")
        unless (-f($_) or (@fpdb==1 and !$merge));
}

if (@fpdb>1 and !$merge) {
    mkerr("Require --merge if using multiple fingerprint databases");
}

if ($merge and defined($mergeFile) and -e($mergeFile)) {
    mkerr("Output file for --merge exists: $mergeFile");
}

if ($program) {
    if (! -e($program)) {
        mkerr("File for --program does not exist: $program");
    }
    elsif (! -x($program)) {
        mkerr("File for --program not executable: $program");
    }
}

# "-" selects standard output for the script (see --script in the manual),
# so a file that happens to be called "-" must not be treated as a clash.
if ($script and $script ne '-' and -e($script)) {
    mkerr("Output file for --script exists: $script");
}

if ($scriptFile) {
    if (! -f($scriptFile)) {
        mkerr("File for --include-file does not exist: $scriptFile");
    }
    elsif (! -r($scriptFile)) {
        mkerr("File for --include-file is not readable: $scriptFile");
    }
}

# Normalise --threshold to a number of bits that may differ.  Accepted
# forms are a number followed by "%", "b" or nothing (nothing means "%").
if (my ($thres_val, $thres_unit) = $threshold =~ m/^\s*(\d+(?:[.]\d+)?)\s*([%b]?)\s*$/) {
    if ($thres_unit eq '' or $thres_unit eq '%') {
        # percentage to bits
        $threshold = int(2.56 * (100 - $thres_val));
    }
    elsif ($thres_unit =~ m/^b/i) {
        # already in bits
        $threshold = int($thres_val);
    }
}
else {
    # error: an out-of-range value triggers the message below
    $threshold = -1;
}
if ($threshold>256 or $threshold<0) {
    mkerr("--threshold takes values between 0.0% .. 100.0% or 0b .. 256b");
}

# --verbosity takes comma-separated lists and may be repeated.
for (split(",", join(",", @verbosity))) {
    /^(fingerprint|fp)$/ && do { $verb_fp = 1; next; };
    /^md5$/ && do { $verb_md5 = 1; next; };
    mkerr("unknown option to --verbosity: $_");
}

if (@errors) {
    # With -q given twice or more, exit without printing anything.
    exit(exitvalue()) unless $quiet<2;
    pod2usage(
        -verbose => 0,
        -exitval => exitvalue(),
        -msg => join("\n", map {"Error: $_"} @errors),
    );
}
|
|
|
|
# +-------------------------------------------------+
|
|
# | last chance to abort without altering any files |
|
|
# +-------------------------------------------------+
|
|
|
|
# No files, no databases and no merge requested: print usage and exit.
if (!@ARGV and !@fpdb and !$merge) {
    # With -q given twice or more, exit silently.
    exit(exitvalue()) if $quiet >= 2;

    if (!@warnings and !$quiet) {
        warn("Nothing to do!\n");
    }

    pod2usage(-verbose => 0, -exitval => exitvalue());
}
|
|
|
|
# ----------------------------------------------------------------------
|
|
#
|
|
# load fingerprint cache
|
|
#
|
|
|
|
# Files whose databases disagree about the fingerprint.  They are dropped
# from the cache and queued here for regeneration.
my @regen = ();

{
    # Remembers every file already found in conflict.  Without this, a
    # third database mentioning the same file would put a stale
    # fingerprint back into the cache, because the conflicting entry has
    # been deleted by then and the file looks new.
    my %conflicted;

    for my $db (@fpdb) {
        my %data;
        tie(%data, 'DB_File', $db) or nqexit("tie($db): $!\n");

        while (my ($file, $fp) = each %data) {
            # --prune: ignore entries whose file no longer exists
            next if ($prune && !-f($file));

            # already known to be unreliable; queued exactly once
            next if $conflicted{$file};

            if (!exists $fpcache{$file}) {
                $fpcache{$file} = $fp;
            }
            elsif ($fpcache{$file} ne $fp) {
                # fingerprint mismatch, force regeneration
                push @regen, $file;
                delete $fpcache{$file};
                $conflicted{$file} = 1;
            }
        }

        untie(%data);
    }
}
|
|
|
|
# ----------------------------------------------------------------------
|
|
#
|
|
# build file list
|
|
#
|
|
|
|
# Database that receives fingerprints produced in this run.  It is only
# written to when $rw is set.
my %mergelist;
my $rw = 0;

if ($merge) {
    # $mergeFile is undef for the temporary in-memory database, so build a
    # printable name for the error message.  (The message used to
    # interpolate the $merge flag, which is always 1.)
    my $mergeName = defined($mergeFile) ? $mergeFile : '(temporary database)';
    tie(%mergelist, 'DB_File', $mergeFile) or nqexit("tie($mergeName): $!\n");
    %mergelist = %fpcache;
    $rw = 1;
}
elsif (@fpdb==1) {
    # A single database without --merge is updated in place.
    tie(%mergelist, 'DB_File', $fpdb[0]) or nqexit("tie($fpdb[0]): $!\n");
    # Rewrite it from the cache, which was already pruned while loading.
    %mergelist = %fpcache if $prune;
    $rw = 1;
}

# Unbuffered stdout; with --null, input records are NUL-delimited.
$| = 1;
$/ = "\0" if $null;

# Process the named files; if none were named, regenerate the fingerprints
# that conflicted between databases.
for (@ARGV ? @ARGV : @regen) {
    classify($_);
}

untie(%mergelist);

finddupes() unless $nocompare;

exit 0;
|
|
|
|
# ----------------------------------------------------------------------
|
|
|
|
# Fingerprint one regular file and record the result.
#
#   $path - file name as given by the user or found in a directory
#
# The fingerprint is generated when --rescan is set or the file is not in
# the cache; with --add a cached fingerprint is reused.  A fingerprint
# obtained either way is moved from %fpcache to %filelist and, when a
# database is open for writing, stored there as well.  Output requested
# with --verbosity is printed to the selected handle.  Dies (via nqdie2)
# if the file cannot be opened for the md5 digest.
sub process_file {
    my ($path) = @_;

    # normalize to absolute canonical path
    my $file = realpath($path);
    if (!$file) {
        nqwarn("skipping bogus file: $path\n");
        return;
    }

    my $fp;
    if ($rescan or !exists $fpcache{$file}) {
        $fp = fingerprint($file);
    }
    elsif ($add) {
        # reaching here means the file is cached and --rescan is off
        $fp = $fpcache{$file};
    }

    if ($fp) {
        $filelist{$file} = $fp;
        delete $fpcache{$file};
        $mergelist{$file} = $filelist{$file} if $rw;
    }

    if ($verb_fp) {
        my $known = ( $filelist{$file} || $fpcache{$file} );
        if ($known) {
            print(encode_base64($known, ""), " $file\n");
        }
        else {
            nqwarn("can't get fingerprint: $file\n");
        }
    }

    if ($verb_md5) {
        # Three-arg open: with the two-arg form a name ending in "|" would
        # be run as a command and trailing whitespace would be stripped.
        open(my $fh, '<', $file) or nqdie2("open($file): $!\n");
        binmode($fh);
        my $digest = Digest::MD5->new->addfile($fh)->hexdigest;
        if ($digest) {
            print("$digest $file\n");
        }
        else {
            nqwarn("can't get md5sum: $file\n");
        }
        close($fh);
    }
}
|
|
|
|
# XXX: This function is complicated by two things:
#  - Historically we didn't recurse and it would be nice to
#    retain nonrecursion as an option.
#  - We need to process "." and ".." when they are given
#    explicitly on the command line.
# Perhaps we could cleanup the code by rewriting using
# something like the "-" code?
#
# Dispatch one command-line argument (or one name read from stdin):
#   "-"        read names from stdin, first occurrence only
#   directory  process the regular files in it; descend into real
#              sub-directories only with --recurse
#   file       process it (symlinks to files are accepted)
#   other      warn and skip
sub classify {
    my ($file) = @_;

    if ($file eq "-") {
        # silently ignore any extra occurrences of "-"
        # (we already reported them at startup)
        return unless $read_input;

        $read_input = 0;
        # A lexical line variable keeps the caller's $_ intact.
        while (defined(my $name = <STDIN>)) {
            chomp($name);
            classify($name);
        }
    }
    elsif (-d($file) and !-l($file)) {
        # don't follow directory symlinks, to prevent looping
        my $dh;
        if (!opendir($dh, $file)) {
            # opendir reports its failure in $!, not $@
            nqwarn("can't process directory $file: $!\n");
            return;
        }

        # Read everything and close the handle before recursing, so no
        # directory handle stays open across the recursion.
        my @entries = readdir($dh);
        closedir($dh);

        for my $entry (@entries) {
            my $path = "$file/$entry";
            if (-d($path) and !-l($path)) {
                # skip . and ..
                next if $entry eq '.' or $entry eq '..';
                classify($path) if $recurse;
            }
            elsif (-f($path)) {
                process_file($path);
            }
        }
    }
    elsif (-f($file)) {
        # symlinks are okay for normal files
        process_file($file);
    }
    else {
        # skip anything else (devices, etc)
        nqwarn("skipping file: $file\n");
    }
}
|
|
|
|
# ----------------------------------------------------------------------
|
|
|
|
# Check the status returned by an image-library call.
#
#   $err - status value; a false value means success
#
# Statuses mentioning "Warning 315:" or "Warning 330:" are tolerated.
# Any other true status is raised as an exception.
sub try {
    my ($err) = @_;

    return unless $err;
    return if $err =~ /Warning (315|330):/;

    die("imagemagick problem: $err\n");
}
|
|
|
|
# Compute the fingerprint of one image file.
#
#   $file - path of the file to fingerprint
#
# Returns the fingerprint blob on success.  On failure the reason is
# reported with nqwarn() and undef is returned.  Dies (via nqdie2) if
# stdout or stderr cannot be saved or restored.
sub fingerprint {
    my ($file) = @_;
    my $blob;

    # imagemagick doesn't always catch output from the programs
    # it spawns, so we have to clean up for it...
    open(my $saved_out, ">&", \*STDOUT) or nqdie2("open(dup STDOUT): $!");
    open(my $saved_err, ">&", \*STDERR) or nqdie2("open(dup STDERR): $!");
    open(STDOUT, '>', '/dev/null');
    open(STDERR, '>', '/dev/null');

    # Tells the SEGV handler that a crash now belongs to fingerprinting.
    $inFP = 1;
    my $ok = eval {
        if ((mimetype($file)||'') =~ /^(audio|video)/) {
            die("not fingerprinting A/V file: $file\n");
        }

        if (!$image->Ping($file)) {
            die("not fingerprinting unknown-type file: $file\n");
        }

        try $image->Read($file);

        if ($#$image<0) {
            die("fingerprint: not enough image data for $file");
        }
        else {
            # keep only the first image read from the file
            $#$image = 0;
        }

        try $image->Sample("160x160!");
        try $image->Modulate(saturation=>-100);
        try $image->Blur(radius=>3,sigma=>99);
        try $image->Normalize();
        try $image->Equalize();
        try $image->Sample("16x16");
        try $image->Threshold();
        try $image->Set(magick=>'mono');

        ($blob) = $image->ImageToBlob();
        if (!defined($blob)) {
            die("This can't happen! undefined blob for: $file\n");
        }

        # Explicit success value, so the outcome does not depend on what
        # the last statement of the block happens to evaluate to.
        1;
    };
    # Save the failure reason before any later call can overwrite $@.
    my $failure = $@;

    $inFP = 0;
    # Empty the shared image object ready for the next file.
    @$image = ();

    open(STDOUT, ">&", $saved_out) or nqdie2("open(restore STDOUT): $!");
    open(STDERR, ">&", $saved_err) or nqdie2("open(restore STDERR): $!");
    close($saved_out);
    close($saved_err);

    if ($ok) {
        return $blob;
    }
    else {
        nqwarn($failure);
        return undef;
    }
}
|
|
|
|
# Compare all fingerprints, group matching files into sets, and write the
# sets out as a collection file (--collection) and/or as shell commands.
# The commands go to the --script file, to stdout when --script is "-",
# or straight into /bin/sh when --script is not given.
# Dies (via nqdie2) if an output cannot be opened.
sub finddupes {
    # Flat list of triples: index, index, number of differing bits.
    my @matches = diffbits(\%fpcache, \%filelist, $threshold, $add);

    # %set: index => file name (filled in below for matched indexes only)
    # %ptr: index => index of the representative of its group
    # %val: representative index => list of the group's member indexes
    my (%set, %ptr, %val);

    while (@matches) {
        my $x    = shift(@matches);
        my $y    = shift(@matches);
        my $bits = shift(@matches);    # difference count, not used here
        $set{$x} = 1;
        $set{$y} = 1;

        # cf. debian bug #87013

        if (!defined($ptr{$x}) and !defined($ptr{$y})) {
            # neither seen before: start a new group led by $x
            $ptr{$x} = $x;
            push @{$val{$x}}, $x, $y;
            $ptr{$y} = $x;
            $#{$val{$y}} = 0;
        }
        elsif (defined($ptr{$x}) and !defined($ptr{$y})) {
            # $y joins the group of $x
            push @{$val{$ptr{$x}}}, $y;
            $ptr{$y} = $ptr{$x};
            $#{$val{$y}} = 0;
        }
        elsif (!defined($ptr{$x}) and defined($ptr{$y})) {
            # $x joins the group of $y
            push @{$val{$ptr{$y}}}, $x;
            $ptr{$x} = $ptr{$y};
            $#{$val{$x}} = 0;
        }
        elsif ($ptr{$x} ne $ptr{$y}) {
            # two different groups: fold the group of $y into that of $x
            my $valptry = $val{$ptr{$y}};
            push @{$val{$ptr{$x}}}, @{$valptry};
            for my $ykey (@{$valptry}) {
                $ptr{$ykey} = $ptr{$x};
            }
            $#$valptry = 0;
            # else both are in the same group already
        }
    }

    # Map indexes back to file names.  diffbits() numbers the files of
    # %filelist first and then those of %fpcache, in hash iteration order.
    my $cnt = 0;
    for my $k (keys %filelist, keys %fpcache) {
        $set{$cnt} = $k if defined $set{$cnt};
        $cnt++;
    }

    # Remember the selected handle so it can be restored on the way out.
    my $previous = select();

    # FIXME: What is the proper format of collection files?
    # It seems to be poorly defined, and gthumb parses them
    # differently from gqview itself. In particular, gthumb
    # seems to misparse comment lines, and the sort pragma
    # it defines seems to be fragile wrt whitespace.
    if ($collection) {
        open(my $coll, '>', $collection)
            or nqdie2("open(> $collection): $!\n");
        select($coll);

        print $collectionHeader;
        for my $k (keys %ptr) {
            # one pass per group: only representatives point at themselves
            next unless $ptr{$k} eq $k;
            for ( @{$val{$ptr{$k}}} ) {
                my $name = $set{$_};

                if ( $name =~ /[\n"]/s ) {
                    nqwarn("excluded from $collection: $name\n");
                }
                else {
                    print qq{"$name"\n};
                }
            }
        }
        print $collectionFooter;
        close($coll);
    }

    my $commands;
    if ($script) {
        if ($script eq '-') {
            # "-" means standard output (see --script in the manual)
            open($commands, '>&', \*STDOUT)
                or nqdie2("open(> $script): $!\n");
        }
        else {
            open($commands, '>', $script)
                or nqdie2("open(> $script): $!\n");
        }
    }
    else {
        open($commands, '|-', '/bin/sh')
            or nqdie2("open(| /bin/sh): $!\n");
    }
    # printScriptFile() writes to the selected handle.
    select($commands);

    if ($script or $program or !$collection) {
        $program = 'VIEW' unless $program;
        print $scriptHeader;
        printScriptFile if $scriptFile;
        print @scriptCode if @scriptCode;
        for my $k (keys %ptr) {
            next unless $ptr{$k} eq $k;
            # one command per group, one backslash-escaped name per line
            print join(" \\\n\t",
                $program,
                ( map { quotemeta $set{$_} } @{$val{$ptr{$k}}} ),
                ";\n"
            );
        }
        print "\nEND;\n\n";
    }
    close($commands);

    select($previous);
}
|
|
|
|
# ======================================================================
|
|
|
|
__DATA__
|
|
|
|
=encoding UTF-8
|
|
|
|
=head1 NAME
|
|
|
|
findimagedupes - Finds visually similar or duplicate images
|
|
|
|
=head1 SYNOPSIS
|
|
|
|
findimagedupes [option ...] [--] [ - | [file ...] ]
|
|
|
|
Options:
|
|
-f, --fingerprints=FILE -c, --collection=FILE
|
|
-M, --merge=FILE -p, --program=PROGRAM
|
|
-P, --prune -s, --script=FILE
|
|
-a, --add -i, --include=TEXT
|
|
-r, --rescan -I, --include-file=FILE
|
|
-n, --no-compare
|
|
-q, --quiet
|
|
-t, --threshold=AMOUNT -v, --verbosity=LIST
|
|
|
|
-0, --null -h, --help
|
|
-R, --recurse --man
|
|
--version
|
|
|
|
With no options, compares the specified files and does not use nor
|
|
update any fingerprint database.
|
|
|
|
Directories of images may be specified instead of individual files;
|
|
Sub-directories of these are not searched unless --recurse is used.
|
|
|
|
=head1 INSTALLATION
|
|
|
|
If you use linux, your distribution may include a prepackaged version.
|
|
For example, Debian and Ubuntu do.
|
|
|
|
Otherwise, at a minimum you'll need Perl with the modules listed at the
|
|
top of the findimagedupes script. Also the GraphicsMagick package.
|
|
|
|
You may need to change Inline's C<DIRECTORY> to point somewhere else.
|
|
Read the Inline module documentation for details.
|
|
|
|
=head1 OPTIONS
|
|
|
|
=over 8
|
|
|
|
=item B<-0>, B<--null>
|
|
|
|
If a file C<-> is given, a list of files is read from stdin.
|
|
|
|
Without B<-0>, the list is specified one file per line, such as
|
|
produced by find(1) with its C<-print> option.
|
|
|
|
With B<-0>, the list is expected to be null-delimited, such as
|
|
produced by find(1) with its C<-print0> option.
|
|
|
|
=item B<-a>, B<--add>
|
|
|
|
Only look for duplicates of files specified on the commandline.
|
|
|
|
Matches are also sought in any fingerprint databases specified.
|
|
|
|
=item B<-c>, B<--collection>=I<FILE>
|
|
|
|
Create GQView collection I<FILE>.gqv of duplicates.
|
|
|
|
The program attempts to produce well-formed collections.
|
|
In particular, it will print a warning and exclude any file
|
|
whose name contains newline or doublequote. (In this situation,
|
|
gqview(1) seems to create a .gqv collection file that it
|
|
silently fails to read back in properly.)
|
|
|
|
=item B<-d>, B<--debug>=I<OPTS>
|
|
|
|
Enable debugging output. Options I<OPTS> are subject to change.
|
|
See the program source for details.
|
|
|
|
=item B<-f>, B<--fingerprints>=I<FILE>
|
|
|
|
Use I<FILE> as fingerprint database.
|
|
|
|
May be abbreviated as B<--fp> or B<--db>.
|
|
|
|
This option may be given multiple times when B<--merge> is used.
|
|
(Note: I<FILE> could contain commas, so multiple databases may
|
|
not be specified as a single comma-delimited list.)
|
|
|
|
=item B<-h>, B<--help>
|
|
|
|
Print usage and option sections of this manual, then exit.
|
|
|
|
=item B<-i>, B<--include>=I<TEXT>
|
|
|
|
I<TEXT> is Bourne-shell code to customise B<--script>.
|
|
|
|
It is executed after any code included using B<--include-file>.
|
|
|
|
May be given multiple times. Code will be concatenated.
|
|
|
|
=item B<-I>, B<--include-file>=I<FILE>
|
|
|
|
I<FILE> is a file containing Bourne-shell code to customise
|
|
B<--script>.
|
|
|
|
It is executed before any code included using B<--include>.
|
|
|
|
=item B<--man>
|
|
|
|
Display the full documentation, using default pager, then exit.
|
|
|
|
=item B<-M>, B<--merge>=I<FILE>
|
|
|
|
Takes any databases specified with B<--fingerprints>
|
|
and merges them into a new database called I<FILE>.
|
|
Conflicting fingerprints for an image will cause one of two actions to occur:
|
|
|
|
=over 4
|
|
|
|
=item 1.
|
|
|
|
If the image does not exist, then the entry is elided.
|
|
|
|
=item 2.
|
|
|
|
If the image does exist, then the old information is ignored
|
|
and a new fingerprint is generated from scratch.
|
|
|
|
=back
|
|
|
|
By default, image existence is not checked unless there is a conflict.
|
|
To force removal of defunct data, use B<--prune> as well.
|
|
|
|
A list of image files is not required if this option is used.
|
|
However, if a list is provided, fingerprint data for the files
|
|
will be copied or (re)generated as appropriate.
|
|
|
|
When B<--merge> is used, the original fingerprint databases are not modified,
|
|
even if B<--prune> is used.
|
|
|
|
If multiple fingerprint databases are to be used but the merge output is
|
|
not required, specify: B<--merge>=I</dev/null>
|
|
|
|
See also: B<--rescan>
|
|
|
|
=item B<-n>, B<--no-compare>
|
|
|
|
Don't look for duplicates.
|
|
|
|
=item B<-p>, B<--program>=I<PROGRAM>
|
|
|
|
Launch I<PROGRAM> (in foreground) to view each set of dupes.
|
|
|
|
I<PROGRAM> must be the full path to an existing executable file.
|
|
For more flexibility, see the B<--include> and B<--include-file>
|
|
options.
|
|
|
|
See also: B<--script>
|
|
|
|
=item B<-P>, B<--prune>
|
|
|
|
Remove fingerprint data for images that do not exist any more.
|
|
Has no effect unless B<--fingerprints> or B<--merge> is also used.
|
|
|
|
Databases specified by B<--fingerprints> are only modified if
|
|
B<--merge> is not used.
|
|
|
|
=item B<-q>, B<--quiet>
|
|
|
|
This option may be given multiple times.
|
|
|
|
Usually, progress, warning and error messages are printed on stderr.
|
|
If this option is given, warnings are not displayed.
|
|
If it is given twice or more, errors are not displayed either.
|
|
|
|
Information requested with B<--verbosity> is still displayed.
|
|
|
|
=item B<-R>, B<--recurse>
|
|
|
|
Use B<--recurse> to search recursively for images inside
|
|
subdirectories. For historical reasons, the default is to not do so.
|
|
To avoid looping, symbolic links to directories are never followed.
|
|
|
|
=item B<-r>, B<--rescan>
|
|
|
|
(Re)generate all fingerprints, not just any that are unknown.
|
|
|
|
If used with B<--add>, only the fingerprints of files specified
|
|
on the commandline are (re)generated.
|
|
|
|
Implies B<--prune>.
|
|
|
|
=item B<-s>, B<--script>=I<FILE>
|
|
|
|
When used with B<--program>, I<PROGRAM> is not launched immediately.
|
|
Instead sh(1)-style commands are saved to I<FILE>.
|
|
This script may be edited (if desired) and then executed manually.
|
|
|
|
When used without B<--program>, two skeletal shell functions
|
|
are generated: C<VIEW> simply echo(1)s its arguments;
|
|
the empty function C<END> runs after files-processing is finished.
|
|
|
|
To display to terminal (or feed into a pipe), use C<-> as I<FILE>.
|
|
|
|
If B<--script> is not given, the script is still created in memory and
|
|
is executed immediately. So, with the default VIEW and END functions,
|
|
lines containing sets of duplicates are displayed. See: B<EXAMPLES>
|
|
|
|
See also: B<--include>, B<--include-file>
|
|
|
|
=item B<-t>, B<--threshold>=I<AMOUNT>
|
|
|
|
Use I<AMOUNT> as threshold of similarity.
|
|
Append C<%> to give a percentage or C<b> for bits.
|
|
For backwards compatibility, a number with no unit is treated as
|
|
a percentage. Percentage is the minimum required for a match;
|
|
bits is the maximum that may differ: bits=floor(2.56(100-percent))
|
|
|
|
A fractional part may be given but it is only accurate to 100/256
|
|
(0.390625) for percentage and it is meaningless for C<bits>.
|
|
Default is C<90%> (C<25b>) if not specified.
|
|
|
|
=item B<-v>, B<--verbosity>=I<LIST>
|
|
|
|
Enable display of informational messages to stdout,
|
|
where I<LIST> is a comma-delimited list of:
|
|
|
|
=over 8
|
|
|
|
=item B<md5>
|
|
|
|
Display the checksum for each file, as per md5sum(1).
|
|
|
|
=item B<fingerprint> | B<fp>
|
|
|
|
Display the base64-encoded fingerprint of each file.
|
|
|
|
=back
|
|
|
|
Alternatively, B<--verbosity> may be given multiple times, and accumulates.
|
|
Note that this may not be sensible. For example, to be useful,
|
|
B<md5> output probably should not be merged with B<fingerprint> data.
|
|
|
|
=item B<--version>
|
|
|
|
Display the program version, then exit.
|
|
|
|
=back
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
B<findimagedupes> compares a list of files for visual similarity.
|
|
|
|
=over 1
|
|
|
|
=item To calculate an image fingerprint:
|
|
|
|
1) Read image.
|
|
2) Resample to 160x160 to standardize size.
|
|
3) Grayscale by reducing saturation.
|
|
4) Blur a lot to get rid of noise.
|
|
5) Normalize to spread out intensity as much as possible.
|
|
6) Equalize to make image as contrasty as possible.
|
|
7) Resample again down to 16x16.
|
|
8) Reduce to 1bpp.
|
|
9) The fingerprint is this raw image data.
|
|
|
|
=item To compare two images for similarity:
|
|
|
|
1) Take fingerprint pairs and xor them.
|
|
2) Compute the percentage of 0 bits (matching bits) in the result.
3) If the percentage is at least the threshold, declare files to be similar.
|
|
|
|
=back
|
|
|
|
|
|
=head1 RETURN VALUE
|
|
|
|
=over 4
|
|
|
|
=item B<0>
|
|
|
|
Success.
|
|
|
|
=item B<1>
|
|
|
|
Usage information was requested (B<--help> or B<--man>), or there
|
|
were warnings.
|
|
|
|
=item B<2>
|
|
|
|
Invalid options or arguments were provided.
|
|
|
|
=item B<3>
|
|
|
|
Runtime error.
|
|
|
|
=back
|
|
|
|
Any other return values indicate an internal error of some sort.
|
|
|
|
=head1 DIAGNOSTICS
|
|
|
|
To be written.
|
|
|
|
=head1 EXAMPLES
|
|
|
|
=over 4
|
|
|
|
=item C<<<< findimagedupes -R -- . >>>>
|
|
|
|
Look for and compare images in all subdirectories of the current directory.
|
|
|
|
=item C<<<< find . -type f -print0 | findimagedupes -0 -- - >>>>
|
|
|
|
Same as above.
|
|
|
|
=item C<<<< findimagedupes -i 'echo "# sort: manual"' -i 'VIEW(){ for f in "$@"; do echo \"file://$f\"; done; }' -- *.jpg > dupes.gqv >>>>
|
|
|
|
Use script hooks to produce collection-style output
|
|
suitable for use with gthumb(1).
|
|
|
|
=back
|
|
|
|
=head1 FILES
|
|
|
|
To be written.
|
|
|
|
=head1 BUGS
|
|
|
|
There is a memory leak somewhere.
|
|
|
|
Killing the program may corrupt the fingerprint database(s).
|
|
|
|
The program does not lock the fingerprint database although concurrent
|
|
write access to it is unsafe.
|
|
|
|
GraphicsMagick does not expose its auto-orient functionality to Perl.
|
|
|
|
Changing version of GraphicsMagick invalidates fingerprint databases.
|
|
|
|
|
|
=head1 NOTES
|
|
|
|
Directory recursion is off by default (see B<--recurse>):
Composing a file-list and using it with C<-> is a more flexible approach.
|
|
|
|
Repetitions are culled before comparisons take place, so a commandline
|
|
like C<findimagedupes a.jpg a.jpg> will not produce a match.
|
|
|
|
The program needs a lot of memory. Probably not an issue, unless your
|
|
machine has less than 128MB of free RAM and you try to compare more than
|
|
a hundred-thousand files at once (and the program will run quite slowly
|
|
with that many files anyway---about eight hours initially to generate
|
|
fingerprints and another ten minutes to do the actual comparing).
|
|
|
|
Fingerprinting images is a bottleneck but unfortunately the program was
|
|
not written with parallel processing in mind. For a workaround, see:
|
|
https://github.com/jhnc/findimagedupes/issues/9
|
|
|
|
=head1 SEE ALSO
|
|
|
|
find(1), md5sum(1)
|
|
|
|
B<gqview> - GTK based multiformat image viewer
|
|
|
|
B<gthumb> - an image viewer and browser for GNOME
|
|
|
|
=head1 AUTHOR
|
|
|
|
Jonathan H N Chin <code@jhnc.org>
|
|
|
|
=head1 COPYRIGHT AND LICENSE
|
|
|
|
Copyright © 2006-2022 by Jonathan H N Chin <code@jhnc.org>.
|
|
|
|
This program is free software; you may redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation; either version 3 of the License, or
|
|
(at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program; if not, write to the Free Software
|
|
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
|
|
=head1 HISTORY
|
|
|
|
This code has been written from scratch. However it owes its existence
|
|
to B<findimagedupes> by Rob Kudla and uses the same duplicate-detection
|
|
algorithm.
|
|
|
|
=cut
|
|
|
|
__C__
|
|
|
|
/* efficient bit-comparison */
|
|
|
|
#include <stdint.h>
|
|
#include <string.h>
|
|
|
|
#define LOOKUP_SIZE 65536
|
|
#define FP_CHUNKS 16
|
|
|
|
typedef uint16_t FP[FP_CHUNKS];
|
|
|
|
/*
 * Compare fingerprints pairwise and return every pair that differs in at
 * most `threshold` bits.
 *
 *   oldfiles   reference to a hash whose values are fingerprints
 *   newfiles   reference to a hash whose values are fingerprints
 *   threshold  maximum number of differing bits for a match (0..256)
 *   limit      non-zero: only pairs whose first member comes from
 *              newfiles are considered
 *
 * Results are pushed on the Perl stack as a flat list of triples
 * (i, j, bits).  Indexes count the values of newfiles first and then those
 * of oldfiles, each in hash iteration order.  Nothing is returned when
 * fewer than two fingerprints are supplied.
 */
void diffbits (SV* oldfiles, SV* newfiles, unsigned int threshold, unsigned limit) {
    FP *the_data, *a, *b;
    unsigned int lookup[LOOKUP_SIZE];
    unsigned int i, j, k, m, bits, old_count, new_count;
    HV *oldhash;
    HE *oldhash_entry;
    HV *newhash;
    HE *newhash_entry;
    unsigned int numkeys = 0;
    SV *sv_val;
    STRLEN len;
    char *raw;
    Inline_Stack_Vars;

    /* threshold is unsigned, so only the upper bound needs checking */
    if (threshold>256) {
        croak("ridiculous threshold specified");
    }

    /* pack fingerprints into C array */
    /* partly lifted from Inline::C-Cookbook */

    if (! SvROK(newfiles)) {
        croak("newfiles is not a reference");
    }
    newhash = (HV *)SvRV(newfiles);
    new_count = hv_iterinit(newhash);

    if (! SvROK(oldfiles)) {
        croak("oldfiles is not a reference");
    }
    oldhash = (HV *)SvRV(oldfiles);
    old_count = hv_iterinit(oldhash);

    numkeys = new_count+old_count;
    if (numkeys<2) {
        /* minor optimization: return without doing anything */
        /* malloc(0) could be bad... */
        Inline_Stack_Void;
    }
    the_data = (FP *)malloc(numkeys*sizeof(FP));
    if (!the_data) {
        croak("malloc failed");
    }

    /*
     * Copy at most sizeof(FP) bytes of each value and zero-fill the rest,
     * so a short value is never read past its end.
     */
    for (i = 0; i<new_count; i++) {
        newhash_entry = hv_iternext(newhash);
        sv_val = hv_iterval(newhash, newhash_entry);
        raw = SvPV(sv_val, len);
        if (len > sizeof(FP)) {
            len = sizeof(FP);
        }
        memset(the_data+i, 0, sizeof(FP));
        memcpy(the_data+i, raw, len);
    }
    for (i = new_count; i<numkeys; i++) {
        oldhash_entry = hv_iternext(oldhash);
        sv_val = hv_iterval(oldhash, oldhash_entry);
        raw = SvPV(sv_val, len);
        if (len > sizeof(FP)) {
            len = sizeof(FP);
        }
        memset(the_data+i, 0, sizeof(FP));
        memcpy(the_data+i, raw, len);
    }

    /* initialise lookup table: lookup[v] is the number of 1 bits in v */
    /* cf. https://graphics.stanford.edu/~seander/bithacks.html */
    /* the recurrence needs a defined base case, so seed entry 0 first */
    lookup[0] = 0;
    for (i=1; i<LOOKUP_SIZE; i++) {
        lookup[i] = lookup[i/2] + (i&1);
    }

    /* look for matches */
    Inline_Stack_Reset;
    for (a=the_data, i=0, m=(limit>0 ? new_count : numkeys-1); i<m; a++, i++) {
        for (b=a+1, j=i+1; j<numkeys; b++, j++) {
            for (bits=0, k=0; k<FP_CHUNKS; k++) {
                bits += lookup[(*a)[k]^(*b)[k]];
                /* stop counting as soon as the pair cannot match */
                if (bits > threshold) goto abortmatch;
            }
            /* if (bits <= threshold) */ {
                Inline_Stack_Push(sv_2mortal(newSViv(i)));
                Inline_Stack_Push(sv_2mortal(newSViv(j)));
                Inline_Stack_Push(sv_2mortal(newSViv(bits)));
            }
            abortmatch:;
        }
    }
    Inline_Stack_Done;

    /* clean up */
    free(the_data);
}
|
|
|