108 lines
		
	
	
		
			3.1 KiB
		
	
	
	
		
			Perl
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			108 lines
		
	
	
		
			3.1 KiB
		
	
	
	
		
			Perl
		
	
	
		
			Executable File
		
	
	
	
	
| #!/usr/bin/perl
 | |
| 
 | |
| =head1 NAME
 | |
| 
 | |
| pseudonymizr.pl - Unambiguous substitution of marked strings by randomized ones
 | |
| 
 | |
| =head1 USE CASE
 | |
| 
 | |
| Societies that consider to use treasuredb for their finances are
 | |
| strongly advised to test it in advance. The board members may do that
 | |
| themselves, but they might want to delegate that to competent regular
 | |
| members. However, in the latter case, privacy laws are to be observed that
 | |
| can prohibit disclosure of real data for this purpose.
 | |
| 
 | |
| To conduct tests with real data without infringement of privacy laws,
 | |
| this script has been developped. It replaces all names and other sensible
 | |
| information that is specially marked, by a random string. It is guaranteed
 | |
| that repeatedly occurring strings are replaced by the same pseudonym,
 | |
| of course.
 | |
| 
 | |
| =head1 HOW IT WORKS
 | |
| 
 | |
| This script pseudonymizes lines from standard input and writes them to
 | |
| standard output. All strings that need to be replaced must be wrapped into
 | |
| a special marker, namely X{...}, where X can be any upper- or lowercase
 | |
| letter of the alphabet, denoting a certain category of information, e.g.
 | |
| "M" for member names.
 | |
| 
 | |
| Caution with natural member names! Slightly differing content of M{...}
 | |
| clauses lead to completely different pseudonyms. In order not to render
 | |
| the tests irreversible and in disaccord with actual states, because the
 | |
| association of members and accounts is wrong, you should prefer using it
 | |
| for the unique and standard member ID, say the number in the member table.
 | |
| Where proper names in financial transfer information, use a different letter. 
 | |
| 
 | |
| =head1 COMMAND
 | |
| 
 | |
|  pseudonymizr.pl [-r] secret_registry.txt < in_file.csv > out_file.csv
 | |
| 
 | |
| =head1 FLAGS
 | |
| 
 | |
| =over 4
 | |
| 
 | |
| =item -r
 | |
| 
 | |
| reverse mode, i.e. de-pseudonymize
 | |
| 
 | |
| =back
 | |
| 
 | |
| =cut
 | |
| 
 | |
| use strict;
 | |
| use utf8;
 | |
| 
 | |
| my $LENGTH = 5;
 | |
| my @CHARS = ( 'A' .. 'Z', 'a' .. 'z', 0 .. 9 );
 | |
| 
 | |
| my (%subst, %known, $registry_fh);
 | |
| my $reverse_mode = $ARGV[0] eq q{-r} && shift;
 | |
| my $registry_file = shift;
 | |
| open $registry_fh, '<:utf8', $registry_file
 | |
|     or die "Cannot read $registry_file: $!"
 | |
|     if -e $registry_file;
 | |
| 
 | |
| if ( $registry_fh ) {
 | |
|     
 | |
|     my ($h1, $h2) = $reverse_mode ? (\%subst => \%known) : (\%known => \%subst);
 | |
|     while ( $_ = <$registry_fh> ) {
 | |
|         chomp;
 | |
|         my ($random, $orig) = split /\t/;
 | |
|         $h1->{ $random } = $orig;
 | |
|     }
 | |
|     %$h2 = reverse %$h1;
 | |
| 
 | |
| }
 | |
| 
 | |
| binmode STDIN, ':utf8';
 | |
| binmode STDOUT, ':utf8';
 | |
| while ( $_ = <STDIN> ) {
 | |
|     s/ (?<=\w\{) (.+?) (?=\}) / pseudonymize($1) /gexms;
 | |
|     print;
 | |
| }
 | |
| 
 | |
| unless ( $reverse_mode ) {
 | |
|    # Rewrite the registry we could have expanded with new names (sorted)
 | |
| 
 | |
|    open $registry_fh, '|-', qq{sort > $registry_file}
 | |
|        or die "Cannot pipe to sort: $!";
 | |
| 
 | |
|    while ( my ($random, $orig) = each %known ) {
 | |
|        printf $registry_fh "%s\t%s\n", $random, $orig;
 | |
|    }
 | |
| 
 | |
| }
 | |
| 
 | |
| sub pseudonymize { my $orig = shift; $subst{ $orig } //= do {{
 | |
| 
 | |
|     # Make random string with $LENGTH characters from @CHARS
 | |
|     my $random = join q{}, map { $CHARS[ rand @CHARS ] } 1 .. $LENGTH;
 | |
| 
 | |
|     # If known, try anew (p = 1 : @CHARS ^ $LENGTH, i.e. p > 0)
 | |
|     $_ = defined($_) ? redo : $orig for $known{ $random };
 | |
| 
 | |
|     # return to %subst cache
 | |
|     $random;
 | |
| 
 | |
| }}; }       
 | 
