#! /usr/bin/env perl # fdupesGreatestSavings # # Filter which takes the output of fdupes --size and reports on which duplicated # files clean up will result in the greatest savings. # # Usage: fdupes --size --recurse / | fdupesGreatestSavings 100 # # Copyright 2024 Daily Data, Inc. # # Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following # conditions are met: # # Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. # Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer # in the documentation and/or other materials provided with the distribution. # Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT # NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL # THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE # OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
use strict;
use warnings;

use Data::Dumper;      # only used by the commented-out debugging dump below
use File::Basename;    # dirname(), used by parseDirectory

# parseDirectory( $path )
#
# Takes the path of a file and returns a reference to an array containing the
# directory the file lives in and every ancestor of that directory, shortest
# path first. Examples:
#    /a/b/c.txt  ->  [ '/', '/a', '/a/b' ]
#    a/b/c.txt   ->  [ 'a', 'a/b' ]
#    c.txt       ->  [ '.' ]
# Repeated separators (a//b) are collapsed.
sub parseDirectory {
   my ($path) = @_;
   my $separator = '/';
   my $dir = dirname( $path );

   # remember whether the path is absolute; split() alone would lose that
   my $isAbsolute = index( $dir, $separator ) == 0;
   # drop the empty fields produced by a leading or doubled separator
   my @parts = grep { length } split( /\Q$separator\E/, $dir );

   my @result = $isAbsolute ? ( $separator ) : ();
   # '' (absolute) makes the first component come out as "/name";
   # undef (relative) makes it come out as plain "name". Testing with
   # defined() rather than truth keeps a directory named "0" working.
   my $curpath = $isAbsolute ? '' : undef;
   foreach my $part ( @parts ) {
      $curpath = defined $curpath ? $curpath . $separator . $part : $part;
      push @result, $curpath;
   }
   return \@result;
}

# The only argument is the maximum number of duplicate sets to report.
my $maxCount = shift;
die "Enter maximum entries to show\n" unless $maxCount;
die "Maximum entries to show must be a positive whole number, not '$maxCount'\n"
   unless $maxCount =~ m/\A\d+\z/ && $maxCount > 0;

my @data;          # one hash per duplicate set: size, files, total
my $current;       # the set currently being filled, undef between sets
my $ignored = 0;   # lines which did not belong to any set

while ( my $line = <> ) {
   chomp $line;
   # fdupes prints "1 byte each:" (singular) for one byte files, so the
   # trailing s must be optional
   if ( $line =~ m/^(\d+) bytes? each:$/ ) { # new entry
      $current = { 'size' => $1, 'files' => [] };
      push @data, $current;
   } elsif ( $line =~ m/^\s*$/ ) { # blank line, so this set is complete
      undef $current;
   } elsif ( $current ) { # this should be a file name
      push @{ $current->{'files'} }, $line;
   } else { # a file name with no size header in front of it
      $ignored++;
   }
}

warn "$ignored line(s) ignored because they were not part of a duplicate set; was fdupes run with --size?\n"
   if $ignored;

# Work out the savings once everything is read, so the last set is handled
# even if the input does not end with a blank line. One copy of each file
# always has to stay, so only the extra copies count as wasted space.
foreach my $set ( @data ) {
   my $numCopies = scalar( @{ $set->{'files'} } );
   $set->{'total'} = $numCopies > 1 ? $set->{'size'} * ( $numCopies - 1 ) : 0;
}

#print Dumper( \@data ) . "\n"; die;

# Largest savings first; ties keep their input order so output is repeatable.
my @bySavings = sort { $data[$b]{'total'} <=> $data[$a]{'total'} || $a <=> $b } 0 .. $#data;
splice( @bySavings, $maxCount ) if @bySavings > $maxCount;

foreach my $thisEntry ( @bySavings ) {
   my $set = $data[$thisEntry];
   my $numCopies = scalar( @{ $set->{'files'} } );
   print "$set->{total} bytes wasted in following " . $numCopies . " files of $set->{size} bytes\n";
   print "\t" . join( "\n\t", @{ $set->{'files'} } ) . "\n\n";
}

# The per directory report was never reached in the original script (it sat
# behind an unconditional exit), so it stays switched off here. Set this to 1
# to list, for each directory, the combined size of the duplicated files found
# in or below it.
my $reportDirectories = 0;

if ( $reportDirectories ) {
   my %directories;
   foreach my $set ( @data ) {
      foreach my $file ( @{ $set->{'files'} } ) {
         # every ancestor directory is charged with the size of the file
         foreach my $thisDir ( @{ parseDirectory( $file ) } ) {
            $directories{$thisDir}{'size'} += $set->{'size'};
            push @{ $directories{$thisDir}{'files'} }, $file;
         }
      }
   }

   my @bySize = sort { $directories{$b}{'size'} <=> $directories{$a}{'size'} || $a cmp $b } keys %directories;
   splice( @bySize, $maxCount ) if @bySize > $maxCount;

   foreach my $thisDir ( @bySize ) {
      print "$directories{$thisDir}{size}\t$thisDir\n\t";
      print join( "\n\t", @{ $directories{$thisDir}{'files'} } ) . "\n\n";
   }
}

exit 0;