Compare commits

...

4 commits
v2.0 ... main

6 changed files with 101 additions and 199 deletions

View file

@ -1,6 +1,7 @@
# xxSherly
A fork of [Sherly](https://github.com/BlyDoesCoding/Sherly), using [xxHash](https://github.com/Cyan4973/xxHash).
A fork of [Sherly](https://github.com/BlyDoesCoding/Sherly), using [xxHash](https://github.com/Cyan4973/xxHash).
This fork is faster, but has fewer features and may produce false positives.
![](./images/screenshot.png)
@ -8,21 +9,16 @@ A fork of [Sherly](https://github.com/BlyDoesCoding/Sherly), using [xxHash](http
Sherly is a multithreaded duplicate file finder for your terminal, written in Java. You can easily find duplicate images, videos, and any other type of data. That can be helpful if you are running low on storage or just want to do regular housekeeping.
This fork uses [xxHash](https://github.com/Cyan4973/xxHash) instead of MD5 for performance reasons (see [Speed comparison](#speed-comparison)).
Note that xxHash is not a cryptographic hash function and therefore may produce collisions. That's why the checksum is composed of the xxHash Digest and the filesize.
Instead of MD5, this fork uses [xxHash](https://github.com/Cyan4973/xxHash) plus the file size to find duplicates, for performance reasons (see [Speed comparison](#speed-comparison)).
Note that xxHash is not a cryptographic hash function and therefore may produce collisions (false positives). For this reason, since version 2.1, the program no longer offers the option to delete duplicates. You should delete them yourself.
## Usage
```
usage: xxSherly.jar [options] folder1 folder2 ...
-c,--color enable colored output
-d,--delete delete all dups except one, without asking first
-h,--help show this help message
-n,--noinput skip all user input
-p,--progress enable progress indicator
-t,--threads <arg> override default thread number (defaults to the
number of cores)
-v,--verbose more verbose output
-c,--color enable colored output
-h,--help show this help message
-v,--verbose more verbose output
```
## Build
@ -42,18 +38,22 @@ mvn package assembly:single
## Speed comparison
I let Sherly v1.1.4 and xxSherly v1.0 find duplicates in my Music Library (containing `.wav` files) using the following commands:
I let Sherly and xxSherly find duplicates in my Music Library (containing `.wav` files) using the following commands:
```bash
# Sherly v1.1.4
time java -jar Bin/sherly.jar -n -f ~/Music/
time java -jar target/xxSherly-1.0-jar-with-dependencies.jar -n -f ~/Music/
# xxSherly v2.1
time java -jar target/xxSherly-2.1-jar-with-dependencies.jar ~/Music/
# xxSherly v3.0
time java -jar target/xxSherly-3.0-jar-with-dependencies.jar ~/Music/
```
The timings are measured using the Linux tool `time` (`real`).
| | Sherly | xxSherly |
| --------: | ------------: | --------------: |
| 1st run | 4.055s | 2.561s |
| 2nd run | 4.055s | 2.304s |
| 3rd run | 4.066s | 2.549s |
| **avg** | **4.059s** | **2.471s** |
| | Sherly v1.1.4 | xxSherly v2.1 | xxSherly v3.0 |
| --------: | ------------: | ------------: | ------------: |
| 1st run | 4.055s | 2.554s | 2.086s |
| 2nd run | 4.055s | 2.554s | 2.109s |
| 3rd run | 4.066s | 2.556s | 2.092s |
| **avg** | **4.059s** | **2.555s** | **2.096s** |

Binary file not shown.

Before

Width:  |  Height:  |  Size: 54 KiB

After

Width:  |  Height:  |  Size: 101 KiB

Before After
Before After

View file

@ -6,7 +6,7 @@
<groupId>net.chaoticbyte.xxsherly</groupId>
<artifactId>xxSherly</artifactId>
<version>2.0</version>
<version>3.0</version>
<name>xxSherly</name>
<!-- FIXME change it to the project's website -->

View file

@ -5,10 +5,8 @@ import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Scanner;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.ConcurrentHashMap;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.cli.CommandLine;
@ -21,33 +19,20 @@ public class App {
public static final String usageHelp = "xxSherly.jar [options] folder1 folder2 ...";
public static int completedThreads = 0;
public static int progress = 0;
public static HashMap<String, List<File>> fileMap = new HashMap<>();
public static boolean doTheColorThingy = false;
public static boolean verbose = false;
public static void main(String[] args) throws InterruptedException {
// Arguments
List<File> folderList = new ArrayList<>();
boolean showProgress = false;
boolean deleteDups = false;
boolean verbose = false;
boolean noInput = false;
boolean displayHelp = false;
int requestedThreads = 0;
// CLI
List<File> folderList = new ArrayList<>();
boolean displayHelp = false;
HelpFormatter helpFormatter = new HelpFormatter();
Options commandlineOptions = new Options();
commandlineOptions.addOption("c", "color", false, "enable colored output");
commandlineOptions.addOption("t", "threads", true, "override default thread number (defaults to the number of cores)");
commandlineOptions.addOption("p", "progress", false, "enable progress indicator");
commandlineOptions.addOption("d", "delete", false, "delete all dups except one, without asking first");
commandlineOptions.addOption("n", "noinput", false, "skip all user input");
commandlineOptions.addOption("v", "verbose", false, "more verbose output");
commandlineOptions.addOption("h", "help", false, "show this help message");
@ -61,12 +46,8 @@ public class App {
}
// Get arguments & options
doTheColorThingy = arguments.hasOption("c");
showProgress = arguments.hasOption("p");
deleteDups = arguments.hasOption("d");
verbose = arguments.hasOption("v");
noInput = arguments.hasOption("n");
displayHelp = arguments.hasOption("h");
requestedThreads = Integer.parseInt(arguments.getOptionValue("t", "0"));
}
catch (ParseException | NumberFormatException e) {
helpFormatter.printHelp(usageHelp, commandlineOptions);
@ -90,17 +71,8 @@ public class App {
System.out.println("Arguments:");;
System.out.println(" Folders: " + folderList.size());
System.out.println(" Color: " + doTheColorThingy);
System.out.println(" Delete: " + deleteDups);
System.out.println(" Progress: " + showProgress);
}
// Calculations for multithreading
// The number of Cores or better said Threads that can be used
int availableProcessors = Runtime.getRuntime().availableProcessors();
int nThreads = availableProcessors;
if (requestedThreads > 0) nThreads = requestedThreads;
if (verbose) System.out.println("Threads: " + nThreads);
// Find all files
List<File> files = new ArrayList<>();
for (File folder : folderList) {
@ -122,37 +94,35 @@ public class App {
int nFiles = files.size();
if (verbose) System.out.println("Files: " + nFiles);
// Every Thread that is going to be started gets a range of files
// The ranges are separated and are called sections
int sections = nFiles / nThreads;
for (int i = 1; i <= nThreads; i++) {
List<File> sectionedList = new ArrayList<>();
// Here the different Threads are being started
// Usually the separation gives the first threads the same number of files to work on, and the last one is given all the remaining files that could not be evenly distributed
if (i == nThreads) for (int x = (sections * i) - (sections); x < nFiles; x++) {
sectionedList.add(files.get(x));
} else for (int x = (sections * i) - (sections); x < (sections * i); x++) {
sectionedList.add(files.get(x));
}
// Start Multithreading
// sectionedList gives the thread their Assigned Part of Files
ThreadedCompare threadedCompare = new ThreadedCompare(sectionedList);
threadedCompare.start();
}
// Calculate Hashes
// This updates if necessary the Progress bar and checks for Finished threads
while (completedThreads < nThreads) {
TimeUnit.MILLISECONDS.sleep(250);
if (showProgress && doTheColorThingy) {
System.out.print(ConsoleColors.BLUE_BOLD + "Progress: " + ConsoleColors.GREEN_BOLD + progress + " / " + nFiles + " | " + (progress * 100 / nFiles) + "%" + ConsoleColors.RESET + "\r");
} else if (showProgress) {
System.out.print("Progress: " + progress + " / " + nFiles + " | " + (progress * 100 / nFiles) + "%" + "\r");
ConcurrentHashMap<String, List<File>> fileMap = new ConcurrentHashMap<>();
files.parallelStream().forEach(file -> {
List<File> fileArray = new ArrayList<>();
assert fileArray != null;
fileArray.add(file);
// Generate Checksum
try {
String checksum = FileChecksum.getChecksum(file);
if (fileMap.containsKey(checksum)) {
fileArray.addAll(fileMap.get(checksum));
fileMap.put(checksum, fileArray);
} else {
fileMap.put(checksum, fileArray);
}
}
}
catch (IOException e) {
System.err.println("An exception occured while processing the file " + file.getPath());
System.err.println(e.getMessage());
}
});
ArrayList<String> toRemove = new ArrayList<String>();
for (String checksum: fileMap.keySet()) {
if (App.fileMap.get(checksum).size() == 1) {
if (fileMap.get(checksum).size() == 1) {
toRemove.add(checksum);
}
}
@ -160,68 +130,36 @@ public class App {
// Now everything is finished and the Filemap (hashmap with all Dups) can be printed out in a nice view
if (fileMap.size() > 0) System.out.println();
for (String checksum: fileMap.keySet()) {
if (doTheColorThingy) {
System.out.println(
ConsoleColors.BLUE_BOLD + checksum
+ ConsoleColors.CYAN_BOLD + "\t--> "
+ ConsoleColors.GREEN_BOLD + fileMap.get(checksum)
+ ConsoleColors.RESET);
} else System.out.println(checksum +"\t--> " + fileMap.get(checksum));
if (fileMap.size() > 0) {
System.out.println();
for (String checksum: fileMap.keySet()) {
if (doTheColorThingy) {
System.out.println(
ConsoleColors.BLUE_BOLD + checksum
+ ConsoleColors.CYAN_BOLD + "\t--> "
+ ConsoleColors.GREEN_BOLD + fileMap.get(checksum)
+ ConsoleColors.RESET);
} else System.out.println(checksum +"\t--> " + fileMap.get(checksum));
}
System.out.println();
}
if (fileMap.size() > 0) System.out.println();
List<File> toBeDeleted = new ArrayList<>();
// Count redundant files and bytes
int toBeDeleted = 0;
long bytes = 0;
for (String checksum: fileMap.keySet()) {
App.fileMap.get(checksum).remove(0);
for (File file: App.fileMap.get(checksum)) {
fileMap.get(checksum).remove(0);
for (File file: fileMap.get(checksum)) {
if (file != null) bytes += file.length();
}
toBeDeleted.addAll(App.fileMap.get(checksum));
toBeDeleted++;
}
if (doTheColorThingy) {
String color = ConsoleColors.RED_BOLD;
if (fileMap.size() < 1) color = ConsoleColors.GREEN_BOLD;
System.out.println(color + (bytes / 1000000.0) + " unnecessary MB in " + toBeDeleted.size() + " file(s) found." + ConsoleColors.RESET);
} else System.out.println((bytes / 1000000.0) + " unnecessary MB in " + toBeDeleted.size() + " file(s) found.");
// Don't go further if there is nothing to delete
if (fileMap.size() < 1) return;
if (deleteDups) {
System.out.println();
delete(toBeDeleted);
} else if (!noInput) {
// Ask if the user wants to delete the file
Scanner input = new Scanner(System.in);
while (true) {
if (doTheColorThingy) System.out.print(ConsoleColors.RED_BOLD + "Do you want to delete them? [y/n] " + ConsoleColors.RESET);
else System.out.print("Do you want to delete them? [y/n] ");
String answer = input.next();
if (answer.toLowerCase().contains("y")) {
System.out.println();
delete(toBeDeleted);
break;
}
else if (answer.toLowerCase().contains("n")) break;
}
input.close();
}
}
public static void delete(List<File> fileList) {
for (File file : fileList) if (file != null) {
if (file.delete()) {
if (doTheColorThingy) System.out.println(ConsoleColors.RED_BOLD + "Deleted " + file.toPath() + ConsoleColors.RESET);
else System.out.println("Deleted " + file.toPath());
}
else {
if (doTheColorThingy) System.err.println(ConsoleColors.RED_BOLD + "Couldn't delete " + ConsoleColors.RESET + file.toPath());
else System.err.println("Couldn't delete " + file.toPath());
}
}
if (toBeDeleted < 1) color = ConsoleColors.GREEN_BOLD;
System.out.println(color + (bytes / 1000000.0) + " redundant MB in " + toBeDeleted + " file(s) found." + ConsoleColors.RESET);
} else System.out.println((bytes / 1000000.0) + " redundant MB in " + toBeDeleted + " file(s) found.");
}
}

View file

@ -0,0 +1,31 @@
package net.chaoticbyte.xxsherly;
import java.io.*;
import java.util.zip.Checksum;
import org.apache.commons.codec.digest.XXHash32;
/**
 * Computes the key that is used to group files which are presumably identical.
 */
public class FileChecksum {

    /**
     * Builds the duplicate-detection key of a file: the xxHash32 digest of its
     * content followed by its length, both in lowercase hexadecimal.
     *
     * <p>The digest part is zero-padded to 8 hex digits so that the boundary
     * between digest and length is unambiguous. Without padding, different
     * (digest, length) pairs could concatenate to the same string, e.g.
     * {@code 0x1 + 0x23} and {@code 0x12 + 0x3} would both yield {@code "123"}.
     *
     * @param file the file to read and hash
     * @return the hexadecimal digest followed by the hexadecimal file length
     * @throws IOException if the file cannot be opened or read
     */
    public static String getChecksum (File file) throws IOException {
        Checksum xxHash = new XXHash32();

        // try-with-resources closes the stream even when read() throws,
        // so a failing file does not leak a file handle.
        try (InputStream inputStream = new FileInputStream(file)) {
            byte[] dataBytes = new byte[1024];
            int bytesRead;
            while ((bytesRead = inputStream.read(dataBytes)) != -1) {
                xxHash.update(dataBytes, 0, bytesRead);
            }
        }

        // Fixed-width digest (assumes the xxHash32 value fits in 32 bits),
        // followed by the file length.
        String digest = String.format("%08x", xxHash.getValue());
        digest += Long.toHexString(file.length());
        return digest;
    }
}

View file

@ -1,67 +0,0 @@
package net.chaoticbyte.xxsherly;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.Checksum;
import org.apache.commons.codec.digest.XXHash32;
/**
 * Worker thread that hashes the files assigned to it and records them in the
 * shared {@code App.fileMap}, grouped by checksum.
 */
public class ThreadedCompare extends Thread {

    /**
     * Guards every write this class makes to the shared static state in
     * {@code App}, so that concurrently running workers cannot lose updates.
     * NOTE(review): readers in App do not take this lock; confirm they only
     * evaluate the map after all workers have finished.
     */
    private static final Object SHARED_STATE_LOCK = new Object();

    private final List<File> filesToCompare;

    /**
     * @param pathsToCompare_ the files this worker is responsible for
     */
    public ThreadedCompare (List<File> pathsToCompare_) {
        this.filesToCompare = pathsToCompare_;
    }

    /**
     * Hashes every assigned file and registers it under its checksum.
     * Files that cannot be read are reported on stderr and skipped, but still
     * count towards the progress counter.
     */
    @Override
    public void run() {
        for (File file : filesToCompare) {
            try {
                // Hashing is the expensive part and touches no shared state,
                // so it stays outside the lock.
                String checksum = getChecksum(file);

                // The lookup and the put must happen atomically; otherwise two
                // workers that find the same checksum can overwrite each other.
                synchronized (SHARED_STATE_LOCK) {
                    List<File> fileArray = new ArrayList<>();
                    fileArray.add(file);
                    List<File> known = App.fileMap.get(checksum);
                    if (known != null) {
                        fileArray.addAll(known);
                    }
                    App.fileMap.put(checksum, fileArray);
                }
            }
            catch (IOException e) {
                System.err.println("An exception occured while processing the file " + file.getPath());
                System.err.println(e.getMessage());
            }
            // ++ on a shared int is a read-modify-write and not atomic.
            synchronized (SHARED_STATE_LOCK) {
                App.progress++;
            }
        }
        synchronized (SHARED_STATE_LOCK) {
            App.completedThreads++;
        }
    }

    /**
     * Builds the duplicate-detection key of a file: the xxHash32 digest of its
     * content followed by its length, both in lowercase hexadecimal.
     *
     * <p>The digest part is zero-padded to 8 hex digits so that different
     * (digest, length) pairs cannot concatenate to the same string.
     *
     * @param file the file to read and hash
     * @return the hexadecimal digest followed by the hexadecimal file length
     * @throws IOException if the file cannot be opened or read
     */
    private String getChecksum (File file) throws IOException {
        Checksum xxHash = new XXHash32();

        // try-with-resources closes the stream even when read() throws.
        try (InputStream inputStream = new FileInputStream(file)) {
            byte[] dataBytes = new byte[1024];
            int bytesRead;
            while ((bytesRead = inputStream.read(dataBytes)) != -1) {
                xxHash.update(dataBytes, 0, bytesRead);
            }
        }

        // Fixed-width digest (assumes the xxHash32 value fits in 32 bits),
        // followed by the file length.
        String digest = String.format("%08x", xxHash.getValue());
        digest += Long.toHexString(file.length());
        return digest;
    }
}