Increased performance by using parallelStream instead of manual threading, removed -t/--threads option, updated README, bumped version to 3.0

This commit is contained in:
Julian Müller (ChaoticByte) 2023-05-07 18:22:40 +02:00
parent 2770390df3
commit 37d7b5bd06
5 changed files with 89 additions and 142 deletions

View file

@ -18,9 +18,6 @@ Note that xxHash is not a cryptographic hash function and therefore may produce
usage: xxSherly.jar [options] folder1 folder2 ...
-c,--color enable colored output
-h,--help show this help message
-p,--progress enable progress indicator
-t,--threads <arg> override default thread number (defaults to the
number of cores)
-v,--verbose more verbose output
```
@ -44,15 +41,19 @@ mvn package assembly:single
I let Sherly and xxSherly find duplicates in my Music Library (containing `.wav` files) using the following commands:
```bash
# Sherly v1.1.4
time java -jar Bin/sherly.jar -n -f ~/Music/
time java -jar target/xxSherly-x.y-jar-with-dependencies.jar -n -f ~/Music/
# xxSherly v2.1
time java -jar target/xxSherly-2.1-jar-with-dependencies.jar ~/Music/
# xxSherly v3.0
time java -jar target/xxSherly-3.0-jar-with-dependencies.jar ~/Music/
```
The timings are measured using the Linux tool `time` (`real`).
| | Sherly v1.1.4 | xxSherly v1.0 |
| --------: | ------------: | --------------: |
| 1st run | 4.055s | 2.561s |
| 2nd run | 4.055s | 2.304s |
| 3rd run | 4.066s | 2.549s |
| **avg** | **4.059s** | **2.471s** |
| | Sherly v1.1.4 | xxSherly v2.1 | xxSherly v3.0 |
| --------: | ------------: | ------------: | ------------: |
| 1st run | 4.055s | 2.554s | 2.086s |
| 2nd run | 4.055s | 2.554s | 2.109s |
| 3rd run | 4.066s | 2.556s | 2.092s |
| **avg** | **4.059s** | **2.555s** | **2.096s** |

View file

@ -6,7 +6,7 @@
<groupId>net.chaoticbyte.xxsherly</groupId>
<artifactId>xxSherly</artifactId>
<version>2.1</version>
<version>3.0</version>
<name>xxSherly</name>
<!-- FIXME change it to the project's website -->

View file

@ -7,7 +7,6 @@ import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.cli.CommandLine;
@ -20,29 +19,22 @@ public class App {
public static final String usageHelp = "xxSherly.jar [options] folder1 folder2 ...";
public static int completedThreads = 0;
public static int progress = 0;
public static HashMap<String, List<File>> fileMap = new HashMap<>();
public static boolean doTheColorThingy = false;
public static boolean verbose = false;
public static void main(String[] args) throws InterruptedException {
// Arguments
List<File> folderList = new ArrayList<>();
boolean showProgress = false;
boolean verbose = false;
boolean displayHelp = false;
int requestedThreads = 0;
// CLI
List<File> folderList = new ArrayList<>();
boolean displayHelp = false;
HelpFormatter helpFormatter = new HelpFormatter();
Options commandlineOptions = new Options();
commandlineOptions.addOption("c", "color", false, "enable colored output");
commandlineOptions.addOption("t", "threads", true, "override default thread number (defaults to the number of cores)");
commandlineOptions.addOption("p", "progress", false, "enable progress indicator");
commandlineOptions.addOption("v", "verbose", false, "more verbose output");
commandlineOptions.addOption("h", "help", false, "show this help message");
@ -56,10 +48,8 @@ public class App {
}
// Get arguments & options
doTheColorThingy = arguments.hasOption("c");
showProgress = arguments.hasOption("p");
verbose = arguments.hasOption("v");
displayHelp = arguments.hasOption("h");
requestedThreads = Integer.parseInt(arguments.getOptionValue("t", "0"));
}
catch (ParseException | NumberFormatException e) {
helpFormatter.printHelp(usageHelp, commandlineOptions);
@ -83,16 +73,8 @@ public class App {
System.out.println("Arguments:");;
System.out.println(" Folders: " + folderList.size());
System.out.println(" Color: " + doTheColorThingy);
System.out.println(" Progress: " + showProgress);
}
// Calculations for multithreading
// The number of Cores or better said Threads that can be used
int availableProcessors = Runtime.getRuntime().availableProcessors();
int nThreads = availableProcessors;
if (requestedThreads > 0) nThreads = requestedThreads;
if (verbose) System.out.println("Threads: " + nThreads);
// Find all files
List<File> files = new ArrayList<>();
for (File folder : folderList) {
@ -114,33 +96,29 @@ public class App {
int nFiles = files.size();
if (verbose) System.out.println("Files: " + nFiles);
// Every Thread that is going to be started gets a range of files
// They are seperated and are called sections
int sections = nFiles / nThreads;
for (int i = 1; i <= nThreads; i++) {
List<File> sectionedList = new ArrayList<>();
// Here the different Threads are being started
// Usually the separation gives the first threads the same number of files to be working on and the last one is given all the files that could not be separetated
if (i == nThreads) for (int x = (sections * i) - (sections); x < nFiles; x++) {
sectionedList.add(files.get(x));
} else for (int x = (sections * i) - (sections); x < (sections * i); x++) {
sectionedList.add(files.get(x));
}
// Start Multithreading
// sectionedList gives the thread their Assigned Part of Files
ThreadedCompare threadedCompare = new ThreadedCompare(sectionedList);
threadedCompare.start();
}
// Calculate Hashes
// This updates if necessary the Progress bar and checks for Finished threads
while (completedThreads < nThreads) {
TimeUnit.MILLISECONDS.sleep(250);
if (showProgress && doTheColorThingy) {
System.out.print(ConsoleColors.BLUE_BOLD + "Progress: " + ConsoleColors.GREEN_BOLD + progress + " / " + nFiles + " | " + (progress * 100 / nFiles) + "%" + ConsoleColors.RESET + "\r");
} else if (showProgress) {
System.out.print("Progress: " + progress + " / " + nFiles + " | " + (progress * 100 / nFiles) + "%" + "\r");
files.parallelStream().forEach(file -> {
List<File> fileArray = new ArrayList<>();
assert fileArray != null;
fileArray.add(file);
// Generate Checksum
try {
String checksum = FileChecksum.getChecksum(file);
if (App.fileMap.containsKey(checksum)) {
fileArray.addAll(App.fileMap.get(checksum));
App.fileMap.put(checksum, fileArray);
} else {
App.fileMap.put(checksum, fileArray);
}
}
catch (IOException e) {
System.err.println("An exception occured while processing the file " + file.getPath());
System.err.println(e.getMessage());
}
});
ArrayList<String> toRemove = new ArrayList<String>();
for (String checksum: fileMap.keySet()) {
@ -152,7 +130,8 @@ public class App {
// Now everything is finished and the Filemap (hashmap with all Dups) can be printed out in a nice view
if (fileMap.size() > 0) System.out.println();
if (fileMap.size() > 0) {
System.out.println();
for (String checksum: fileMap.keySet()) {
if (doTheColorThingy) {
System.out.println(
@ -162,22 +141,25 @@ public class App {
+ ConsoleColors.RESET);
} else System.out.println(checksum +"\t--> " + fileMap.get(checksum));
}
if (fileMap.size() > 0) System.out.println();
System.out.println();
}
List<File> toBeDeleted = new ArrayList<>();
// Count redundant files and bytes
int toBeDeleted = 0;
long bytes = 0;
for (String checksum: fileMap.keySet()) {
App.fileMap.get(checksum).remove(0);
for (File file: App.fileMap.get(checksum)) {
if (file != null) bytes += file.length();
}
toBeDeleted.addAll(App.fileMap.get(checksum));
toBeDeleted++;
}
if (doTheColorThingy) {
String color = ConsoleColors.RED_BOLD;
if (fileMap.size() < 1) color = ConsoleColors.GREEN_BOLD;
System.out.println(color + (bytes / 1000000.0) + " unnecessary MB in " + toBeDeleted.size() + " file(s) found." + ConsoleColors.RESET);
} else System.out.println((bytes / 1000000.0) + " unnecessary MB in " + toBeDeleted.size() + " file(s) found.");
System.out.println(color + (bytes / 1000000.0) + " redundant MB in " + toBeDeleted + " file(s) found." + ConsoleColors.RESET);
} else System.out.println((bytes / 1000000.0) + " redundant MB in " + toBeDeleted + " file(s) found.");
}
}

View file

@ -0,0 +1,31 @@
package net.chaoticbyte.xxsherly;
import java.io.*;
import java.util.zip.Checksum;
import org.apache.commons.codec.digest.XXHash32;
public class FileChecksum {
//this is used to get the MD5 String of one of the files (one of them is just fine since they both have the same value)
public static String getChecksum (File file) throws IOException {
String digest = "";
// Calculate xxHash32 and add it's hexadecimal presentation to the digest
Checksum xxHash = new XXHash32();
FileInputStream inputStream = new FileInputStream(file);
byte[] dataBytes = new byte[1024];
int unread = 0;
while ((unread = inputStream.read(dataBytes)) != -1) {
xxHash.update(dataBytes, 0, unread);
}
inputStream.close();
digest += Long.toHexString(xxHash.getValue());
// Add File length to the digest
digest += Long.toHexString(file.length());
// return result
return digest;
}
}

View file

@ -1,67 +0,0 @@
package net.chaoticbyte.xxsherly;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.Checksum;
import org.apache.commons.codec.digest.XXHash32;
public class ThreadedCompare extends Thread {
private final List<File> filesToCompare;
public ThreadedCompare (List<File> pathsToCompare_) {
this.filesToCompare = pathsToCompare_;
}
@Override
public void run() {
for (File file : filesToCompare) {
List<File> fileArray = new ArrayList<>();
assert fileArray != null;
fileArray.add(file);
// Generate Checksum
try {
String checksum = getChecksum(file);
if (App.fileMap.containsKey(checksum)) {
fileArray.addAll(App.fileMap.get(checksum));
App.fileMap.put(checksum, fileArray);
} else {
App.fileMap.put(checksum, fileArray);
}
}
catch (IOException e) {
System.err.println("An exception occured while processing the file " + file.getPath());
System.err.println(e.getMessage());
}
App.progress++;
}
App.completedThreads++;
}
//this is used to get the MD5 String of one of the files (one of them is just fine since they both have the same value)
private String getChecksum (File file) throws IOException {
String digest = "";
// Calculate xxHash32 and add it's hexadecimal presentation to the digest
Checksum xxHash = new XXHash32();
FileInputStream inputStream = new FileInputStream(file);
byte[] dataBytes = new byte[1024];
int unread = 0;
while ((unread = inputStream.read(dataBytes)) != -1) {
xxHash.update(dataBytes, 0, unread);
}
inputStream.close();
digest += Long.toHexString(xxHash.getValue());
// Add File length to the digest
digest += Long.toHexString(file.length());
// return result
return digest;
}
}