This repository was archived by the owner on Jun 17, 2024. It is now read-only.
1 change: 1 addition & 0 deletions .gitattributes
@@ -0,0 +1 @@
* text=auto
9 changes: 5 additions & 4 deletions .gitignore
@@ -1,4 +1,5 @@
.DS_Store
.idea
venv
__pycache__
**/.DS_Store
**/.idea
**/venv
**/__pycache__
**/.vscode
16 changes: 10 additions & 6 deletions README.md
@@ -1,15 +1,19 @@
# PyFile-Scripts

## About
A useful set of tools to work with large number of files, written in Python. Provided under the terms of MIT License (see LICENSE).

A useful set of tools for working with a large number of files, written in Python. Provided under the terms of the MIT License (see [LICENSE](LICENSE)).

## Tools available:
- Duplicate_Searcher (testing) — Advanced multithread file duplates searcher. Documentation is available [here](docs/duplicate_searcher.md).

- Duplicate Finder (β) — An advanced multithreaded file duplicate searcher. [More…](duplicate_finder/README.md)
- Content Searcher (α) — Search for files using their content. [More…](content_searcher/README.md)

## Contributors
Everyone is allowed to open issues, suggest changes, or change code and add pull requests. Any kind of help will be highly appreciated.
#### Developers:
- [Formak21](https://github.com/Formak21) (Original Creator)
- [German Ivanov](https://github.com/germanivanov0719)

Everyone is welcome to open issues, suggest changes, fork this project, modify the code, and submit pull requests. Any kind of help is highly appreciated.

#### Developers:

- [Formak21](https://github.com/Formak21) (Original Author, Duplicate Searcher, Duplicate Searcher Alternative)
- [German Ivanov](https://github.com/germanivanov0719) (Content Searcher, other contributions)
65 changes: 65 additions & 0 deletions content_searcher/README.md
@@ -0,0 +1,65 @@
# Content Searcher

## About

This program helps you quickly search for any string in all files in a given directory. It uses multithreading to speed up the search and supports multiple encodings.

## How to use:

### Input

You can provide the path and the search query, in this order, as terminal arguments, or input them at runtime. Use any directory path supported by your OS.
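
A hypothetical invocation (assuming the script is run from the `content_searcher` directory; the quotes around the query are only needed if it contains spaces):

```
python3 main.py ~/Documents "search query"
```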

### Output

Apart from progress information, the program outputs results in the following format:

```
/full/path/to/file1
/full/path/to/file2
...
/full/path/to/fileN

Total: N
```

### Customizations

You can change some settings in `main.py`. These are the defaults:

```python
# User-defined settings
RECURSION_LIMIT = 1000
LOGGING = False
SILENT = False
ENCODING = "utf-8"
```

`ENCODING` must be one of the encodings supported by Python when opening a file.
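
For example, a hypothetical switch to a Cyrillic code page might look like this (`cp1251` is only an illustration; any codec name accepted by `codecs.lookup` works):

```python
import codecs

codecs.lookup("cp1251")  # raises LookupError if the codec name is unknown to Python
ENCODING = "cp1251"      # passed to open(..., encoding=ENCODING) when reading files
```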

## Other information

### Speed

Not measured yet, but the search should not be limited by anything other than your memory, disk I/O, and CPU speed.

### Stability

A lot of error handling is done, though there are still a few restrictions:

- If you do not have enough memory to load a file, the search cannot be performed and an OS error is reported.
- If a file uses a different encoding, you have to specify it, or the search will not work.
- If different files use different encodings, the search results will likely be incomplete.

Moreover, each file has to be loaded into RAM completely before searching, which might lead to temporary performance degradation.

## TODO

- [x] Multithreading
- [x] Exception Handling
- [x] Nesting level limitation
- [ ] Logging
- [x] Silent mode
- [x] Launch with terminal parameters
- [ ] Docs
- [ ] Regex
103 changes: 103 additions & 0 deletions content_searcher/main.py
@@ -0,0 +1,103 @@
import os
import sys
import threading

# User-defined settings
RECURSION_LIMIT = 1000
LOGGING = False
SILENT = False
ENCODING = "utf-8"

# OS-provided settings
SEPARATOR = os.path.sep


def log(message, end="\n"):
if LOGGING:
pass
if not SILENT:
print(message, end=end, flush=True)


def get_files(d: str) -> list[str]:
files = []
try:
for p in os.listdir(d):
if os.path.isfile(d + SEPARATOR + p):
files.append(d + SEPARATOR + p)
elif os.path.isdir(d + SEPARATOR + p):
for file in get_files(d + SEPARATOR + p):
files.append(file)
except Exception as e:
if isinstance(e, OSError) and str(e)[7:9] == "12":
print(f"Not enough memory for {d}")
else:
print(f"Unknown exception while checking directory {d}: {str(e)}")
return files


class QuickFinder:
def __init__(self, path: str, query: str):
self.result = False
self.path = path
self.query = query

def check_query(self) -> bool:
try:
with open(self.path, "rt", encoding=ENCODING) as file:
if self.query in file.read():
self.result = True
return True
except UnicodeDecodeError:
pass
except Exception as e:
print(
f"Unknown exception while reading file {self.path}: {str(e)} {str(type(e))}"
)
return False


def check_files(files: list[str], query: str) -> list[str]:
threads = []
result = []
log("- Creating threads...", end="\t")
for file in files:
qf = QuickFinder(file, query)
t = threading.Thread(target=qf.check_query, daemon=True)
t.start()
threads.append((qf, t))
log("Done.")
log("- Waiting for threads to finish...", end="\t")
for thread in threads:
thread[1].join()
if thread[0].result:
result.append(thread[0].path)
log("Done.")
return result


def search(path: str, query: str) -> list[str]:
log(f'Getting all files recursively from "{path}"...')
files = get_files(path)
log(f"Done. Found {len(files)} files...")
log(f'Looking for "{query}":')
results = check_files(files, query)
log(f"Done. Found {len(results)} results.", end="\n\n")
return results


if __name__ == "__main__":
sys.setrecursionlimit(RECURSION_LIMIT)
if len(sys.argv) > 2:
path = sys.argv[1]
query = sys.argv[2]
else:
path = input("Path: ")
query = input("Query: ")

# Issue #4 workaround(https://github.com/Formak21/PyFile-Scripts/issues/4)
if "~" in path:
path = os.path.expanduser(path)

r = search(path, query)
print(*r, f"\nTotal: {len(r)}", sep="\n")
48 changes: 40 additions & 8 deletions docs/duplicate_searcher.md → duplicate_finder/README.md
@@ -1,24 +1,56 @@
# Duplicate Searcher

## About
This program allows you to quickly find duplicate files in specified directory recursively. Currently, there are 2 versions available: normal (with recursive function, limited to max of 999 nested directories), and alternative (might be more stable in some edge cases, though it is slower now).

This program allows you to quickly find duplicate files in a specified directory recursively. Currently, there are two versions available: normal (uses a recursive function, limited to a maximum of 999 nested directories by default) and alternative (might be more stable in some edge cases, though it is currently slower).

## How to use:

### Input
Use any path to the directory, as your OS supports it. Unfortunately, files relative to the user directory (~) in *nix systems are not supported as of now, but you still can specify them relative to your root directory (/) or current folder (.).

Use any directory path supported by your OS. Paths relative to the user directory (~) on \*nix systems are now also supported (only in the normal version).
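
A hypothetical invocation (assuming the script is run from the `duplicate_finder` directory; the path can also be entered interactively if no argument is given):

```
python3 main.py ~/Pictures
```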

### Output

For each group of duplicates, the program outputs them as follows:

```
####################################################################################################
/path/to/duplicate1
...
/path/to/duplicateN
Total: N duplicates
```

### Customizations

You can change the hashing algorithm, the chunk size (in bytes; it should not exceed the available RAM), and the recursion limit (maximum nested directory depth) in the following lines of `main.py` (not available in the alternative version):

```python
HASH_FUNCTION = sha1
CHUNK_SIZE = 100 * 1024**2
RECURSION_LIMIT = 1000
```

Please note that `HASH_FUNCTION` is called from the code, so when you change it, do not forget to either import it from a library or define it in the code.
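
For example, a hypothetical switch to BLAKE2 (available in `hashlib` since Python 3.6) together with a smaller chunk size might look like this; any callable returning a hash object with `update()` and `hexdigest()` should work:

```python
from hashlib import blake2b

HASH_FUNCTION = blake2b       # hashlib constructor used for every file
CHUNK_SIZE = 10 * 1024**2     # 10 MiB chunks, e.g. for machines with little free RAM
RECURSION_LIMIT = 1000        # maximum nested directory depth
```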

## Other information
### Speed
No trusted measures yet, but the normal version uses threads to utilize all of your CPU, and reads files in chunks to preserve memory. Please note, that number of threads is limited to a number of files, with no more than 1 thread per file available for stability reasons. However, the program is capable of creating threads for each file, which will be executed in the order your OS believes works best for your computer. We believe reading in chunks provides best average-case time when they are about 100MiB in size, however, if you do not have that much RAM or know the exact number that works best for you, feel free to change their size in the 18 line of _main.py_ (size in bytes, text after "#" is ignored, math operations supported, "**" means raising to some power):
```python3
CHUNK_SIZE = 100 * 1024**2 # 100MiB
```

### Speed

No trusted measurements yet, but the normal version uses threads to utilize all of your CPU cores and reads files in chunks to preserve memory. Please note that the number of threads is limited by the number of files, with no more than one thread per file for stability reasons. The program creates a thread for each file, and your OS schedules them in whatever order it considers best for your machine. We believe reading in chunks of about 100 MiB gives the best average-case time; however, this value can be changed if necessary.

### Stability
A lot of exception-catching is done inside, though beware of files without reading permission: those might and will be marked as duplicates if there are more than 2 of them. Hidden files work in any OS regardless of what name they have. If you have strongly limited RAM, see previous paragraph with information on how to change chunk size and decrease/increase memory usage.

A lot of exception handling is done inside, though beware of files without read permission: if there are two or more of them, they will be reported as duplicates of each other. Hidden files work on any OS regardless of their name. If your RAM is strongly limited, you can set the chunk size to a smaller value.

## TODO

- [x] Multithreading
- [x] Exception Handling
- [x] Nesting level limitation
- [ ] Logging
- [ ] Silent mode
- [x] Additional code comments
- [x] Launch with terminal parameter
- [x] Docs
55 changes: 36 additions & 19 deletions src/duplicate_searcher/main.py → duplicate_finder/main.py
100644 → 100755
@@ -1,14 +1,23 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
This script finds duplicate files by comparing their hashes.
For more info, see README.md.

# This script is searches file duplicates.
from hashlib import sha512
from os import listdir
from os.path import isfile, isdir
License: MIT
"""

import os
import sys
import threading
from hashlib import sha1
from os import listdir
from os.path import isdir, isfile

CHUNK_SIZE = 100 * 1024 ** 2 # 100MiB
HASH_FUNCTION = sha1 # function to use for hashing files
RECURSION_LIMIT = 1000 # Max recursion level
CHUNK_SIZE = 100 * 1024**2 # 100MiB


class EncoderThread:
@@ -17,11 +26,12 @@ def __init__(self):
# hash:[filepath1, filepath2, ...]
self.thread_processed_files = dict()

# encodes files with sha512 to check for uniqueness
def sha_encoder(self, file_path: str) -> str:
@staticmethod
def sha_encoder(filepath: str) -> str:
"""Function to encode files with HASH_FUNCTION."""
try:
encoder = sha512()
with open(file=file_path, mode="rb") as file:
encoder = HASH_FUNCTION()
with open(file=filepath, mode="rb") as file:
chunk = file.read(CHUNK_SIZE)
while chunk != b"":
encoder.update(chunk)
@@ -31,8 +41,8 @@ def sha_encoder(self, file_path: str) -> str:
print(f"Unknown exception: {ex}")
return "-1"

# function that calculates and saves hash values for list of files
def executor(self, files_path: str, unprocessed_files: list[str]) -> None:
"""Function to calculate hashes and save them in dictionary."""
for file in unprocessed_files:
file = f"{files_path}/{file}"
t_hash_key = self.sha_encoder(file)
@@ -47,9 +57,7 @@ def executor(self, files_path: str, unprocessed_files: list[str]) -> None:


def duplicate_detector(path: str) -> None:
"""
This function finds all duplicates in specified directory recursively.
"""
"""This function finds all duplicates in specified directory recursively."""
directories = []

for element in listdir(path):
@@ -73,8 +81,8 @@ def duplicate_detector(path: str) -> None:
duplicate_detector(directory)


# function to get dictionaries from all threads
def get_processed_files() -> dict[str, list[str]]:
"""Function to get dictionaries from all threads."""
processed_files = {}
processed_files_keys = set()
for encoder_thread in encoders_list:
@@ -95,7 +103,16 @@ def get_processed_files() -> dict[str, list[str]]:


if __name__ == "__main__":
root_path = input("Enter path to the root directory: ")
sys.setrecursionlimit(RECURSION_LIMIT)
if len(sys.argv) > 1:
root_path = sys.argv[1]
else:
root_path = input("Enter path to the root directory: ")

# Fix issue #4 (https://github.com/Formak21/PyFile-Scripts/issues/4)
if "~" in root_path:
root_path = os.path.expanduser(root_path)

try:
print("Starting threads...")
duplicate_detector(root_path)
@@ -104,12 +121,12 @@ def get_processed_files() -> dict[str, list[str]]:
thread.join()
print("Done. Counting duplicate files...")
processed_files = get_processed_files()
for hash_key in processed_files.keys():
if len(processed_files[hash_key]) > 1:
for hash_key, files in processed_files.items():
if len(files) > 1:
print(
"#" * 100,
*processed_files[hash_key],
f"Total: {len(processed_files[hash_key])} duplicates\n",
*files,
f"Total: {len(files)} duplicates\n",
sep="\n",
)
except RecursionError:
@@ -8,7 +8,7 @@
import threading

# Parameters
CHUNK_SIZE = 100 * 1024 ** 2 # 100MiB
CHUNK_SIZE = 100 * 1024**2 # 100MiB
ROOT_PATH = "C:/Users/Form49d/Desktop"
EXPORT_FILENAME = "Duplicates.txt"
LOG_FILENAME = "Errors.log"