"""File-similarity helpers built on :mod:`difflib`.

Provenance: reconstructed from a two-commit patch series by Akhilesh Negi
(a7ea67a "syncing code to github" and beb82c3 "formatting", 9 Dec 2024) that
added ``similarity_engines/sequence.py`` and
``similarity_engines/similarity.py``.  The patch text had lost its line
breaks, so the code below is the post-patch content of both files, merged
into one properly formatted module.

Two independent metrics are provided:

* ``file_similarity`` -- ``SequenceMatcher.ratio()`` expressed as a percentage.
* ``compare_files_take_5`` -- a character-level ``ndiff`` that counts added
  and deleted characters and compares the resulting ratio to a threshold.
"""

from difflib import SequenceMatcher, ndiff


def _read_text(path):
    """Return the full content of *path*, decoded as UTF-8."""
    with open(path, "r", encoding="utf-8") as handle:
        return handle.read()


# --- originally similarity_engines/sequence.py ------------------------------


def file_similarity(file1_path: str, file2_path: str) -> float:
    """Return the similarity of two text files as a percentage (0-100).

    Both files are read completely as UTF-8 and compared character by
    character with ``difflib.SequenceMatcher`` (no junk heuristic function).

    Args:
        file1_path: Path of the first file.
        file2_path: Path of the second file.

    Returns:
        ``SequenceMatcher.ratio()`` multiplied by 100.  Two empty files
        yield 100.0.

    Raises:
        OSError: If either file cannot be opened or read.
        UnicodeDecodeError: If either file is not valid UTF-8.
    """
    file1_content = _read_text(file1_path)
    file2_content = _read_text(file2_path)
    return SequenceMatcher(None, file1_content, file2_content).ratio() * 100


# --- originally similarity_engines/similarity.py ----------------------------


def compare_files_take_5(
    gold_file_path: str, predicted_file_path: str, threshold: float = 0.8
) -> tuple:
    """Compare a predicted file against a gold file using a character diff.

    The two contents are diffed character by character with
    ``difflib.ndiff``.  Characters tagged ``+`` are collected as "added",
    characters tagged ``-`` as "deleted".  The ratio is::

        1 - (added + deleted) / max(len(gold), len(predicted))

    NOTE: because added and deleted characters are both counted against the
    length of only the longer text, the ratio drops below zero when the texts
    are mostly different (two unrelated texts of equal length give -1.0).

    Args:
        gold_file_path: Path of the reference file.
        predicted_file_path: Path of the file being evaluated.
        threshold: Minimum ratio for the pair to count as similar.

    Returns:
        A 6-tuple ``(is_similar, similarity_ratio, added_length,
        deleted_length, added_text, deleted_text)``.  If either file cannot
        be read, the error is printed and ``(False, 0, 0, 0, "", "")`` is
        returned instead of raising.
    """
    # Only the file reads are best-effort; keeping the try body this small
    # stops unrelated programming errors from being reported as "not similar".
    try:
        gold_content = _read_text(gold_file_path)
        predicted_content = _read_text(predicted_file_path)
    except (OSError, UnicodeError) as e:
        print(f"Error: {e}")
        return False, 0, 0, 0, "", ""

    added_chars = []
    deleted_chars = []
    # Strings are sequences of characters, so every ndiff entry is a two
    # character tag ("+ ", "- ", "  " or "? ") followed by one character.
    for entry in ndiff(gold_content, predicted_content):
        if entry.startswith("+"):
            added_chars.append(entry[2:])
        elif entry.startswith("-"):
            deleted_chars.append(entry[2:])

    added_text = "".join(added_chars)
    deleted_text = "".join(deleted_chars)
    added_length = len(added_text)
    deleted_length = len(deleted_text)

    total_length = max(len(gold_content), len(predicted_content))
    if total_length == 0:
        # Two empty files are identical; avoid dividing by zero.
        similarity_ratio = 1.0
    else:
        similarity_ratio = 1 - (added_length + deleted_length) / total_length

    is_similar = similarity_ratio >= threshold

    return (
        is_similar,
        similarity_ratio,
        added_length,
        deleted_length,
        added_text,
        deleted_text,
    )


def _demo_sequence():
    """Print the SequenceMatcher percentage for the sample files."""
    file1_path = "golden.md"
    file2_path = "file1.md"

    similarity_percentage = file_similarity(file1_path, file2_path)
    print(f"The similarity between the two files is: {similarity_percentage:.2f}%")


def _demo_ndiff():
    """Print the ndiff-based statistics for the sample files."""
    gold_standard_file = "golden.md"
    # NOTE(review): the other demo reads "file1.md"; confirm whether the
    # missing extension here is intentional.
    predicted_file = "file1"

    (
        is_similar,
        similarity_ratio,
        added_length,
        deleted_length,
        _added_text,
        _deleted_text,
    ) = compare_files_take_5(gold_standard_file, predicted_file)

    print(f"Similarity Ratio: {similarity_ratio:.2%}")
    print(f"Is Similar: {is_similar}")
    print(f"Added Length: {added_length} characters")
    print(f"Deleted Length: {deleted_length} characters")


if __name__ == "__main__":
    # Guarded so that importing the helpers does not read files from the
    # current working directory.
    _demo_sequence()
    _demo_ndiff()