diff --git a/similarity_engines/sequence.py b/similarity_engines/sequence.py new file mode 100644 index 0000000..c821cae --- /dev/null +++ b/similarity_engines/sequence.py @@ -0,0 +1,18 @@ +from difflib import SequenceMatcher + + +def file_similarity(file1_path, file2_path): + with open(file1_path, "r", encoding="utf-8") as file1: + with open(file2_path, "r", encoding="utf-8") as file2: + file1_content = file1.read() + file2_content = file2.read() + + similarity = SequenceMatcher(None, file1_content, file2_content).ratio() + return similarity * 100 + + +file1_path = "golden.md" +file2_path = "file1.md" + +similarity_percentage = file_similarity(file1_path, file2_path) +print(f"The similarity between the two files is: {similarity_percentage:.2f}%") diff --git a/similarity_engines/similarity.py b/similarity_engines/similarity.py new file mode 100644 index 0000000..cab728a --- /dev/null +++ b/similarity_engines/similarity.py @@ -0,0 +1,59 @@ +from difflib import ndiff + + +def compare_files_take_5(gold_file_path, predicted_file_path, threshold=0.8): + try: + with open(gold_file_path, "r") as gold_file, open( + predicted_file_path, "r" + ) as predicted_file: + gold_content = gold_file.read() + predicted_content = predicted_file.read() + + differences = list(ndiff(gold_content, predicted_content)) + + added_text = "".join([diff[2:] + for diff in differences if diff.startswith("+")]) + deleted_text = "".join( + [diff[2:] for diff in differences if diff.startswith("-")] + ) + + added_length = len(added_text) + deleted_length = len(deleted_text) + + total_length = max(len(gold_content), len(predicted_content)) + + similarity_ratio = 1 - (added_length + deleted_length) / total_length + + is_similar = similarity_ratio >= threshold + + return ( + is_similar, + similarity_ratio, + added_length, + deleted_length, + added_text, + deleted_text, + ) + + except Exception as e: + print(f"Error: {e}") + return False, 0, 0, 0, "", "" + + +gold_standard_file = "golden.md" +predicted_file = "file1" + +is_similar, similarity_ratio, added_length, deleted_length, added_text, deleted_text = ( + compare_files_take_5(gold_standard_file, predicted_file) +) + +print(f"Similarity Ratio: {similarity_ratio:.2%}") +print(f"Is Similar: {is_similar}") +print(f"Added Length: {added_length} characters") +print(f"Deleted Length: {deleted_length} characters") + +# print("\nAdded Text:") +# print(added_text) + +# print("\nDeleted Text:") +# print(deleted_text)