From 12efaacfae889a03996296d6da5143bef56c66fa Mon Sep 17 00:00:00 2001 From: Hirokazu Kiyomaru Date: Mon, 27 Jan 2025 14:50:50 +0900 Subject: [PATCH 1/9] initial commit --- .../ja/ja_kokkai_giji/.gitignore | 7 ++ .../ja/ja_kokkai_giji/README.md | 15 +++ .../ja/ja_kokkai_giji/format.py | 103 ++++++++++++++++++ 3 files changed, 125 insertions(+) create mode 100644 corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/.gitignore create mode 100644 corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/README.md create mode 100644 corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py diff --git a/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/.gitignore b/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/.gitignore new file mode 100644 index 00000000..4722719a --- /dev/null +++ b/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/.gitignore @@ -0,0 +1,7 @@ +# python generated files +__pycache__/ +*.py[oc] +build/ +dist/ +wheels/ +*.egg-info diff --git a/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/README.md b/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/README.md new file mode 100644 index 00000000..2d1d7110 --- /dev/null +++ b/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/README.md @@ -0,0 +1,15 @@ +# ja-kokkai-giji + +Preprocess text from Kokkai Giji-roku. 
+ +## Environment + +- Python 3.12.5 + +## Usage + +To preprocess text from Kokkai Giji-roku, run the following command: + +```bash +python format.py --input-dir /path/to/input --output-file /path/to/output +``` diff --git a/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py b/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py new file mode 100644 index 00000000..9a527a78 --- /dev/null +++ b/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py @@ -0,0 +1,103 @@ +import argparse +import dataclasses +import json +import logging +import pathlib +from typing import Optional, Any + + +@dataclasses.dataclass +class SpeechRecord: + speechID: str + speechOrder: int + speaker: str + speakerYomi: Optional[str] + speakerGroup: Optional[str] + speakerPosition: Optional[str] + speakerRole: Optional[str] + speech: str + startPage: int + createTime: str + updateTime: str + speechURL: str + + +@dataclasses.dataclass +class MeetingRecord: + issueID: str + imageKind: str + searchObject: int + session: int + nameOfHouse: str + nameOfMeeting: str + issue: str + date: str + closing: Any + speechRecord: list[SpeechRecord] + meetingURL: str + pdfURL: str + + def __post_init__(self): + self.speechRecord = [SpeechRecord(**speech) for speech in self.speechRecord] + + +@dataclasses.dataclass +class KokkaiGiji: + numberOfRecords: int + numberOfReturn: int + startRecord: int + nextRecordPosition: Optional[int] + meetingRecord: list[MeetingRecord] + + def __post_init__(self): + self.meetingRecord = [MeetingRecord(**meeting) for meeting in self.meetingRecord] + + +def main() -> None: + parser = argparse.ArgumentParser("Remove intra-sentence line breaks from text.") + parser.add_argument("--input-dir", type=str, required=True, help="Input directory.") + parser.add_argument("--output-file", type=str, required=True, help="Output file.") + parser.add_argument( + "--overwrite", action="store_true", help="Overwrite output file." 
+ ) + args = parser.parse_args() + + input_dir = pathlib.Path(args.input_dir) + file_paths = sorted(input_dir.glob("**/*.json")) + + instances = [] + for file_path in file_paths: + with file_path.open("rt", encoding="utf-8") as f: + dat = KokkaiGiji(**json.load(f)) + + for meeting in dat.meetingRecord: + instance = { + "text": "", + "meta": { + "issueID": meeting.issueID, + "imageKind": meeting.imageKind, + "searchObject": meeting.searchObject, + "session": meeting.session, + "nameOfHouse": meeting.nameOfHouse, + "nameOfMeeting": meeting.nameOfMeeting, + "issue": meeting.issue, + "date": meeting.date, + "closing": meeting.closing, + "meetingURL": meeting.meetingURL, + "pdfURL": meeting.pdfURL, + }, + } + for speech in meeting.speechRecord: + instance["text"] += speech.speech.replace("\r\n", "\n").strip() + "\n\n" + instances.append(instance) + + with open(args.output_file, "wt", encoding="utf-8") as f: + for instance in instances: + f.write(json.dumps(instance, ensure_ascii=False) + "\n") + + +if __name__ == "__main__": + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + ) + main() From dbe7b964cbb96dae048791ac111f4e81d62d6dd2 Mon Sep 17 00:00:00 2001 From: Hirokazu Kiyomaru Date: Mon, 27 Jan 2025 14:51:51 +0900 Subject: [PATCH 2/9] strip the final text --- corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py | 1 + 1 file changed, 1 insertion(+) diff --git a/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py b/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py index 9a527a78..3e89df07 100644 --- a/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py +++ b/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py @@ -89,6 +89,7 @@ def main() -> None: } for speech in meeting.speechRecord: instance["text"] += speech.speech.replace("\r\n", "\n").strip() + "\n\n" + instance["text"] = instance["text"].strip() instances.append(instance) with open(args.output_file, "wt", encoding="utf-8") as f: From 1669b925b1bd3664e4becada0141248e1aa3389e 
Mon Sep 17 00:00:00 2001 From: Hirokazu Kiyomaru Date: Mon, 27 Jan 2025 15:06:00 +0900 Subject: [PATCH 3/9] tweak --- corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py b/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py index 3e89df07..379e4510 100644 --- a/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py +++ b/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py @@ -53,6 +53,13 @@ def __post_init__(self): self.meetingRecord = [MeetingRecord(**meeting) for meeting in self.meetingRecord] +def meeting_to_text(meeting: MeetingRecord) -> str: + text = "" + for speech in meeting.speechRecord: + text += speech.speech.replace("\r\n", "\n").strip() + "\n\n" + return text.strip() + + def main() -> None: parser = argparse.ArgumentParser("Remove intra-sentence line breaks from text.") parser.add_argument("--input-dir", type=str, required=True, help="Input directory.") @@ -72,7 +79,7 @@ def main() -> None: for meeting in dat.meetingRecord: instance = { - "text": "", + "text": meeting_to_text(meeting), "meta": { "issueID": meeting.issueID, "imageKind": meeting.imageKind, @@ -87,9 +94,6 @@ def main() -> None: "pdfURL": meeting.pdfURL, }, } - for speech in meeting.speechRecord: - instance["text"] += speech.speech.replace("\r\n", "\n").strip() + "\n\n" - instance["text"] = instance["text"].strip() instances.append(instance) with open(args.output_file, "wt", encoding="utf-8") as f: From 93c553994f072675a74bad122b6411dd741aaaa7 Mon Sep 17 00:00:00 2001 From: Hirokazu Kiyomaru Date: Mon, 27 Jan 2025 15:15:18 +0900 Subject: [PATCH 4/9] tweak --- .../ja/ja_kokkai_giji/format.py | 51 +++++++++---------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py b/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py index 379e4510..85360e98 100644 --- 
a/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py +++ b/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py @@ -72,33 +72,32 @@ def main() -> None: input_dir = pathlib.Path(args.input_dir) file_paths = sorted(input_dir.glob("**/*.json")) + output_file = pathlib.Path(args.output_file) + instances = [] - for file_path in file_paths: - with file_path.open("rt", encoding="utf-8") as f: - dat = KokkaiGiji(**json.load(f)) - - for meeting in dat.meetingRecord: - instance = { - "text": meeting_to_text(meeting), - "meta": { - "issueID": meeting.issueID, - "imageKind": meeting.imageKind, - "searchObject": meeting.searchObject, - "session": meeting.session, - "nameOfHouse": meeting.nameOfHouse, - "nameOfMeeting": meeting.nameOfMeeting, - "issue": meeting.issue, - "date": meeting.date, - "closing": meeting.closing, - "meetingURL": meeting.meetingURL, - "pdfURL": meeting.pdfURL, - }, - } - instances.append(instance) - - with open(args.output_file, "wt", encoding="utf-8") as f: - for instance in instances: - f.write(json.dumps(instance, ensure_ascii=False) + "\n") + with output_file.open("wt", encoding="utf-8") as fout: + for file_path in file_paths: + with file_path.open("rt", encoding="utf-8") as fin: + dat = KokkaiGiji(**json.load(fin)) + + for meeting in dat.meetingRecord: + instance = { + "text": meeting_to_text(meeting), + "meta": { + "issueID": meeting.issueID, + "imageKind": meeting.imageKind, + "searchObject": meeting.searchObject, + "session": meeting.session, + "nameOfHouse": meeting.nameOfHouse, + "nameOfMeeting": meeting.nameOfMeeting, + "issue": meeting.issue, + "date": meeting.date, + "closing": meeting.closing, + "meetingURL": meeting.meetingURL, + "pdfURL": meeting.pdfURL, + }, + } + fout.write(json.dumps(instance, ensure_ascii=False) + "\n") if __name__ == "__main__": From e41731be65960b2a5d1e60e8318ce3581f9b8218 Mon Sep 17 00:00:00 2001 From: Hirokazu Kiyomaru Date: Mon, 27 Jan 2025 15:16:48 +0900 Subject: [PATCH 5/9] tweak --- 
corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py | 1 - 1 file changed, 1 deletion(-) diff --git a/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py b/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py index 85360e98..08765357 100644 --- a/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py +++ b/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py @@ -74,7 +74,6 @@ def main() -> None: output_file = pathlib.Path(args.output_file) - instances = [] with output_file.open("wt", encoding="utf-8") as fout: for file_path in file_paths: with file_path.open("rt", encoding="utf-8") as fin: From 5371be34be9761ba166ca5f8954bbbe453cc6b2f Mon Sep 17 00:00:00 2001 From: Hirokazu Kiyomaru Date: Mon, 27 Jan 2025 15:23:40 +0900 Subject: [PATCH 6/9] tweak --- corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py b/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py index 08765357..7d42f1f6 100644 --- a/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py +++ b/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py @@ -75,7 +75,8 @@ def main() -> None: output_file = pathlib.Path(args.output_file) with output_file.open("wt", encoding="utf-8") as fout: - for file_path in file_paths: + for i, file_path in enumerate(file_paths): + logging.info(f"Processing {file_path} ({i + 1}/{len(file_paths)})") with file_path.open("rt", encoding="utf-8") as fin: dat = KokkaiGiji(**json.load(fin)) @@ -97,10 +98,12 @@ def main() -> None: }, } fout.write(json.dumps(instance, ensure_ascii=False) + "\n") + fout.flush() if __name__ == "__main__": logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + level=logging.INFO, ) main() From 1b60da34375885030e59ebae288089f71b537863 Mon Sep 17 00:00:00 2001 From: Hirokazu Kiyomaru Date: Mon, 27 Jan 2025 15:26:23 +0900 Subject: [PATCH 7/9] tweak --- corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py | 6 +++--- 1 file 
changed, 3 insertions(+), 3 deletions(-) diff --git a/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py b/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py index 7d42f1f6..37781c22 100644 --- a/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py +++ b/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py @@ -70,13 +70,13 @@ def main() -> None: args = parser.parse_args() input_dir = pathlib.Path(args.input_dir) - file_paths = sorted(input_dir.glob("**/*.json")) + file_paths = input_dir.glob("**/*.json") output_file = pathlib.Path(args.output_file) with output_file.open("wt", encoding="utf-8") as fout: - for i, file_path in enumerate(file_paths): - logging.info(f"Processing {file_path} ({i + 1}/{len(file_paths)})") + for file_path in file_paths: + logging.info(f"Processing {file_path}") with file_path.open("rt", encoding="utf-8") as fin: dat = KokkaiGiji(**json.load(fin)) From f4ff721cc51af9af760d6165df89e80bb0b4c56a Mon Sep 17 00:00:00 2001 From: Hirokazu Kiyomaru Date: Mon, 27 Jan 2025 15:27:55 +0900 Subject: [PATCH 8/9] tweak --- corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py b/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py index 37781c22..5f3e25cb 100644 --- a/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py +++ b/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py @@ -1,3 +1,8 @@ +"""Preprocess the Kokkai Giji corpus. 
+ +Usage: + python format.py --input-dir /path/to/input --output-file /path/to/output +""" import argparse import dataclasses import json From 415e270e932d3e2e702364387395fff11be604c9 Mon Sep 17 00:00:00 2001 From: Hirokazu Kiyomaru Date: Mon, 27 Jan 2025 17:22:16 +0900 Subject: [PATCH 9/9] tweak --- corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py | 1 - 1 file changed, 1 deletion(-) diff --git a/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py b/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py index 5f3e25cb..e29fd245 100644 --- a/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py +++ b/corpus/llm-jp-corpus-v4/ja/ja_kokkai_giji/format.py @@ -103,7 +103,6 @@ def main() -> None: }, } fout.write(json.dumps(instance, ensure_ascii=False) + "\n") - fout.flush() if __name__ == "__main__":