From dfeddbd6728523cb78a879246bd07f02cdbc7ab7 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Thu, 12 Sep 2024 18:44:26 +0200 Subject: [PATCH 001/119] Fix typos (most of them found by codespell) Signed-off-by: Stefan Weil --- Makefile | 2 +- src/ocrd/mets_server.py | 2 +- src/ocrd_network/processing_server.py | 2 +- tests/network/test_modules_mets_server_proxy.py | 2 +- tests/test_resolver.py | 2 +- tests/test_resource_manager.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 4997066d1b..70b047c8a9 100644 --- a/Makefile +++ b/Makefile @@ -178,7 +178,7 @@ build: # (Re)install the tool install: #build - # not stricttly necessary but a precaution against outdated python build tools, https://github.com/OCR-D/core/pull/1166 + # not strictly necessary but a precaution against outdated python build tools, https://github.com/OCR-D/core/pull/1166 $(PIP) install -U pip wheel $(PIP_INSTALL) . $(PIP_INSTALL_CONFIG_OPTION) @# workaround for shapely#1598 diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 0d4c0a0785..c73dbb9b99 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -303,7 +303,7 @@ def add_file( class MpxReq: - """This class wrapps the request bodies needed for the tcp forwarding + """This class wraps the request bodies needed for the tcp forwarding For every mets-server-call like find_files or workspace_path a special request_body is needed to call `MetsServerProxy.forward_tcp_request`. These are created by this functions. diff --git a/src/ocrd_network/processing_server.py b/src/ocrd_network/processing_server.py index 34c22e5cf6..a9948ccf80 100644 --- a/src/ocrd_network/processing_server.py +++ b/src/ocrd_network/processing_server.py @@ -320,7 +320,7 @@ async def forward_tcp_request_to_uds_mets_server(self, request: Request) -> Dict """Forward mets-server-request A processor calls a mets related method like add_file with ClientSideOcrdMets. This sends - a request to this endpoint. This request contains all infomation neccessary to make a call + a request to this endpoint. This request contains all information necessary to make a call to the uds-mets-server. This information is used by `MetsServerProxy` to make a the call to the local (local for the processing-server) reachable the uds-mets-server. 
""" diff --git a/tests/network/test_modules_mets_server_proxy.py b/tests/network/test_modules_mets_server_proxy.py index 8b8c0d35f7..f19d7e415e 100644 --- a/tests/network/test_modules_mets_server_proxy.py +++ b/tests/network/test_modules_mets_server_proxy.py @@ -119,7 +119,7 @@ def test_find_files(start_uds_mets_server): {"file_grp": test_file_group} ) response_dict = MetsServerProxy().forward_tcp_request(request_body=request_body) - assert len(response_dict["files"]) == 3, "Expected to find exatly 3 matching files" + assert len(response_dict["files"]) == 3, "Expected to find exactly 3 matching files" request_body = MpxReq.find_files( TEST_WORKSPACE_DIR, {"file_grp": test_non_existing_file_group} diff --git a/tests/test_resolver.py b/tests/test_resolver.py index 16dfd03d56..aa0d802926 100644 --- a/tests/test_resolver.py +++ b/tests/test_resolver.py @@ -118,7 +118,7 @@ def test_workspace_from_url_kant_with_resources(mock_request, tmp_path): @patch.object(Session, "get") def test_workspace_from_url_kant_with_resources_existing_local(mock_request, tmp_path): """ - Fail with clobber_mets=False, succeeed with clobber_mets=True + Fail with clobber_mets=False, succeed with clobber_mets=True """ # arrange diff --git a/tests/test_resource_manager.py b/tests/test_resource_manager.py index 653167e10a..286f6ea6b0 100644 --- a/tests/test_resource_manager.py +++ b/tests/test_resource_manager.py @@ -80,7 +80,7 @@ def test_resources_manager_from_environment(tmp_path, monkeypatch): assert mgr.userdir == tmp_path -def test_resources_manager_config_explicite(tmp_path): +def test_resources_manager_config_explicit(tmp_path): # act from ocrd.resource_manager import OcrdResourceManager From 5663f4882834fd1430c5c1d55ca438a2406ce9ec Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 14 Sep 2024 00:10:12 +0200 Subject: [PATCH 002/119] processor CLI: delegate --resolve-resource, too --- src/ocrd/decorators/__init__.py | 4 +++- src/ocrd/decorators/ocrd_cli_options.py | 1 + src/ocrd/processor/helpers.py | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py index 580a75b0c0..7c2dd9717c 100644 --- a/src/ocrd/decorators/__init__.py +++ b/src/ocrd/decorators/__init__.py @@ -36,6 +36,7 @@ def ocrd_cli_wrap_processor( profile_file=None, version=False, overwrite=False, + resolve_resource=None, show_resource=None, list_resources=False, # ocrd_network params start # @@ -50,7 +51,7 @@ def ocrd_cli_wrap_processor( if not sys.argv[1:]: processorClass(None, show_help=True) sys.exit(1) - if dump_json or dump_module_dir or help or version or show_resource or list_resources: + if dump_json or dump_module_dir or help or version or resolve_resource or show_resource or list_resources: processorClass( None, dump_json=dump_json, @@ -58,6 +59,7 @@ def ocrd_cli_wrap_processor( show_help=help, subcommand=subcommand, show_version=version, + resolve_resource=resolve_resource, show_resource=show_resource, list_resources=list_resources ) diff --git a/src/ocrd/decorators/ocrd_cli_options.py b/src/ocrd/decorators/ocrd_cli_options.py index e640a20032..9c87034ab4 100644 --- a/src/ocrd/decorators/ocrd_cli_options.py +++ b/src/ocrd/decorators/ocrd_cli_options.py @@ -41,6 +41,7 @@ def cli(mets_url): option('--address', type=ServerAddressParamType()), option('--queue', type=QueueServerParamType()), option('--database', type=DatabaseParamType()), + option('-R', '--resolve-resource'), option('-C', '--show-resource'), option('-L', '--list-resources', is_flag=True, 
default=False), option('-J', '--dump-json', is_flag=True, default=False), diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index f5b6010636..921cfeac80 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -290,6 +290,7 @@ def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None) ''' information_options = '''\ + -R, --resolve-resource RESNAME Show the full path of processor resource RESNAME -C, --show-resource RESNAME Dump the content of processor resource RESNAME -L, --list-resources List names of processor resources -J, --dump-json Dump tool description as JSON From 853bdb570c861b98debf1c2af60e84f39db47fbf Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Aug 2024 23:12:49 +0200 Subject: [PATCH 003/119] test_mets_server: fix arg vs kwarg --- tests/test_mets_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py index 58ff6e2a9b..a313ed5239 100644 --- a/tests/test_mets_server.py +++ b/tests/test_mets_server.py @@ -55,10 +55,10 @@ def add_file_server(x): mets_server_url, i = x workspace_server = Workspace(resolver=Resolver(), directory=WORKSPACE_DIR, mets_server_url=mets_server_url) workspace_server.add_file( + 'FOO', local_filename=f'local_filename{i}', mimetype=MIMETYPE_PAGE, page_id=f'page{i}', - file_grp='FOO', file_id=f'FOO_page{i}_foo{i}', # url=f'url{i}' ) From 33c73866e5a289d83354c382b9cc34d7038027cd Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Aug 2024 23:13:46 +0200 Subject: [PATCH 004/119] mets_server: ClientSideOcrdMets needs OcrdMets-like kwargs (without deprecation) --- src/ocrd/mets_server.py | 19 +++++++++---------- tests/test_mets_server.py | 2 +- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 0d4c0a0785..da6e873c06 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -247,11 +247,9 @@ def add_agent(self, *args, **kwargs): ).json() return OcrdAgentModel.create(**kwargs) - @deprecated_alias(ID="file_id") - @deprecated_alias(pageId="page_id") - @deprecated_alias(fileGrp="file_grp") def find_files(self, **kwargs): self.log.debug("find_files(%s)", kwargs) + # translate from native OcrdMets kwargs to OcrdMetsServer REST params if "pageId" in kwargs: kwargs["page_id"] = kwargs.pop("pageId") if "ID" in kwargs: @@ -277,14 +275,14 @@ def find_files(self, **kwargs): def find_all_files(self, *args, **kwargs): return list(self.find_files(*args, **kwargs)) - @deprecated_alias(pageId="page_id") - @deprecated_alias(ID="file_id") def add_file( - self, file_grp, content=None, file_id=None, url=None, local_filename=None, mimetype=None, page_id=None, **kwargs + self, file_grp, content=None, ID=None, url=None, local_filename=None, mimetype=None, pageId=None, **kwargs ): data = OcrdFileModel.create( - file_id=file_id, file_grp=file_grp, page_id=page_id, mimetype=mimetype, url=url, - local_filename=local_filename + file_grp=file_grp, + # translate from native OcrdMets kwargs to OcrdMetsServer REST params + file_id=ID, page_id=pageId, + mimetype=mimetype, url=url, local_filename=local_filename ) if not self.multiplexing_mode: @@ -297,8 +295,9 @@ def add_file( raise RuntimeError(f"Add file failed: Msg: {r['error']}") return ClientSideOcrdFile( - None, ID=file_id, fileGrp=file_grp, url=url, pageId=page_id, mimetype=mimetype, - local_filename=local_filename + None, fileGrp=file_grp, + ID=ID, pageId=pageId, + url=url, 
mimetype=mimetype, local_filename=local_filename ) diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py index a313ed5239..1487617a71 100644 --- a/tests/test_mets_server.py +++ b/tests/test_mets_server.py @@ -236,7 +236,7 @@ def test_reload(start_mets_server : Tuple[str, Workspace]): assert len(workspace_server.mets.find_all_files()) == 35, '35 files total' assert len(workspace_server_copy.mets.find_all_files()) == 35, '35 files total' - workspace_server_copy.add_file('FOO', ID='foo', mimetype='foo/bar', local_filename='mets.xml', pageId='foo') + workspace_server_copy.add_file('FOO', file_id='foo', mimetype='foo/bar', local_filename='mets.xml', page_id='foo') assert len(workspace_server.mets.find_all_files()) == 35, '35 files total' assert len(workspace_server_copy.mets.find_all_files()) == 36, '36 files total' From 37f7cda00f53c3f8f01a722c87c2f965dc7c7b68 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Aug 2024 23:52:09 +0200 Subject: [PATCH 005/119] use up-to-date kwargs (avoiding old deprecations) --- tests/data/__init__.py | 4 ++-- tests/processor/test_processor.py | 10 +++++----- tests/validator/test_page_validator.py | 9 +++++---- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/tests/data/__init__.py b/tests/data/__init__.py index 93a2ea49a9..c7fcfb021c 100644 --- a/tests/data/__init__.py +++ b/tests/data/__init__.py @@ -52,9 +52,9 @@ def process(self): file_id = make_file_id(input_file, self.output_file_grp) # print(input_file.ID, file_id) self.workspace.add_file( - ID=file_id, + file_id=file_id, file_grp=self.output_file_grp, - pageId=input_file.pageId, + page_id=input_file.pageId, mimetype=input_file.mimetype, local_filename=os.path.join(self.output_file_grp, file_id), content='CONTENT') diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 784f68fc3d..3a47d2c23f 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -125,8 +125,8 @@ def test_run_input(self): def test_run_output0(self): with pushd_popd(tempdir=True) as tempdir: ws = self.resolver.workspace_from_nothing(directory=tempdir) - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001') - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0002') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0002') run_processor(DummyProcessorWithOutput, workspace=ws, input_file_grp="GRP1", output_file_grp="OCR-D-OUT") @@ -135,10 +135,10 @@ def test_run_output0(self): def test_run_output_overwrite(self): with pushd_popd(tempdir=True) as tempdir: ws = self.resolver.workspace_from_nothing(directory=tempdir) - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001') - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0002') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0002') ws.overwrite_mode = True - ws.add_file('OCR-D-OUT', mimetype=MIMETYPE_PAGE, ID='OCR-D-OUT_phys_0001', pageId='phys_0001') + ws.add_file('OCR-D-OUT', mimetype=MIMETYPE_PAGE, file_id='OCR-D-OUT_phys_0001', page_id='phys_0001') ws.overwrite_mode = False with pytest.raises(Exception) as exc: run_processor(DummyProcessorWithOutput, workspace=ws, diff --git a/tests/validator/test_page_validator.py b/tests/validator/test_page_validator.py index 
79e92d90fa..e6aaff1523 100644 --- a/tests/validator/test_page_validator.py +++ b/tests/validator/test_page_validator.py @@ -16,9 +16,10 @@ def test_validate_err(self): PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME, page_textequiv_strategy='best') # test with deprecated name with self.assertRaisesRegex(Exception, 'page_textequiv_strategy best not implemented'): - PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME, strategy='best') + with self.assertWarnsRegex(DeprecationWarning, r'use page_textequiv_strategy'): + PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME, strategy='best') with self.assertRaisesRegex(Exception, 'page_textequiv_consistency level superstrictest not implemented'): - PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME, page_textequiv_consistency='superstrictest', strategy='first') + PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME, page_textequiv_consistency='superstrictest', page_textequiv_strategy='first') def test_validate_filename(self): report = PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME) @@ -44,7 +45,7 @@ def test_validate_lax(self): report = PageValidator.validate(ocrd_page=ocrd_page) self.assertEqual(len([e for e in report.errors if isinstance(e, ConsistencyError)]), 26, '26 textequiv consistency errors - strict') - report = PageValidator.validate(ocrd_page=ocrd_page, strictness='lax') + report = PageValidator.validate(ocrd_page=ocrd_page, page_textequiv_consistency='lax') self.assertEqual(len([e for e in report.errors if isinstance(e, ConsistencyError)]), 1, '1 textequiv consistency errors - lax') def test_validate_multi_textequiv_first(self): @@ -89,7 +90,7 @@ def test_fix(self): ocrd_page = parse(FAULTY_GLYPH_PAGE_FILENAME, silence=True) report = PageValidator.validate(ocrd_page=ocrd_page) self.assertEqual(len([e for e in report.errors if isinstance(e, ConsistencyError)]), 17, '17 textequiv consistency errors') - PageValidator.validate(ocrd_page=ocrd_page, strictness='fix') + PageValidator.validate(ocrd_page=ocrd_page, page_textequiv_consistency='fix') report = PageValidator.validate(ocrd_page=ocrd_page) self.assertEqual(len([e for e in report.errors if isinstance(e, ConsistencyError)]), 0, 'no more textequiv consistency errors') From 44946baa17d1c44d9896ef35103a97e2f48a6d2a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Aug 2024 23:52:59 +0200 Subject: [PATCH 006/119] hide/test expected deprecation warnings --- tests/test_resolver.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/tests/test_resolver.py b/tests/test_resolver.py index 16dfd03d56..c2575b6086 100644 --- a/tests/test_resolver.py +++ b/tests/test_resolver.py @@ -292,20 +292,21 @@ def test_resolve_mets_arguments(): https://github.com/OCR-D/core/issues/517 """ resolver = Resolver() - assert resolver.resolve_mets_arguments(None, None, None, None) == (str(Path.cwd()), str(Path.cwd() / 'mets.xml'), 'mets.xml', None) - assert resolver.resolve_mets_arguments('/', None, 'mets.xml', None) == ('/', '/mets.xml', 'mets.xml', None) - assert resolver.resolve_mets_arguments('/foo', '/foo/foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) - assert resolver.resolve_mets_arguments(None, '/foo/foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) - assert resolver.resolve_mets_arguments('/foo', 'foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) - assert resolver.resolve_mets_arguments('/foo', 'http://bar/foo.xml', None, None) == ('/foo', 
'http://bar/foo.xml', 'foo.xml', None) - with pytest.raises(ValueError, match="Use either --mets or --mets-basename, not both"): - resolver.resolve_mets_arguments('/', '/foo/bar', 'foo.xml', None) - with pytest.raises(ValueError, match="inconsistent with --directory"): - resolver.resolve_mets_arguments('/foo', '/bar/foo.xml', None, None) - with pytest.warns(DeprecationWarning): - resolver.resolve_mets_arguments('/foo', None, 'not_mets.xml', None) - with pytest.raises(ValueError, match=r"--mets is an http\(s\) URL but no --directory was given"): - resolver.resolve_mets_arguments(None, 'http://bar/foo.xml', None, None) + with pytest.warns(DeprecationWarning, match='--mets-basename'): + assert resolver.resolve_mets_arguments(None, None, None, None) == (str(Path.cwd()), str(Path.cwd() / 'mets.xml'), 'mets.xml', None) + assert resolver.resolve_mets_arguments('/', None, 'mets.xml', None) == ('/', '/mets.xml', 'mets.xml', None) + assert resolver.resolve_mets_arguments('/foo', '/foo/foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) + assert resolver.resolve_mets_arguments(None, '/foo/foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) + assert resolver.resolve_mets_arguments('/foo', 'foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) + assert resolver.resolve_mets_arguments('/foo', 'http://bar/foo.xml', None, None) == ('/foo', 'http://bar/foo.xml', 'foo.xml', None) + with pytest.raises(ValueError, match="Use either --mets or --mets-basename, not both"): + resolver.resolve_mets_arguments('/', '/foo/bar', 'foo.xml', None) + with pytest.raises(ValueError, match="inconsistent with --directory"): + resolver.resolve_mets_arguments('/foo', '/bar/foo.xml', None, None) + with pytest.warns(DeprecationWarning): + resolver.resolve_mets_arguments('/foo', None, 'not_mets.xml', None) + with pytest.raises(ValueError, match=r"--mets is an http\(s\) URL but no --directory was given"): + resolver.resolve_mets_arguments(None, 'http://bar/foo.xml', None, None) if __name__ == '__main__': main(__file__) From d0962d67ee2e5da332ff0385e417925ab1581481 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Aug 2024 23:53:25 +0200 Subject: [PATCH 007/119] improve output in case of assertion failures --- tests/cli/test_validate.py | 22 ++++++++++----------- tests/validator/test_ocrd_tool_validator.py | 4 ++-- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/tests/cli/test_validate.py b/tests/cli/test_validate.py index 36ee3e5995..bf74a84c59 100644 --- a/tests/cli/test_validate.py +++ b/tests/cli/test_validate.py @@ -57,24 +57,24 @@ def test_validate_ocrd_tool(self): json_path.write_text(OCRD_TOOL) # normal call - code, _, _ = self.invoke_cli(validate_cli, ['tool-json', str(json_path)]) - self.assertEqual(code, 0) + code, _, err = self.invoke_cli(validate_cli, ['tool-json', str(json_path)]) + self.assertEqual(code, 0, err) # relative path with pushd_popd(tempdir): - code, _, _ = self.invoke_cli(validate_cli, ['tool-json', 'ocrd-tool.json']) - self.assertEqual(code, 0) + code, _, err = self.invoke_cli(validate_cli, ['tool-json', 'ocrd-tool.json']) + self.assertEqual(code, 0, err) # default path with pushd_popd(tempdir): - code, _, _ = self.invoke_cli(validate_cli, ['tool-json']) - self.assertEqual(code, 0) + code, _, err = self.invoke_cli(validate_cli, ['tool-json']) + self.assertEqual(code, 0, err) def test_validate_parameter(self): with TemporaryDirectory() as tempdir: json_path = Path(tempdir, 'ocrd-tool.json') json_path.write_text(OCRD_TOOL) with 
pushd_popd(tempdir): - code, _, _ = self.invoke_cli(validate_cli, ['parameters', 'ocrd-tool.json', 'ocrd-xyz', dumps({"baz": "foo"})]) - self.assertEqual(code, 0) + code, _, err = self.invoke_cli(validate_cli, ['parameters', 'ocrd-tool.json', 'ocrd-xyz', dumps({"baz": "foo"})]) + self.assertEqual(code, 0, err) def test_validate_page(self): page_path = assets.path_to('glyph-consistency/data/OCR-D-GT-PAGE/FAULTY_GLYPHS.xml') @@ -84,11 +84,11 @@ def test_validate_page(self): def test_validate_tasks(self): # simple - code, _, _ = self.invoke_cli(validate_cli, ['tasks', + code, _, err = self.invoke_cli(validate_cli, ['tasks', "sample-processor-required-param -I FOO -O OUT1 -p '{\"param1\": true}'", "sample-processor-required-param -I FOO -O OUT2 -p '{\"param1\": true}'", ]) - self.assertEqual(code, 0) + self.assertEqual(code, 0, err) # with workspace code, out, err = self.invoke_cli(validate_cli, ['tasks', '--workspace', assets.path_to('kant_aufklaerung_1784/data'), @@ -96,7 +96,7 @@ def test_validate_tasks(self): "sample-processor-required-param -I OCR-D-IMG,OCR-D-GT-PAGE -O OUT2 -p '{\"param1\": true}'", ]) print('code=%s out=%s err=%s' % (code, out, err)) - self.assertEqual(code, 0) + self.assertEqual(code, 0, err) if __name__ == '__main__': diff --git a/tests/validator/test_ocrd_tool_validator.py b/tests/validator/test_ocrd_tool_validator.py index 3ad40d8645..6d4616c2db 100644 --- a/tests/validator/test_ocrd_tool_validator.py +++ b/tests/validator/test_ocrd_tool_validator.py @@ -29,7 +29,7 @@ def setUp(self): def test_smoke(self): report = OcrdToolValidator.validate(self.ocrd_tool) - self.assertEqual(report.is_valid, True) + self.assertTrue(report.is_valid, report) def test_additional_props(self): self.ocrd_tool['not-allowed'] = 'YUP' @@ -48,7 +48,7 @@ def test_file_param_ok(self): ocrd_tool = json.loads(skeleton) ocrd_tool['tools']['ocrd-xyz']['parameters'] = {"file-param": {"description": "...", "type": "string", "content-type": 'application/rdf+xml'}} report = OcrdToolValidator.validate(ocrd_tool) - self.assertEqual(report.is_valid, True) + self.assertTrue(report.is_valid, report) # Not restricted anymore since spec 3.3.0 # def test_file_param_bad_content_types(self): From 061f0231a148f09943d1c5ee35f456ad502f2755 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 15 Aug 2024 17:34:43 +0200 Subject: [PATCH 008/119] allow "from ocrd_models import OcrdPage --- src/ocrd_models/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ocrd_models/__init__.py b/src/ocrd_models/__init__.py index a89ee1dec8..330fefe97d 100644 --- a/src/ocrd_models/__init__.py +++ b/src/ocrd_models/__init__.py @@ -5,5 +5,6 @@ from .ocrd_exif import OcrdExif from .ocrd_file import OcrdFile, ClientSideOcrdFile from .ocrd_mets import OcrdMets +from .ocrd_page import OcrdPage from .ocrd_xml_base import OcrdXmlDocument from .report import ValidationReport From d2f92d1e4814d810d10b5d31a63f730568c11e29 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 16 Aug 2024 15:13:58 +0200 Subject: [PATCH 009/119] ocrd_utils: forgot to export scale_coordinates at toplvl --- src/ocrd_utils/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ocrd_utils/__init__.py b/src/ocrd_utils/__init__.py index b5bbcae121..836f01dce4 100644 --- a/src/ocrd_utils/__init__.py +++ b/src/ocrd_utils/__init__.py @@ -13,6 +13,7 @@ :py:meth:`ocrd.workspace.Workspace.image_from_segment`.) 
* :py:func:`rotate_coordinates`, + :py:func:`scale_coordinates`, :py:func:`shift_coordinates`, :py:func:`transpose_coordinates`, :py:func:`transform_coordinates` @@ -148,6 +149,7 @@ polygon_mask, rotate_coordinates, rotate_image, + scale_coordinates, shift_coordinates, transform_coordinates, transpose_coordinates, From c6c5c42a1d37478a6c8a4c43b5fd61c69249f7b5 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 16 Aug 2024 15:01:19 +0200 Subject: [PATCH 010/119] fix imports --- src/ocrd/decorators/parameter_option.py | 2 +- src/ocrd/workspace.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ocrd/decorators/parameter_option.py b/src/ocrd/decorators/parameter_option.py index 0fbe3e0577..55abbc2a53 100644 --- a/src/ocrd/decorators/parameter_option.py +++ b/src/ocrd/decorators/parameter_option.py @@ -1,10 +1,10 @@ from click import option -#from ocrd_utils import parse_json_string_or_file __all__ = ['parameter_option', 'parameter_override_option'] def _handle_param_option(ctx, param, value): + from ocrd_utils import parse_json_string_or_file return parse_json_string_or_file(*list(value)) parameter_option = option('-p', '--parameter', diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index ff856011be..b4795f3e89 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -24,6 +24,7 @@ coordinates_of_segment, adjust_canvas_to_rotation, adjust_canvas_to_transposition, + scale_coordinates, shift_coordinates, rotate_coordinates, transform_coordinates, From 245778c74a373c07a007d5deb982197d0b22d569 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 20 Aug 2024 08:05:24 +0200 Subject: [PATCH 011/119] Processor.zip_input_files: warning instead of exception for missing input files --- src/ocrd/processor/base.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 8303413933..5113faf3da 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -377,16 +377,9 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): pageId=self.page_id, fileGrp=ifg, mimetype=mimetype), # sort by MIME type so PAGE comes before images key=lambda file_: file_.mimetype) - # Warn if no files found but pageId was specified because that - # might be because of invalid page_id (range) - if self.page_id and not files_: - msg = (f"Could not find any files for --page-id {self.page_id} - " - f"compare '{self.page_id}' with the output of 'orcd workspace list-page'.") - if on_error == 'abort': - raise ValueError(msg) - LOG.warning(msg) for file_ in files_: if not file_.pageId: + # ignore document-global files continue ift = pages.setdefault(file_.pageId, [None]*len(ifgs)) if ift[i]: @@ -431,13 +424,15 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): else: LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg) ift[i] = file_ + # Warn if no files found but pageId was specified, because that might be due to invalid page_id (range) + if self.page_id and not any(pages): + LOG.critical(f"Could not find any files for selected pageId {self.page_id}") ifts = list() for page, ifiles in pages.items(): for i, ifg in enumerate(ifgs): if not ifiles[i]: # other fallback options? 
- LOG.error('found no page %s in file group %s', - page, ifg) + LOG.error(f'Found no page {page} in file group {ifg}') if ifiles[0] or not require_first: ifts.append(tuple(ifiles)) return ifts From 1f7b57fc70fe26cb5399db54edb4a4748184327d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 21 Aug 2024 18:05:38 +0200 Subject: [PATCH 012/119] Processor.zip_input_files: more verbose log msg Co-authored-by: Konstantin Baierer --- src/ocrd/processor/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 5113faf3da..9e5f5aead6 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -426,7 +426,7 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): ift[i] = file_ # Warn if no files found but pageId was specified, because that might be due to invalid page_id (range) if self.page_id and not any(pages): - LOG.critical(f"Could not find any files for selected pageId {self.page_id}") + LOG.critical(f"Could not find any files for selected pageId {self.page_id}.\ncompare '{self.page_id}' with the output of 'orcd workspace list-page'.") ifts = list() for page, ifiles in pages.items(): for i, ifg in enumerate(ifgs): From 35bdb39773dd26d238d00c00f9d3f7c2c711ac4a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Aug 2024 22:28:29 +0200 Subject: [PATCH 013/119] tests report.is_valid: improve output on failure --- tests/cli/test_validate.py | 23 +++++++++---------- tests/validator/test_json_validator.py | 6 ++--- tests/validator/test_ocrd_tool_validator.py | 4 ++-- tests/validator/test_parameter_validator.py | 2 +- .../validator/test_resource_list_validator.py | 3 +-- tests/validator/test_xsd_validator.py | 8 +++---- 6 files changed, 22 insertions(+), 24 deletions(-) diff --git a/tests/cli/test_validate.py b/tests/cli/test_validate.py index bf74a84c59..cc58df6540 100644 --- a/tests/cli/test_validate.py +++ b/tests/cli/test_validate.py @@ -57,24 +57,24 @@ def test_validate_ocrd_tool(self): json_path.write_text(OCRD_TOOL) # normal call - code, _, err = self.invoke_cli(validate_cli, ['tool-json', str(json_path)]) - self.assertEqual(code, 0, err) + code, out, err = self.invoke_cli(validate_cli, ['tool-json', str(json_path)]) + self.assertEqual(code, 0, out + err) # relative path with pushd_popd(tempdir): - code, _, err = self.invoke_cli(validate_cli, ['tool-json', 'ocrd-tool.json']) - self.assertEqual(code, 0, err) + code, out, err = self.invoke_cli(validate_cli, ['tool-json', 'ocrd-tool.json']) + self.assertEqual(code, 0, out + err) # default path with pushd_popd(tempdir): - code, _, err = self.invoke_cli(validate_cli, ['tool-json']) - self.assertEqual(code, 0, err) + code, out, err = self.invoke_cli(validate_cli, ['tool-json']) + self.assertEqual(code, 0, out + err) def test_validate_parameter(self): with TemporaryDirectory() as tempdir: json_path = Path(tempdir, 'ocrd-tool.json') json_path.write_text(OCRD_TOOL) with pushd_popd(tempdir): - code, _, err = self.invoke_cli(validate_cli, ['parameters', 'ocrd-tool.json', 'ocrd-xyz', dumps({"baz": "foo"})]) - self.assertEqual(code, 0, err) + code, out, err = self.invoke_cli(validate_cli, ['parameters', 'ocrd-tool.json', 'ocrd-xyz', dumps({"baz": "foo"})]) + self.assertEqual(code, 0, out + err) def test_validate_page(self): page_path = assets.path_to('glyph-consistency/data/OCR-D-GT-PAGE/FAULTY_GLYPHS.xml') @@ -84,19 +84,18 @@ def test_validate_page(self): def 
test_validate_tasks(self): # simple - code, _, err = self.invoke_cli(validate_cli, ['tasks', + code, out, err = self.invoke_cli(validate_cli, ['tasks', "sample-processor-required-param -I FOO -O OUT1 -p '{\"param1\": true}'", "sample-processor-required-param -I FOO -O OUT2 -p '{\"param1\": true}'", ]) - self.assertEqual(code, 0, err) + self.assertEqual(code, 0, out + err) # with workspace code, out, err = self.invoke_cli(validate_cli, ['tasks', '--workspace', assets.path_to('kant_aufklaerung_1784/data'), "sample-processor-required-param -I OCR-D-IMG,OCR-D-GT-PAGE -O OUT1 -p '{\"param1\": true}'", "sample-processor-required-param -I OCR-D-IMG,OCR-D-GT-PAGE -O OUT2 -p '{\"param1\": true}'", ]) - print('code=%s out=%s err=%s' % (code, out, err)) - self.assertEqual(code, 0, err) + self.assertEqual(code, 0, out + err) if __name__ == '__main__': diff --git a/tests/validator/test_json_validator.py b/tests/validator/test_json_validator.py index 8a8387d4b6..bd756879bc 100644 --- a/tests/validator/test_json_validator.py +++ b/tests/validator/test_json_validator.py @@ -20,18 +20,18 @@ def setUp(self): def test_validate_string(self): report = JsonValidator.validate('{}', {}) - self.assertTrue(report.is_valid) + self.assertTrue(report.is_valid, str(report.to_xml())) def test_defaults_set(self): obj = {'bar': 2000} report = self.defaults_validator._validate(obj) - self.assertTrue(report.is_valid) + self.assertTrue(report.is_valid, str(report.to_xml())) self.assertEqual(obj, {'foo': 3000, 'bar': 2000}) def test_properr(self): obj = {'bar': 100, 'quux': {}} report = self.defaults_validator._validate(obj) - self.assertFalse(report.is_valid) + self.assertFalse(report.is_valid, str(report.to_xml())) self.assertEqual(len(report.errors), 1) diff --git a/tests/validator/test_ocrd_tool_validator.py b/tests/validator/test_ocrd_tool_validator.py index 6d4616c2db..70d40c2f2a 100644 --- a/tests/validator/test_ocrd_tool_validator.py +++ b/tests/validator/test_ocrd_tool_validator.py @@ -29,7 +29,7 @@ def setUp(self): def test_smoke(self): report = OcrdToolValidator.validate(self.ocrd_tool) - self.assertTrue(report.is_valid, report) + self.assertTrue(report.is_valid, str(report.to_xml())) def test_additional_props(self): self.ocrd_tool['not-allowed'] = 'YUP' @@ -48,7 +48,7 @@ def test_file_param_ok(self): ocrd_tool = json.loads(skeleton) ocrd_tool['tools']['ocrd-xyz']['parameters'] = {"file-param": {"description": "...", "type": "string", "content-type": 'application/rdf+xml'}} report = OcrdToolValidator.validate(ocrd_tool) - self.assertTrue(report.is_valid, report) + self.assertTrue(report.is_valid, str(report.to_xml())) # Not restricted anymore since spec 3.3.0 # def test_file_param_bad_content_types(self): diff --git a/tests/validator/test_parameter_validator.py b/tests/validator/test_parameter_validator.py index f0d9d41d2c..297a149064 100644 --- a/tests/validator/test_parameter_validator.py +++ b/tests/validator/test_parameter_validator.py @@ -42,7 +42,7 @@ def test_default_assignment(self): }) obj = {'baz': '23'} report = validator.validate(obj) - self.assertTrue(report.is_valid) + self.assertTrue(report.is_valid, str(report.to_xml())) self.assertEqual(obj, {'baz': '23', "num-param": 1}) def test_min_max(): diff --git a/tests/validator/test_resource_list_validator.py b/tests/validator/test_resource_list_validator.py index eb95d9b1ea..cc63c30ea7 100644 --- a/tests/validator/test_resource_list_validator.py +++ b/tests/validator/test_resource_list_validator.py @@ -22,8 +22,7 @@ def reslist(): def 
test_resource_list_validator(reslist): report = OcrdResourceListValidator.validate(reslist) - print(report.errors) - assert report.is_valid == True + assert report.is_valid, str(report.to_xml()) if __name__ == '__main__': main(__file__) diff --git a/tests/validator/test_xsd_validator.py b/tests/validator/test_xsd_validator.py index d0150338dd..50b3851ffc 100644 --- a/tests/validator/test_xsd_validator.py +++ b/tests/validator/test_xsd_validator.py @@ -37,22 +37,22 @@ def test_mets_empty(self): def test_validate_simple_protected_str(self): val = XsdValidator(XSD_METS_URL) report = val._validate(self.ws.mets.to_xml()) - self.assertTrue(report.is_valid) + self.assertTrue(report.is_valid, str(report.to_xml())) def test_validate_simple_protected_doc(self): val = XsdValidator(XSD_METS_URL) report = val._validate(self.ws.mets._tree) - self.assertTrue(report.is_valid) + self.assertTrue(report.is_valid, str(report.to_xml())) def test_validate_simple_static_doc(self): report = XsdValidator.validate(XSD_METS_URL, self.ws.mets._tree) - self.assertTrue(report.is_valid) + self.assertTrue(report.is_valid, str(report.to_xml())) class TestXsdPageValidator(TestCase): def test_validate_page_simple_static_doc(self): report = XsdPageValidator.validate(simple_page) - self.assertTrue(report.is_valid) + self.assertTrue(report.is_valid, str(report.to_xml())) if __name__ == '__main__': main(__file__) From e595996d91ae05577cbd3bc133c2f2429d462ff2 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 11:49:08 +0200 Subject: [PATCH 014/119] fix --log-filename (6fc606027a): apply in ocrd_cli_wrap_processor --- src/ocrd/decorators/__init__.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py index 7c2dd9717c..464bb67ed8 100644 --- a/src/ocrd/decorators/__init__.py +++ b/src/ocrd/decorators/__init__.py @@ -1,4 +1,5 @@ import sys +from contextlib import nullcontext from ocrd_utils import ( config, @@ -9,6 +10,7 @@ parse_json_string_with_comments, set_json_key_value_overrides, parse_json_string_or_file, + redirect_stderr_and_stdout_to_file, ) from ocrd_validators import WorkspaceValidator from ocrd_network import ProcessingWorker, ProcessorServer, AgentType @@ -141,7 +143,7 @@ def resolve(name): print("Profiling...") pr = cProfile.Profile() pr.enable() - def exit(): + def goexit(): pr.disable() print("Profiling completed") if profile_file: @@ -150,8 +152,13 @@ def exit(): s = io.StringIO() pstats.Stats(pr, stream=s).sort_stats("cumulative").print_stats() print(s.getvalue()) - atexit.register(exit) - run_processor(processorClass, mets_url=mets, workspace=workspace, **kwargs) + atexit.register(goexit) + if log_filename: + log_ctx = redirect_stderr_and_stdout_to_file(log_filename) + else: + log_ctx = nullcontext() + with log_ctx: + run_processor(processorClass, mets_url=mets, workspace=workspace, **kwargs) def check_and_run_network_agent(ProcessorClass, subcommand: str, address: str, database: str, queue: str): From f21b8d24eaa8320b2ff1c405355ce0b40f116256 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 11:54:07 +0200 Subject: [PATCH 015/119] fix exception --- src/ocrd/resource_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py index 44bbd081bc..e63c5fd015 100644 --- a/src/ocrd/resource_manager.py +++ b/src/ocrd/resource_manager.py @@ -248,7 +248,7 @@ def _download_impl(url, filename, progress_cb=None, 
size=None): if "Content-Disposition" not in r.headers: url = get_url_from_gdrive_confirmation(r.text) except RuntimeError as e: - log.warning("Cannot unwrap Google Drive URL: ", e) + log.warning("Cannot unwrap Google Drive URL: %s", e) with open(filename, 'wb') as f: with requests.get(url, stream=True) as r: r.raise_for_status() From 0cbd3ea906e8c93f940e012f3f7383a1a372c135 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 12:27:33 +0200 Subject: [PATCH 016/119] adapt to PIL.Image moved constants --- src/ocrd/workspace.py | 8 +++---- src/ocrd_utils/image.py | 50 ++++++++++++++++++++--------------------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index b4795f3e89..8b8e89bfca 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -1151,9 +1151,9 @@ def _reflect(log, name, orientation, segment_image, segment_coords, segment_xywh # Transpose in affine coordinate transform: # (consistent with image transposition or AlternativeImage below) transposition = { - 90: Image.ROTATE_90, - 180: Image.ROTATE_180, - 270: Image.ROTATE_270 + 90: Image.Transpose.ROTATE_90, + 180: Image.Transpose.ROTATE_180, + 270: Image.Transpose.ROTATE_270 }.get(orientation) # no default segment_coords['transform'] = transpose_coordinates( segment_coords['transform'], transposition, @@ -1221,5 +1221,5 @@ def _scale(log, name, factor, segment_image, segment_coords, segment_xywh, **kwa segment_image = segment_image.resize((int(segment_image.width * factor), int(segment_image.height * factor)), # slowest, but highest quality: - Image.BICUBIC) + Image.Resampling.BICUBIC) return segment_image, segment_coords, segment_xywh diff --git a/src/ocrd_utils/image.py b/src/ocrd_utils/image.py index 3bc14e6612..6f2524608c 100644 --- a/src/ocrd_utils/image.py +++ b/src/ocrd_utils/image.py @@ -65,10 +65,10 @@ def adjust_canvas_to_transposition(size, method): Return a numpy array of the enlarged width and height. 
""" - if method in [Image.ROTATE_90, - Image.ROTATE_270, - Image.TRANSPOSE, - Image.TRANSVERSE]: + if method in [Image.Transpose.ROTATE_90, + Image.Transpose.ROTATE_270, + Image.Transpose.TRANSPOSE, + Image.Transpose.TRANSVERSE]: size = size[::-1] return size @@ -348,26 +348,26 @@ def transpose_coordinates(transform, method, orig=np.array([0, 0])): calculate the affine coordinate transform corresponding to the composition of both transformations, which is respectively: - - ``PIL.Image.FLIP_LEFT_RIGHT``: + - ``PIL.Image.Transpose.FLIP_LEFT_RIGHT``: entails translation to the center, followed by pure reflection about the y-axis, and subsequent translation back - - ``PIL.Image.FLIP_TOP_BOTTOM``: + - ``PIL.Image.Transpose.FLIP_TOP_BOTTOM``: entails translation to the center, followed by pure reflection about the x-axis, and subsequent translation back - - ``PIL.Image.ROTATE_180``: + - ``PIL.Image.Transpose.ROTATE_180``: entails translation to the center, followed by pure reflection about the origin, and subsequent translation back - - ``PIL.Image.ROTATE_90``: + - ``PIL.Image.Transpose.ROTATE_90``: entails translation to the center, followed by pure rotation by 90° counter-clockwise, and subsequent translation back - - ``PIL.Image.ROTATE_270``: + - ``PIL.Image.Transpose.ROTATE_270``: entails translation to the center, followed by pure rotation by 270° counter-clockwise, and subsequent translation back - - ``PIL.Image.TRANSPOSE``: + - ``PIL.Image.Transpose.TRANSPOSE``: entails translation to the center, followed by pure rotation by 90° counter-clockwise and pure reflection about the x-axis, and subsequent translation back - - ``PIL.Image.TRANSVERSE``: + - ``PIL.Image.Transpose.TRANSVERSE``: entails translation to the center, followed by pure rotation by 90° counter-clockwise and pure reflection about the y-axis, and subsequent translation back @@ -388,13 +388,13 @@ def transpose_coordinates(transform, method, orig=np.array([0, 0])): [0, 0, 1]]) transform = shift_coordinates(transform, -orig) operations = { - Image.FLIP_LEFT_RIGHT: [refly], - Image.FLIP_TOP_BOTTOM: [reflx], - Image.ROTATE_180: [reflx, refly], - Image.ROTATE_90: [rot90], - Image.ROTATE_270: [rot90, reflx, refly], - Image.TRANSPOSE: [rot90, reflx], - Image.TRANSVERSE: [rot90, refly] + Image.Transpose.FLIP_LEFT_RIGHT: [refly], + Image.Transpose.FLIP_TOP_BOTTOM: [reflx], + Image.Transpose.ROTATE_180: [reflx, refly], + Image.Transpose.ROTATE_90: [rot90], + Image.Transpose.ROTATE_270: [rot90, reflx, refly], + Image.Transpose.TRANSPOSE: [rot90, reflx], + Image.Transpose.TRANSVERSE: [rot90, refly] }.get(method) # no default for operation in operations: transform = np.dot(operation, transform) @@ -411,29 +411,29 @@ def transpose_image(image, method): Given a PIL.Image ``image`` and a transposition mode ``method``, apply the respective operation: - - ``PIL.Image.FLIP_LEFT_RIGHT``: + - ``PIL.Image.Transpose.FLIP_LEFT_RIGHT``: all pixels get mirrored at half the width of the image - - ``PIL.Image.FLIP_TOP_BOTTOM``: + - ``PIL.Image.Transpose.FLIP_TOP_BOTTOM``: all pixels get mirrored at half the height of the image - - ``PIL.Image.ROTATE_180``: + - ``PIL.Image.Transpose.ROTATE_180``: all pixels get mirrored at both, the width and half the height of the image, i.e. the image gets rotated by 180° counter-clockwise - - ``PIL.Image.ROTATE_90``: + - ``PIL.Image.Transpose.ROTATE_90``: rows become columns (but counted from the right) and columns become rows, i.e. 
the image gets rotated by 90° counter-clockwise; width becomes height and vice versa - - ``PIL.Image.ROTATE_270``: + - ``PIL.Image.Transpose.ROTATE_270``: rows become columns and columns become rows (but counted from the bottom), i.e. the image gets rotated by 270° counter-clockwise; width becomes height and vice versa - - ``PIL.Image.TRANSPOSE``: + - ``PIL.Image.Transpose.TRANSPOSE``: rows become columns and vice versa, i.e. all pixels get mirrored at the main diagonal; width becomes height and vice versa - - ``PIL.Image.TRANSVERSE``: + - ``PIL.Image.Transpose.TRANSVERSE``: rows become columns (but counted from the right) and columns become rows (but counted from the bottom), i.e. all pixels get mirrored at the opposite diagonal; From 8f8912c14dcccdc485d03e94efe33d9097fcdb78 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 12:31:35 +0200 Subject: [PATCH 017/119] cli.workspace: pass fileGrp as well, improve description --- src/ocrd/cli/workspace.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index 0c70fd3a36..062a373608 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -118,7 +118,7 @@ def workspace_validate(ctx, mets_url, download, skip, page_textequiv_consistency @workspace_cli.command('clone', cls=command_with_replaced_help( (r' \[WORKSPACE_DIR\]', ''))) # XXX deprecated argument @click.option('-f', '--clobber-mets', help="Overwrite existing METS file", default=False, is_flag=True) -@click.option('-a', '--download', is_flag=True, help="Download all files and change location in METS file after cloning") +@click.option('-a', '--download', is_flag=True, help="Download all selected files and add local path references in METS file afterwards") @click.argument('mets_url') @mets_find_options # XXX deprecated @@ -129,8 +129,10 @@ def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mim Create a workspace from METS_URL and return the directory METS_URL can be a URL, an absolute path or a path relative to $PWD. - If METS_URL is not provided, use --mets accordingly. METS_URL can also be an OAI-PMH GetRecord URL wrapping a METS file. + + Additional options pertain to the selection of files / fileGrps / pages + to be downloaded, if --download is used. 
""" LOG = getLogger('ocrd.cli.workspace.clone') if workspace_dir: @@ -143,6 +145,7 @@ def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mim mets_basename=ctx.mets_basename, clobber_mets=clobber_mets, download=download, + fileGrp=file_grp, ID=file_id, pageId=page_id, mimetype=mimetype, @@ -407,7 +410,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_fi if dry_run: log.info('workspace.add_file(%s)' % file_dict) else: - workspace.add_file(fileGrp, ignore=ignore, force=force, **file_dict) + workspace.add_file(fileGrp, ignore=ignore, force=force, **file_dict) # pylint: disable=redundant-keyword-arg # save changes to disk workspace.save_mets() From 6dccfb388209a7e14b61a46e139ad07e72926c3f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 12:35:37 +0200 Subject: [PATCH 018/119] OcrdMets.add_agent: does not have positional args --- src/ocrd/mets_server.py | 2 +- src/ocrd_models/ocrd_mets.py | 4 ++-- tests/model/test_ocrd_mets.py | 2 +- tests/test_workspace.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index da6e873c06..7c22da278d 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -236,7 +236,7 @@ def agents(self): agent_dict["_type"] = agent_dict.pop("type") return [ClientSideOcrdAgent(None, **agent_dict) for agent_dict in agent_dicts] - def add_agent(self, *args, **kwargs): + def add_agent(self, **kwargs): if not self.multiplexing_mode: return self.session.request("POST", f"{self.url}/agent", json=OcrdAgentModel.create(**kwargs).dict()) else: diff --git a/src/ocrd_models/ocrd_mets.py b/src/ocrd_models/ocrd_mets.py index d6da3e1cda..66251a54dc 100644 --- a/src/ocrd_models/ocrd_mets.py +++ b/src/ocrd_models/ocrd_mets.py @@ -198,7 +198,7 @@ def agents(self) -> List[OcrdAgent]: """ return [OcrdAgent(el_agent) for el_agent in self._tree.getroot().findall('mets:metsHdr/mets:agent', NS)] - def add_agent(self, *args, **kwargs) -> OcrdAgent: + def add_agent(self, **kwargs) -> OcrdAgent: """ Add an :py:class:`ocrd_models.ocrd_agent.OcrdAgent` to the list of agents in the ``metsHdr``. 
""" @@ -213,7 +213,7 @@ def add_agent(self, *args, **kwargs) -> OcrdAgent: el_agent_last.addnext(el_agent) except StopIteration: el_metsHdr.insert(0, el_agent) - return OcrdAgent(el_agent, *args, **kwargs) + return OcrdAgent(el_agent, **kwargs) @property def file_groups(self) -> List[str]: diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py index 739db7625a..89742a507e 100644 --- a/tests/model/test_ocrd_mets.py +++ b/tests/model/test_ocrd_mets.py @@ -248,7 +248,7 @@ def test_file_pageid(sbb_sample_01): def test_agent(sbb_sample_01): beforelen = len(sbb_sample_01.agents) - sbb_sample_01.add_agent('foo bar v0.0.1', 'OTHER', 'OTHER', 'YETOTHERSTILL') + sbb_sample_01.add_agent(name='foo bar v0.0.1', _type='OTHER', othertype='OTHER', role='YETOTHERSTILL') assert len(sbb_sample_01.agents) == beforelen + 1 def test_metshdr(): diff --git a/tests/test_workspace.py b/tests/test_workspace.py index c8df9b444b..75e9b6886f 100644 --- a/tests/test_workspace.py +++ b/tests/test_workspace.py @@ -734,7 +734,7 @@ def _fixture_metsDocumentID(tmp_path): def test_agent_before_metsDocumentID(workspace_metsDocumentID): report = WorkspaceValidator.validate(Resolver(), mets_url=workspace_metsDocumentID.mets_target) assert report.is_valid - workspace_metsDocumentID.mets.add_agent('foo bar v0.0.1', 'OTHER', 'OTHER', 'OTHER') + workspace_metsDocumentID.mets.add_agent(name='foo bar v0.0.1', _type='OTHER', othertype='OTHER', role='OTHER') workspace_metsDocumentID.save_mets() report = WorkspaceValidator.validate(Resolver(), mets_url=workspace_metsDocumentID.mets_target) print(report.errors) From 2d85f14d00bd112553e6ee4a0751436e8d1131f7 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 13:15:13 +0200 Subject: [PATCH 019/119] update pylintrc --- .pylintrc | 18 ++++++++---------- src/ocrd/resource_manager.py | 4 ++++ 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/.pylintrc b/.pylintrc index b2125d824c..a4106a1bb7 100644 --- a/.pylintrc +++ b/.pylintrc @@ -1,19 +1,21 @@ [MASTER] -extension-pkg-whitelist=lxml -ignored-modules=cv2,tesserocr,ocrd.model +extension-pkg-whitelist=lxml,pydantic +ignored-modules=cv2,tesserocr,ocrd_models.ocrd_page_generateds +ignore-patterns=.*generateds.* [MESSAGES CONTROL] -ignore-patterns='.*generateds.*' disable = fixme, - E501, + line-too-long, + consider-using-f-string, + logging-fstring-interpolation, trailing-whitespace, logging-not-lazy, inconsistent-return-statements, + disallowed-name, invalid-name, line-too-long, missing-docstring, - no-self-use, wrong-import-order, too-many-nested-blocks, superfluous-parens, @@ -25,13 +27,9 @@ disable = ungrouped-imports, useless-object-inheritance, useless-import-alias, - bad-continuation, no-else-return, logging-not-lazy -[FORMAT] -no-space-check=empty-line - [DESIGN] # Maximum number of arguments for function / method max-args=12 @@ -40,7 +38,7 @@ max-locals=30 # Maximum number of return / yield for function / method body max-returns=12 # Maximum number of branch for function / method body -max-branchs=30 +max-branches=30 # Maximum number of statements in function / method body max-statements=60 # Maximum number of parents for a class (see R0901). 
diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py index e63c5fd015..1fc0409250 100644 --- a/src/ocrd/resource_manager.py +++ b/src/ocrd/resource_manager.py @@ -13,12 +13,16 @@ from gdown.download import get_url_from_gdrive_confirmation from yaml import safe_load, safe_dump +# pylint: disable=wrong-import-position + # https://github.com/OCR-D/core/issues/867 # https://stackoverflow.com/questions/50900727/skip-converting-entities-while-loading-a-yaml-string-using-pyyaml import yaml.constructor yaml.constructor.SafeConstructor.yaml_constructors[u'tag:yaml.org,2002:timestamp'] = \ yaml.constructor.SafeConstructor.yaml_constructors[u'tag:yaml.org,2002:str'] +# pylint: enable=wrong-import-position + from ocrd_validators import OcrdResourceListValidator from ocrd_utils import getLogger, directory_size, get_moduledir, EXT_TO_MIME, nth_url_segment, guess_media_type, config from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json From ea68370e223a7b8af2843ca16c0ebd8f223b6574 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 25 Aug 2024 02:18:53 +0200 Subject: [PATCH 020/119] pylint: try ignoring generateds (again) --- .pylintrc | 1 + src/ocrd/cli/ocrd_tool.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/.pylintrc b/.pylintrc index a4106a1bb7..2e3af4288b 100644 --- a/.pylintrc +++ b/.pylintrc @@ -1,6 +1,7 @@ [MASTER] extension-pkg-whitelist=lxml,pydantic ignored-modules=cv2,tesserocr,ocrd_models.ocrd_page_generateds +ignore-paths=ocrd_page_generateds.py ignore-patterns=.*generateds.* [MESSAGES CONTROL] diff --git a/src/ocrd/cli/ocrd_tool.py b/src/ocrd/cli/ocrd_tool.py index 2a7fa99ec9..3c024ec668 100644 --- a/src/ocrd/cli/ocrd_tool.py +++ b/src/ocrd/cli/ocrd_tool.py @@ -29,6 +29,8 @@ def __init__(self, filename): self.filename = filename with codecs.open(filename, encoding='utf-8') as f: self.content = f.read() + # perhaps the validator should _always_ run (for default expansion) + # so validate command only for the report? 
self.json = loads(self.content) pass_ocrd_tool = click.make_pass_decorator(OcrdToolCtx) From 18ac2c0ab954268811a2ed8654cafc44924e01a4 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 28 Aug 2024 15:11:49 +0200 Subject: [PATCH 021/119] ClientSideOcrdMets: use same logger name prefix as server --- src/ocrd/mets_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 7c22da278d..9b66871349 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -120,7 +120,7 @@ class ClientSideOcrdMets: def __init__(self, url, workspace_path: Optional[str] = None): self.protocol = "tcp" if url.startswith("http://") else "uds" - self.log = getLogger(f"ocrd.mets_client[{url}]") + self.log = getLogger(f"ocrd.models.ocrd_mets.client.{url}") self.url = url if self.protocol == "tcp" else f'http+unix://{url.replace("/", "%2F")}' self.ws_dir_path = workspace_path if workspace_path else None From da37967357f4d1bf9076498342319fddc35db070 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 28 Aug 2024 15:15:03 +0200 Subject: [PATCH 022/119] test_mets_server: use tmpdir to avoid side effects between suites --- tests/test_mets_server.py | 48 +++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py index 1487617a71..8f94b95645 100644 --- a/tests/test_mets_server.py +++ b/tests/test_mets_server.py @@ -22,13 +22,16 @@ from requests.exceptions import ConnectionError from ocrd import Resolver, OcrdMetsServer, Workspace -from ocrd_utils import pushd_popd, MIMETYPE_PAGE +from ocrd_utils import pushd_popd, MIMETYPE_PAGE, initLogging, setOverrideLogLevel -WORKSPACE_DIR = '/tmp/ocrd-mets-server' TRANSPORTS = ['/tmp/ocrd-mets-server.sock', 'http://127.0.0.1:12345'] +initLogging() +setOverrideLogLevel(10) + @fixture(scope='function', name='start_mets_server', params=TRANSPORTS) -def fixture_start_mets_server(request) -> Iterable[Tuple[str, Workspace]]: +def fixture_start_mets_server(request, tmpdir) -> Iterable[Tuple[str, Workspace]]: + tmpdir = str(tmpdir) def _start_mets_server(*args, **kwargs): mets_server = OcrdMetsServer(*args, **kwargs) mets_server.startup() @@ -39,21 +42,22 @@ def _start_mets_server(*args, **kwargs): if exists(mets_server_url): remove(mets_server_url) - if exists(WORKSPACE_DIR): - rmtree(WORKSPACE_DIR, ignore_errors=True) + if exists(tmpdir): + rmtree(tmpdir, ignore_errors=True) - copytree(assets.path_to('SBB0000F29300010000/data'), WORKSPACE_DIR) - workspace = Workspace(Resolver(), WORKSPACE_DIR) + copytree(assets.path_to('SBB0000F29300010000/data'), tmpdir) + workspace = Workspace(Resolver(), tmpdir) p = Process(target=_start_mets_server, kwargs={'workspace': workspace, 'url': request.param}) p.start() sleep(1) # sleep to start up server - yield mets_server_url, Workspace(resolver=Resolver(), directory=WORKSPACE_DIR, mets_server_url=mets_server_url) + workspace_server = Workspace(Resolver(), tmpdir, mets_server_url=mets_server_url) + yield mets_server_url, workspace_server p.terminate() - rmtree(WORKSPACE_DIR, ignore_errors=True) + rmtree(tmpdir, ignore_errors=True) def add_file_server(x): - mets_server_url, i = x - workspace_server = Workspace(resolver=Resolver(), directory=WORKSPACE_DIR, mets_server_url=mets_server_url) + mets_server_url, directory, i = x + workspace_server = Workspace(Resolver(), directory, mets_server_url=mets_server_url) workspace_server.add_file( 'FOO', 
local_filename=f'local_filename{i}', @@ -64,8 +68,8 @@ def add_file_server(x): ) def add_agent_server(x): - mets_server_url, i = x - workspace_server = Workspace(resolver=Resolver(), directory=WORKSPACE_DIR, mets_server_url=mets_server_url) + mets_server_url, directory, i = x + workspace_server = Workspace(Resolver(), directory, mets_server_url=mets_server_url) workspace_server.mets.add_agent( name=f'proc{i}', _type='baz', @@ -82,7 +86,10 @@ def test_mets_server_add_file(start_mets_server): # add NO_FILES files in parallel with Pool() as pool: - pool.map(add_file_server, zip(repeat(mets_server_url), range(NO_FILES))) + pool.map(add_file_server, zip( + repeat(mets_server_url), + repeat(workspace_server.directory), + range(NO_FILES))) assert set(workspace_server.mets.file_groups) == set( [ 'OCR-D-IMG', @@ -107,7 +114,7 @@ def test_mets_server_add_file(start_mets_server): assert len(workspace_server.mets.find_all_files(fileGrp='FOO')) == NO_FILES # not yet synced - workspace_file = Workspace(Resolver(), WORKSPACE_DIR) + workspace_file = Workspace(Resolver(), workspace_server.directory) assert len(workspace_file.mets.find_all_files(fileGrp='FOO')) == 0 # sync @@ -125,13 +132,16 @@ def test_mets_server_add_agents(start_mets_server): # add NO_AGENTS agents in parallel with Pool() as pool: - pool.map(add_agent_server, zip(repeat(mets_server_url), list(range(NO_AGENTS)))) + pool.map(add_agent_server, zip( + repeat(mets_server_url), + repeat(workspace_server.directory), + list(range(NO_AGENTS)))) assert len(workspace_server.mets.agents) == NO_AGENTS + no_agents_before # XXX not a tuple assert workspace_server.mets.agents[-1].notes[0][0] == {'{https://ocr-d.de}foo': 'bar'} - workspace_file = Workspace(Resolver(), WORKSPACE_DIR) + workspace_file = Workspace(Resolver(), workspace_server.directory) assert len(workspace_file.mets.agents) == no_agents_before # sync @@ -142,7 +152,7 @@ def test_mets_server_add_agents(start_mets_server): def test_mets_server_str(start_mets_server): mets_server_url, workspace_server = start_mets_server - workspace_server = Workspace(Resolver(), WORKSPACE_DIR, mets_server_url=mets_server_url) + workspace_server = Workspace(Resolver(), workspace_server.directory, mets_server_url=mets_server_url) f = next(workspace_server.find_files()) assert str(f) == '' a = workspace_server.mets.agents[0] @@ -182,7 +192,7 @@ def test_mets_server_socket_stop(start_mets_server): assert True, 'No stop conditions to test for TCP server' else: assert Path(mets_server_url).exists() - assert workspace_server.mets.workspace_path == WORKSPACE_DIR + assert workspace_server.mets.workspace_path == workspace_server.directory workspace_server.mets.stop() with raises(ConnectionError): workspace_server.mets.file_groups From ccb416b13e7f91781568fda8e60ad8182bfea88c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 28 Aug 2024 17:04:04 +0200 Subject: [PATCH 023/119] disableLogging: re-instate root logger, to --- src/ocrd_utils/logging.py | 4 +++- tests/test_decorators.py | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index bb771fc0ce..8f45f9c7fc 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -212,11 +212,13 @@ def disableLogging(silent=not config.OCRD_LOGGING_DEBUG): # logging.basicConfig(level=logging.CRITICAL) # logging.disable(logging.ERROR) # remove all handlers for the ocrd logger - for logger_name in ROOT_OCRD_LOGGERS: + for logger_name in ROOT_OCRD_LOGGERS + ['']: for handler in 
logging.getLogger(logger_name).handlers[:]: logging.getLogger(logger_name).removeHandler(handler) for logger_name in LOGGING_DEFAULTS: logging.getLogger(logger_name).setLevel(logging.NOTSET) + # Python default log level is WARNING + logging.root.setLevel(logging.WARNING) # Initializing stream handlers at module level # would cause message output in all runtime contexts, diff --git a/tests/test_decorators.py b/tests/test_decorators.py index 5ab2880053..df8d6422be 100644 --- a/tests/test_decorators.py +++ b/tests/test_decorators.py @@ -64,6 +64,7 @@ def test_loglevel_override(self): pytest.skip(f"ocrd_logging.conf found at {get_logging_config_files()}, skipping logging test") import logging disableLogging() + assert logging.getLogger('').getEffectiveLevel() == logging.WARNING assert logging.getLogger('ocrd').getEffectiveLevel() == logging.WARNING initLogging() assert logging.getLogger('ocrd').getEffectiveLevel() == logging.INFO From 7e3cdf4ec014efe5b4cddb8d9554981f9181a6d5 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 28 Aug 2024 17:15:56 +0200 Subject: [PATCH 024/119] test-logging: also remove ocrd.log from tempdir --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 4997066d1b..b5cd2f276e 100644 --- a/Makefile +++ b/Makefile @@ -273,7 +273,7 @@ test-logging: assets cp src/ocrd_utils/ocrd_logging.conf $$tempdir; \ cd $$tempdir; \ $(PYTHON) -m pytest --continue-on-collection-errors -k TestLogging -k TestDecorators $(TESTDIR); \ - rm -r $$tempdir/ocrd_logging.conf $$tempdir/.benchmarks; \ + rm -r $$tempdir/ocrd_logging.conf $$tempdir/ocrd.log $$tempdir/.benchmarks; \ rm -rf $$tempdir/.coverage; \ rmdir $$tempdir From 4f45b12027fb0d53301dbbf17e2dcfa5637a1497 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 29 Aug 2024 00:50:34 +0200 Subject: [PATCH 025/119] bashlib: re-add --log-filename, implement as stderr redirect --- src/ocrd/lib.bash | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ocrd/lib.bash b/src/ocrd/lib.bash index 1e3ecfc6eb..febaf92ae6 100644 --- a/src/ocrd/lib.bash +++ b/src/ocrd/lib.bash @@ -141,6 +141,7 @@ ocrd__parse_argv () { while [[ "${1:-}" = -* ]];do case "$1" in -l|--log-level) ocrd__argv[log_level]=$2 ; shift ;; + --log-filename) exec 2> "$2" ; shift ;; -h|--help|--usage) ocrd__usage; exit ;; -J|--dump-json) ocrd__dumpjson; exit ;; -D|--dump-module-dir) echo $(dirname "$OCRD_TOOL_JSON"); exit ;; From 7b70c90957bd8fe4ccfa78328ff860cff69cc87b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 29 Aug 2024 22:13:01 +0200 Subject: [PATCH 026/119] ocrd_utils.config: add reset_defaults() --- src/ocrd_utils/config.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index 063af930c8..4182456435 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -68,14 +68,26 @@ def has_default(self, name): raise ValueError(f"Unregistered env variable {name}") return self._variables[name].has_default + def reset_defaults(self): + for name in self._variables: + try: + # we cannot use hasattr, because that delegates to getattr, + # which we override and provide defaults for (which of course + # cannot be removed) + if self.__getattribute__(name): + delattr(self, name) + except AttributeError: + pass + def describe(self, name, *args, **kwargs): if not name in self._variables: raise ValueError(f"Unregistered env variable {name}") return self._variables[name].describe(*args, **kwargs) def __getattr__(self, 
name): + # will be called if name is not accessible (has not been added directly yet) if not name in self._variables: - raise ValueError(f"Unregistered env variable {name}") + raise AttributeError(f"Unregistered env variable {name}") var_obj = self._variables[name] try: raw_value = self.raw_value(name) From 48bb3c2316e6838ff235a2badc985da14ee8b1b5 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 29 Aug 2024 22:13:31 +0200 Subject: [PATCH 027/119] add test for OcrdEnvConfig.reset_defaults() --- tests/utils/test_config.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/utils/test_config.py b/tests/utils/test_config.py index 99595a864c..a94eb5d3cc 100644 --- a/tests/utils/test_config.py +++ b/tests/utils/test_config.py @@ -57,3 +57,11 @@ def test_OCRD_PROFILE(): with temp_env_var('OCRD_PROFILE', 'some other value'): with raises(ValueError, match="'OCRD_PROFILE' set to invalid value 'some other value'"): config.OCRD_PROFILE + +def test_defaults(): + default = config.OCRD_MAX_PROCESSOR_CACHE + print(type(default)) + config.OCRD_MAX_PROCESSOR_CACHE = 2 + assert config.OCRD_MAX_PROCESSOR_CACHE == 2 + config.reset_defaults() + assert config.OCRD_MAX_PROCESSOR_CACHE == default From ed924032cc959c15f5f6fdd5a2cb34efa4d925a6 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 1 Sep 2024 10:14:13 +0200 Subject: [PATCH 028/119] Workspace.reload_mets: fix for METS server case --- src/ocrd/workspace.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index 8b8e89bfca..4ef59252a0 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -123,7 +123,10 @@ def reload_mets(self): """ Reload METS from the filesystem. """ - self.mets = OcrdMets(filename=self.mets_target) + if self.is_remote: + self.mets.reload() + else: + self.mets = OcrdMets(filename=self.mets_target) @deprecated_alias(pageId="page_id") @deprecated_alias(ID="file_id") From 9c3c3997b5039ca68192d7046808aa5d1cfb83cf Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 2 Sep 2024 14:59:42 +0200 Subject: [PATCH 029/119] OcrdMetsServer.add_file: pass on 'force' kwarg, too --- src/ocrd/mets_server.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 9b66871349..8a18f01682 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -284,15 +284,17 @@ def add_file( file_id=ID, page_id=pageId, mimetype=mimetype, url=url, local_filename=local_filename ) + # add force+ignore + kwargs = {**kwargs, **data.dict()} if not self.multiplexing_mode: - r = self.session.request("POST", f"{self.url}/file", data=data.dict()) - if not r: - raise RuntimeError("Add file failed. 
Please check provided parameters") + r = self.session.request("POST", f"{self.url}/file", data=kwargs) + if not r.ok: + raise RuntimeError(f"Failed to add file ({str(data)}): {r.json()}") else: - r = self.session.request("POST", self.url, json=MpxReq.add_file(self.ws_dir_path, data.dict())) - if "error" in r: - raise RuntimeError(f"Add file failed: Msg: {r['error']}") + r = self.session.request("POST", self.url, json=MpxReq.add_file(self.ws_dir_path, kwargs)) + if not r.ok: + raise RuntimeError(f"Failed to add file ({str(data)}): {r.json()[errors]}") return ClientSideOcrdFile( None, fileGrp=file_grp, @@ -506,7 +508,8 @@ async def add_file( page_id: Optional[str] = Form(), mimetype: str = Form(), url: Optional[str] = Form(None), - local_filename: Optional[str] = Form(None) + local_filename: Optional[str] = Form(None), + force: bool = Form(False), ): """ Add a file @@ -518,7 +521,7 @@ async def add_file( ) # Add to workspace kwargs = file_resource.dict() - workspace.add_file(**kwargs) + workspace.add_file(**kwargs, force=force) return file_resource # ------------- # From c077e957f256c21ec46c2b18cf5881e815a55fac Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 2 Sep 2024 15:00:38 +0200 Subject: [PATCH 030/119] test_mets_server: add test for force (overwrite) --- tests/test_mets_server.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py index 8f94b95645..dc94d6c560 100644 --- a/tests/test_mets_server.py +++ b/tests/test_mets_server.py @@ -55,7 +55,7 @@ def _start_mets_server(*args, **kwargs): p.terminate() rmtree(tmpdir, ignore_errors=True) -def add_file_server(x): +def add_file_server(x, force=False): mets_server_url, directory, i = x workspace_server = Workspace(Resolver(), directory, mets_server_url=mets_server_url) workspace_server.add_file( @@ -65,6 +65,7 @@ def add_file_server(x): page_id=f'page{i}', file_id=f'FOO_page{i}_foo{i}', # url=f'url{i}' + force=force ) def add_agent_server(x): @@ -123,6 +124,19 @@ def test_mets_server_add_file(start_mets_server): assert len(workspace_file.mets.find_all_files(fileGrp='FOO')) == NO_FILES +def test_mets_server_add_file_overwrite(start_mets_server): + mets_server_url, workspace_server = start_mets_server + + add_file_server((mets_server_url, workspace_server.directory, 5)) + + assert len(workspace_server.mets.find_all_files(fileGrp='FOO')) == 1 + + with raises(RuntimeError, match="already exists"): + add_file_server((mets_server_url, workspace_server.directory, 5)) + + add_file_server((mets_server_url, workspace_server.directory, 5), force=True) + assert len(workspace_server.mets.find_all_files(fileGrp='FOO')) == 1 + def test_mets_server_add_agents(start_mets_server): NO_AGENTS = 30 From 4492168ddabaf835b70c91602f905469c4ce6f3d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 14 Sep 2024 00:59:51 +0200 Subject: [PATCH 031/119] PcGts.Page.id / make_xml_id: replace '/' with '_' --- src/ocrd_utils/str.py | 3 ++- tests/model/test_ocrd_page.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/ocrd_utils/str.py b/src/ocrd_utils/str.py index dea3715bf4..b3d3ef496f 100644 --- a/src/ocrd_utils/str.py +++ b/src/ocrd_utils/str.py @@ -105,10 +105,11 @@ def make_xml_id(idstr: str) -> str: ret = idstr if not REGEX_FILE_ID.fullmatch(ret): ret = ret.replace(':', '_') + ret = ret.replace('/', '_') ret = re.sub(r'^([^a-zA-Z_])', r'id_\1', ret) ret = re.sub(r'[^\w.-]', r'', ret) return ret - + def nth_url_segment(url, n=-1): """ Return 
the last /-delimited segment of a URL-like string diff --git a/tests/model/test_ocrd_page.py b/tests/model/test_ocrd_page.py index 7dc130809f..97335775d6 100644 --- a/tests/model/test_ocrd_page.py +++ b/tests/model/test_ocrd_page.py @@ -460,7 +460,7 @@ def test_id(): # TODO: is this *really* desired? # I would expect for a single Page-Element the ID is like from the top-level-Pgts-Container, not like a fileName - assert pcgts.get_Page().id == 'OCR-D-IMG/INPUT_0017.tif' + assert pcgts.get_Page().id == 'OCR-D-IMG_INPUT_0017.tif' if __name__ == '__main__': From 83d52d888a4d403c3ce35a7db50c90db83253f7e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 15 Sep 2024 16:32:55 +0200 Subject: [PATCH 032/119] METS Server: also export+delegate physical_pages --- src/ocrd/mets_server.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 8a18f01682..c85368e305 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -88,6 +88,14 @@ def create(file_groups: List[str]): return OcrdFileGroupListModel(file_groups=file_groups) +class OcrdPageListModel(BaseModel): + physical_pages: List[str] = Field() + + @staticmethod + def create(physical_pages: List[str]): + return OcrdPageListModel(physical_pages=physical_pages) + + class OcrdAgentListModel(BaseModel): agents: List[OcrdAgentModel] = Field() @@ -210,6 +218,17 @@ def workspace_path(self): ).json()["text"] return self.ws_dir_path + @property + def physical_pages(self) -> List[str]: + if not self.multiplexing_mode: + return self.session.request("GET", f"{self.url}/physical_pages").json()["physical_pages"] + else: + return self.session.request( + "POST", + self.url, + json=MpxReq.physical_pages(self.ws_dir_path) + ).json()["physical_pages"] + @property def file_groups(self): if not self.multiplexing_mode: @@ -349,6 +368,11 @@ def workspace_path(ws_dir_path: str) -> Dict: return MpxReq.__args_wrapper( ws_dir_path, method_type="GET", response_type="text", request_url="workspace_path", request_data={}) + @staticmethod + def physical_pages(ws_dir_path: str) -> Dict: + return MpxReq.__args_wrapper( + ws_dir_path, method_type="GET", response_type="dict", request_url="physical_pages", request_data={}) + @staticmethod def file_groups(ws_dir_path: str) -> Dict: return MpxReq.__args_wrapper( @@ -469,6 +493,10 @@ async def unique_identifier(): async def workspace_path(): return Response(content=workspace.directory, media_type="text/plain") + @app.get(path='/physical_pages', response_model=OcrdPageListModel) + async def physical_pages(): + return {'physical_pages': workspace.mets.physical_pages} + @app.get(path='/file_groups', response_model=OcrdFileGroupListModel) async def file_groups(): return {'file_groups': workspace.mets.file_groups} From 4eccefc43b39e26337d0542e633fda077097d079 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 14 Sep 2024 00:57:08 +0200 Subject: [PATCH 033/119] ocrd.cli.workspace: consistently pass on --mets-server-url and --backup (also, simplify) --- src/ocrd/cli/workspace.py | 87 ++++++++++++++++++--------------------- 1 file changed, 39 insertions(+), 48 deletions(-) diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index 062a373608..6add3f839f 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -37,6 +37,17 @@ def __init__(self, directory, mets_url, mets_basename=DEFAULT_METS_BASENAME, met = self.resolver.resolve_mets_arguments(directory, mets_url, mets_basename, mets_server_url) 
self.automatic_backup = automatic_backup + def workspace(self): + return Workspace( + self.resolver, + directory=self.directory, + mets_basename=self.mets_basename, + automatic_backup=self.automatic_backup, + mets_server_url=self.mets_server_url, + ) + def backup_manager(self): + return WorkspaceBackupManager(self.workspace()) + pass_workspace = click.make_pass_decorator(WorkspaceCtx) @@ -139,6 +150,7 @@ def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mim LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR clone' instead of argument 'WORKSPACE_DIR' ('%s')" % workspace_dir)) ctx.directory = workspace_dir + assert not ctx.mets_server_url workspace = ctx.resolver.workspace_from_url( mets_url, dst_dir=ctx.directory, @@ -174,10 +186,11 @@ def workspace_init(ctx, clobber_mets, directory): if directory: LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR init' instead of argument 'DIRECTORY' ('%s')" % directory)) ctx.directory = directory + assert not ctx.mets_server_url workspace = ctx.resolver.workspace_from_nothing( directory=ctx.directory, mets_basename=ctx.mets_basename, - clobber_mets=clobber_mets + clobber_mets=clobber_mets, ) workspace.save_mets() print(workspace.directory) @@ -201,13 +214,7 @@ def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_ Add a file or http(s) URL FNAME to METS in a workspace. If FNAME is not an http(s) URL and is not a workspace-local existing file, try to copy to workspace. """ - workspace = Workspace( - ctx.resolver, - directory=ctx.directory, - mets_basename=ctx.mets_basename, - automatic_backup=ctx.automatic_backup, - mets_server_url=ctx.mets_server_url, - ) + workspace = ctx.workspace() log = getLogger('ocrd.cli.workspace.add') if not mimetype: @@ -313,13 +320,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_fi -G '{{ filegrp }}' -g '{{ pageid }}' -i '{{ fileid }}' -S '{{ local_filename }}' - """ log = getLogger('ocrd.cli.workspace.bulk-add') # pylint: disable=redefined-outer-name - workspace = Workspace( - ctx.resolver, - directory=ctx.directory, - mets_basename=ctx.mets_basename, - automatic_backup=ctx.automatic_backup, - mets_server_url=ctx.mets_server_url, - ) + workspace = ctx.workspace() try: pat = re.compile(regex) @@ -454,13 +455,8 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, incl snake_to_camel = {"file_id": "ID", "page_id": "pageId", "file_grp": "fileGrp"} output_field = [snake_to_camel.get(x, x) for x in output_field] modified_mets = False - ret = list() - workspace = Workspace( - ctx.resolver, - directory=ctx.directory, - mets_basename=ctx.mets_basename, - mets_server_url=ctx.mets_server_url, - ) + ret = [] + workspace = ctx.workspace() with pushd_popd(workspace.directory): for f in workspace.find_files( file_id=file_id, @@ -510,7 +506,7 @@ def workspace_remove_file(ctx, id, force, keep_file): # pylint: disable=redefin (If any ``ID`` starts with ``//``, then its remainder will be interpreted as a regular expression.) """ - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup) + workspace = ctx.workspace() for i in id: workspace.remove_file(i, force=force, keep_file=keep_file) workspace.save_mets() @@ -528,7 +524,7 @@ def rename_group(ctx, old, new): """ Rename fileGrp (USE attribute ``NEW`` to ``OLD``). 
""" - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename) + workspace = ctx.workspace() workspace.rename_file_group(old, new) workspace.save_mets() @@ -549,7 +545,7 @@ def remove_group(ctx, group, recursive, force, keep_files): (If any ``GROUP`` starts with ``//``, then its remainder will be interpreted as a regular expression.) """ - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename) + workspace = ctx.workspace() for g in group: workspace.remove_file_group(g, recursive=recursive, force=force, keep_files=keep_files) workspace.save_mets() @@ -571,7 +567,7 @@ def prune_files(ctx, file_grp, mimetype, page_id, file_id): (If any ``FILTER`` starts with ``//``, then its remainder will be interpreted as a regular expression.) """ - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup) + workspace = ctx.workspace() with pushd_popd(workspace.directory): for f in workspace.find_files( file_id=file_id, @@ -608,8 +604,7 @@ def clean(ctx, dry_run, directories, path_glob): If no PATH_GLOB are specified, then all files and directories may match. """ - log = getLogger('ocrd.cli.workspace.clean') - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup) + workspace = ctx.workspace() allowed_files = [normpath(f.local_filename) for f in workspace.find_files(local_only=True)] allowed_files.append(relpath(workspace.mets_target, start=workspace.directory)) allowed_dirs = set(dirname(path) for path in allowed_files) @@ -627,7 +622,7 @@ def clean(ctx, dry_run, directories, path_glob): if normpath(path) in allowed_files: continue if dry_run: - log.info('unlink(%s)' % path) + ctx.log.info('unlink(%s)' % path) else: unlink(path) if not directories: @@ -637,7 +632,7 @@ def clean(ctx, dry_run, directories, path_glob): if normpath(path) in allowed_dirs: continue if dry_run: - log.info('rmdir(%s)' % path) + ctx.log.info('rmdir(%s)' % path) else: rmdir(path) @@ -651,7 +646,7 @@ def list_groups(ctx): """ List fileGrp USE attributes """ - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename) + workspace = ctx.workspace() print("\n".join(workspace.mets.file_groups)) # ---------------------------------------------------------------------- @@ -677,7 +672,7 @@ def list_pages(ctx, output_field, output_format, chunk_number, chunk_index, page (If any ``FILTER`` starts with ``//``, then its remainder will be interpreted as a regular expression.) """ - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename) + workspace = ctx.workspace() find_kwargs = {} if page_id_range and 'ID' in output_field: find_kwargs['pageId'] = page_id_range @@ -724,7 +719,7 @@ def get_id(ctx): """ Get METS id if any """ - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename) + workspace = ctx.workspace() ID = workspace.mets.unique_identifier if ID: print(ID) @@ -744,7 +739,7 @@ def set_id(ctx, id): # pylint: disable=redefined-builtin Otherwise will create a new {{ ID }}. 
""" - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup) + workspace = ctx.workspace() workspace.mets.unique_identifier = id workspace.save_mets() @@ -767,7 +762,7 @@ def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id): if contentids: update_kwargs['CONTENTIDS'] = contentids try: - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup) + workspace = ctx.workspace() workspace.mets.update_physical_page_attributes(page_id, **update_kwargs) workspace.save_mets() except Exception as err: @@ -805,7 +800,7 @@ def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pa mets_path = Path(mets_path) if filegrp_mapping: filegrp_mapping = loads(filegrp_mapping) - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup) + workspace = ctx.workspace() other_workspace = Workspace(ctx.resolver, directory=str(mets_path.parent), mets_basename=str(mets_path.name)) workspace.merge( other_workspace, @@ -829,11 +824,12 @@ def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pa # ---------------------------------------------------------------------- @workspace_cli.group('backup') -@click.pass_context +@pass_workspace def workspace_backup_cli(ctx): # pylint: disable=unused-argument """ Backing and restoring workspaces - dev edition """ + assert not ctx.mets_server_url, "Workspace backups currently not interoperable with METS Server" @workspace_backup_cli.command('add') @pass_workspace @@ -841,7 +837,7 @@ def workspace_backup_add(ctx): """ Create a new backup """ - backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)) + backup_manager = ctx.backup_manager() backup_manager.add() @workspace_backup_cli.command('list') @@ -850,7 +846,7 @@ def workspace_backup_list(ctx): """ List backups """ - backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)) + backup_manager = ctx.backup_manager() for b in backup_manager.list(): print(b) @@ -862,7 +858,7 @@ def workspace_backup_restore(ctx, choose_first, bak): """ Restore backup BAK """ - backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)) + backup_manager = ctx.backup_manager() backup_manager.restore(bak, choose_first) @workspace_backup_cli.command('undo') @@ -871,7 +867,7 @@ def workspace_backup_undo(ctx): """ Restore the last backup """ - backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)) + backup_manager = ctx.backup_manager() backup_manager.undo() @@ -888,13 +884,8 @@ def workspace_serve_cli(ctx): # pylint: disable=unused-argument @workspace_serve_cli.command('stop') @pass_workspace def workspace_serve_stop(ctx): # pylint: disable=unused-argument - """Stop the METS server""" - workspace = Workspace( - ctx.resolver, - directory=ctx.directory, - mets_basename=ctx.mets_basename, - mets_server_url=ctx.mets_server_url, - ) + """Stop the METS server (saving changes to disk)""" + workspace = ctx.workspace() workspace.mets.stop() 
@workspace_serve_cli.command('start') From 083df27664f4a40eb2d2baddcbb6bf0fd214df5d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 14 Sep 2024 00:57:32 +0200 Subject: [PATCH 034/119] ocrd.cli.workspace server: add 'reload' and 'save' --- src/ocrd/cli/workspace.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index 6add3f839f..ff4aeef7c5 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -888,6 +888,20 @@ def workspace_serve_stop(ctx): # pylint: disable=unused-argument workspace = ctx.workspace() workspace.mets.stop() +@workspace_serve_cli.command('reload') +@pass_workspace +def workspace_serve_reload(ctx): # pylint: disable=unused-argument + """Reload the METS server from disk""" + workspace = ctx.workspace() + workspace.mets.reload() + +@workspace_serve_cli.command('save') +@pass_workspace +def workspace_serve_save(ctx): # pylint: disable=unused-argument + """Save the METS changes to disk""" + workspace = ctx.workspace() + workspace.mets.save() + @workspace_serve_cli.command('start') @pass_workspace def workspace_serve_start(ctx): # pylint: disable=unused-argument From b2c01610bffd277ef7a3345427ff016280efc3a4 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 13 Sep 2024 00:36:03 +0200 Subject: [PATCH 035/119] ocrd.cli.validate tasks: pass on --mets-server-url, too --- src/ocrd/cli/validate.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/ocrd/cli/validate.py b/src/ocrd/cli/validate.py index b26803d053..9d0cafd064 100644 --- a/src/ocrd/cli/validate.py +++ b/src/ocrd/cli/validate.py @@ -102,16 +102,19 @@ def validate_page(page, **kwargs): @validate_cli.command('tasks') @click.option('--workspace', nargs=1, required=False, help='Workspace directory these tasks are to be run. 
If omitted, only validate syntax') @click.option('-M', '--mets-basename', nargs=1, default=DEFAULT_METS_BASENAME, help='Basename of the METS file, used in conjunction with --workspace') +@click.option('-U', '--mets-server-url', help='TCP host URI or UDS path of METS server') @click.option('--overwrite', is_flag=True, default=False, help='When checking against a concrete workspace, simulate overwriting output or page range.') @click.option('-g', '--page-id', help="ID(s) of the pages to process") @click.argument('tasks', nargs=-1, required=True) -def validate_process(tasks, workspace, mets_basename, overwrite, page_id): +def validate_process(tasks, workspace, mets_basename, mets_server_url, overwrite, page_id): ''' Validate a sequence of tasks passable to 'ocrd process' ''' if workspace: - _inform_of_result(validate_tasks([ProcessorTask.parse(t) for t in tasks], - Workspace(Resolver(), directory=workspace, mets_basename=mets_basename), page_id=page_id, overwrite=overwrite)) + _inform_of_result(validate_tasks( + [ProcessorTask.parse(t) for t in tasks], + Workspace(Resolver(), directory=workspace, mets_basename=mets_basename, mets_server_url=mets_server_url), + page_id=page_id, overwrite=overwrite)) else: for t in [ProcessorTask.parse(t) for t in tasks]: _inform_of_result(t.validate()) From 203a06a2a36ac5a74a5ab73ba9c693902e89fc38 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 13 Sep 2024 00:47:14 +0200 Subject: [PATCH 036/119] run_processor: be robust if ocrd_tool is missing steps --- src/ocrd/processor/helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index 921cfeac80..fb5ca1bb0f 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -98,7 +98,7 @@ def run_processor( ocrd_tool = processor.ocrd_tool name = '%s v%s' % (ocrd_tool['executable'], processor.version) - otherrole = ocrd_tool['steps'][0] + otherrole = ocrd_tool.get('steps', [''])[0] logProfile = getLogger('ocrd.process.profile') log.debug("Processor instance %s (%s doing %s)", processor, name, otherrole) t0_wall = perf_counter() From 4fbdd00439b9121dd5f01dd6b4ba2d5f24c251ae Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 13 Sep 2024 00:38:11 +0200 Subject: [PATCH 037/119] lib.bash: fix errexit --- src/ocrd/lib.bash | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/ocrd/lib.bash b/src/ocrd/lib.bash index febaf92ae6..745bc52fe4 100644 --- a/src/ocrd/lib.bash +++ b/src/ocrd/lib.bash @@ -27,6 +27,7 @@ ocrd__log () { ## Ensure minimum version # ht https://stackoverflow.com/posts/4025065 ocrd__minversion () { + set -e local minversion="$1" local version=$(ocrd --version|sed 's/ocrd, version //') #echo "$minversion < $version?" 
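The `set -e` added to these bashlib functions re-asserts errexit at function entry, presumably because the calling context may have dropped it; a minimal sketch of the effect (function and commands are made up, not part of lib.bash):

my_step () {
  set -e            # from here on, a failing command aborts the script
  false             # without errexit this failure would be silently ignored
  echo "never reached"
}
my_step             # exits the shell here (unless the call sits in a conditional context)
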
@@ -108,6 +109,7 @@ ocrd__usage () { ## declare -A ocrd__argv=() ## ``` ocrd__parse_argv () { + set -e # if [[ -n "$ZSH_VERSION" ]];then # print -r -- ${+ocrd__argv} ${(t)ocrd__argv} @@ -250,6 +252,7 @@ $params_parsed" } ocrd__wrap () { + set -e declare -gx OCRD_TOOL_JSON="$1" declare -gx OCRD_TOOL_NAME="$2" From c86507951e85ab13412cb6264841272f809ba07e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 14 Sep 2024 01:03:43 +0200 Subject: [PATCH 038/119] tests: make sure ocrd_utils.config gets reset whenever changing it globally --- tests/processor/test_processor.py | 31 +++++++++++++++++++++++++++++-- tests/test_decorators.py | 6 +++++- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 3a47d2c23f..f2261d0ffb 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -6,8 +6,9 @@ from os import environ from tests.base import CapturingTestCase as TestCase, assets, main, copy_of_directory # pylint: disable=import-error, no-name-in-module from tests.data import DummyProcessor, DummyProcessorWithRequiredParameters, DummyProcessorWithOutput, IncompleteProcessor +from tests.test_mets_server import fixture_start_mets_server -from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging +from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging, config from ocrd.resolver import Resolver from ocrd.processor.base import Processor, run_processor, run_cli @@ -28,6 +29,10 @@ def setUp(self): self.workspace = self.resolver.workspace_from_url('mets.xml') self.addCleanup(stack.pop_all().close) + def tearDown(self): + super().tearDown() + config.reset_defaults() + def test_incomplete_processor(self): proc = IncompleteProcessor(None) with self.assertRaises(NotImplementedError): @@ -242,7 +247,29 @@ class ZipTestProcessor(Processor): pass proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) assert [(one, two.ID) for one, two in proc.zip_input_files(require_first=False)] == [(None, 'foobar2')] r = self.capture_out_err() - assert 'ERROR ocrd.processor.base - found no page phys_0001 in file group GRP1' in r.err + assert 'ERROR ocrd.processor.base - Found no page phys_0001 in file group GRP1' in r.err + +def test_run_output_metsserver(start_mets_server): + mets_server_url, ws = start_mets_server + run_processor(DummyProcessorWithOutput, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT", + mets_server_url=mets_server_url) + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) + ws.overwrite_mode = True + run_processor(DummyProcessorWithOutput, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT", + mets_server_url=mets_server_url) + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) + ws.overwrite_mode = False + with pytest.raises(Exception) as exc: + run_processor(DummyProcessorWithOutput, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT", + mets_server_url=mets_server_url) + assert "already exists" in str(exc.value) + if __name__ == "__main__": main(__file__) diff --git a/tests/test_decorators.py b/tests/test_decorators.py index df8d6422be..c36577020a 100644 --- a/tests/test_decorators.py +++ b/tests/test_decorators.py @@ -15,7 +15,7 @@ ocrd_loglevel, ocrd_cli_wrap_processor, ) # pylint: disable=protected-access -from ocrd_utils import 
pushd_popd, VERSION as OCRD_VERSION, disableLogging, initLogging, get_logging_config_files +from ocrd_utils import pushd_popd, VERSION as OCRD_VERSION, disableLogging, initLogging, get_logging_config_files, config @click.command() @ocrd_cli_options @@ -45,6 +45,10 @@ def setUp(self): super().setUp() disableLogging() + def tearDown(self): + super().tearDown() + config.reset_defaults() + def test_minimal(self): exit_code, out, err = self.invoke_cli(cli_with_ocrd_cli_options, ['-l', 'DEBUG']) print(out, err) From 1a13cd394fd7f8a0a12259f7aefc0c3e1b1c8acc Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Mon, 16 Sep 2024 16:55:41 +0200 Subject: [PATCH 039/119] ocrd.cli.workspace: assert non-server in cmds mutating METS --- src/ocrd/cli/workspace.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index ff4aeef7c5..415b8e6e2f 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -150,7 +150,8 @@ def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mim LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR clone' instead of argument 'WORKSPACE_DIR' ('%s')" % workspace_dir)) ctx.directory = workspace_dir - assert not ctx.mets_server_url + assert not ctx.mets_server_url, \ + f"clone cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.resolver.workspace_from_url( mets_url, dst_dir=ctx.directory, @@ -186,7 +187,8 @@ def workspace_init(ctx, clobber_mets, directory): if directory: LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR init' instead of argument 'DIRECTORY' ('%s')" % directory)) ctx.directory = directory - assert not ctx.mets_server_url + assert not ctx.mets_server_url, \ + f"init cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.resolver.workspace_from_nothing( directory=ctx.directory, mets_basename=ctx.mets_basename, @@ -506,6 +508,8 @@ def workspace_remove_file(ctx, id, force, keep_file): # pylint: disable=redefin (If any ``ID`` starts with ``//``, then its remainder will be interpreted as a regular expression.) """ + assert not ctx.mets_server_url, \ + f"remove cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() for i in id: workspace.remove_file(i, force=force, keep_file=keep_file) @@ -524,6 +528,8 @@ def rename_group(ctx, old, new): """ Rename fileGrp (USE attribute ``NEW`` to ``OLD``). """ + assert not ctx.mets_server_url, \ + f"rename-group cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() workspace.rename_file_group(old, new) workspace.save_mets() @@ -545,6 +551,8 @@ def remove_group(ctx, group, recursive, force, keep_files): (If any ``GROUP`` starts with ``//``, then its remainder will be interpreted as a regular expression.) """ + assert not ctx.mets_server_url, \ + f"remove-group cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() for g in group: workspace.remove_file_group(g, recursive=recursive, force=force, keep_files=keep_files) @@ -567,6 +575,8 @@ def prune_files(ctx, file_grp, mimetype, page_id, file_id): (If any ``FILTER`` starts with ``//``, then its remainder will be interpreted as a regular expression.) 
""" + assert not ctx.mets_server_url, \ + f"prune-files cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() with pushd_popd(workspace.directory): for f in workspace.find_files( @@ -762,6 +772,8 @@ def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id): if contentids: update_kwargs['CONTENTIDS'] = contentids try: + assert not ctx.mets_server_url, \ + f"update-page cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() workspace.mets.update_physical_page_attributes(page_id, **update_kwargs) workspace.save_mets() @@ -800,6 +812,8 @@ def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pa mets_path = Path(mets_path) if filegrp_mapping: filegrp_mapping = loads(filegrp_mapping) + assert not ctx.mets_server_url, \ + f"merge cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() other_workspace = Workspace(ctx.resolver, directory=str(mets_path.parent), mets_basename=str(mets_path.name)) workspace.merge( From bba597e1d5d4fe72044fb1024de548906cd599d8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 7 Sep 2024 14:25:37 +0200 Subject: [PATCH 040/119] OcrdPage: add PageType.get_ReadingOrderGroups() --- src/ocrd_page_user_methods.py | 1 + .../get_ReadingOrderGroups.py | 33 +++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 src/ocrd_page_user_methods/get_ReadingOrderGroups.py diff --git a/src/ocrd_page_user_methods.py b/src/ocrd_page_user_methods.py index 8a2332e6e5..fe22dd89ab 100644 --- a/src/ocrd_page_user_methods.py +++ b/src/ocrd_page_user_methods.py @@ -116,6 +116,7 @@ def _add_method(class_re, method_name, file_name=None): _add_method(r'^(PageType)$', 'set_Border'), _add_method(r'^(CoordsType)$', 'set_points'), _add_method(r'^(PageType)$', 'get_AllTextLines'), + _add_method(r'^(PageType)$', 'get_ReadingOrderGroups'), # for some reason, pagecontent.xsd does not declare @orientation at the abstract/base RegionType: _add_method(r'^(PageType|AdvertRegionType|MusicRegionType|MapRegionType|ChemRegionType|MathsRegionType|SeparatorRegionType|ChartRegionType|TableRegionType|GraphicRegionType|LineDrawingRegionType|ImageRegionType|TextRegionType)$', 'set_orientation'), ) diff --git a/src/ocrd_page_user_methods/get_ReadingOrderGroups.py b/src/ocrd_page_user_methods/get_ReadingOrderGroups.py new file mode 100644 index 0000000000..e7d6c02b77 --- /dev/null +++ b/src/ocrd_page_user_methods/get_ReadingOrderGroups.py @@ -0,0 +1,33 @@ +def get_ReadingOrderGroups(self) -> dict: + """ + Aggregate recursive ReadingOrder into a dictionary, mapping each regionRef + (i.e. 
segment `@id`) to its referring group object (i.e one of + + \b + - :py:class:`.RegionRefType` + - :py:class:`.RegionRefIndexedType` + - :py:class:`.OrderedGroupType` + - :py:class:`.OrderedGroupIndexedType` + - :py:class:`.UnoderedGroupType` + - :py:class:`.UnoderedGroupIndexedType` + """ + def get_groupdict(group): + regionrefs = list() + if isinstance(group, (OrderedGroupType, OrderedGroupIndexedType)): + regionrefs = (group.get_RegionRefIndexed() + + group.get_OrderedGroupIndexed() + + group.get_UnorderedGroupIndexed()) + if isinstance(group, (UnorderedGroupType, UnorderedGroupIndexedType)): + regionrefs = (group.get_RegionRef() + + group.get_OrderedGroup() + + group.get_UnorderedGroup()) + refdict = {} + for elem in regionrefs: + refdict[elem.get_regionRef()] = elem + if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): + refdict = {**refdict, **get_groupdict(elem)} + return refdict + ro = self.get_ReadingOrder() + if ro is None: + return {} + return get_groupdict(ro.get_OrderedGroup() or ro.get_UnorderedGroup()) From fa0fadaa536c0daed62abb136dad9a0af15d2e5c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 7 Sep 2024 14:25:58 +0200 Subject: [PATCH 041/119] update OcrdPage from generateds --- src/ocrd_models/ocrd_page_generateds.py | 55 ++++++++++++++++++++----- 1 file changed, 44 insertions(+), 11 deletions(-) diff --git a/src/ocrd_models/ocrd_page_generateds.py b/src/ocrd_models/ocrd_page_generateds.py index 6fef4c8635..f2b7c0551e 100644 --- a/src/ocrd_models/ocrd_page_generateds.py +++ b/src/ocrd_models/ocrd_page_generateds.py @@ -2,30 +2,28 @@ # -*- coding: utf-8 -*- # -# Generated Wed Nov 3 12:30:32 2021 by generateDS.py version 2.35.20. -# Python 3.6.9 (default, Jan 26 2021, 15:33:00) [GCC 8.4.0] +# Generated Sat Sep 7 14:17:39 2024 by generateDS.py version 2.35.20. 
+# Python 3.8.17+ (heads/3.8-dirty:1663f8ba84, Aug 15 2023, 18:13:01) [GCC 8.3.0] # # Command line options: # ('-f', '') # ('--root-element', 'PcGts') -# ('-o', 'ocrd_models/ocrd_models/ocrd_page_generateds.py') +# ('-o', 'src/ocrd_models/ocrd_page_generateds.py') # ('--silence', '') # ('--export', 'write etree') # ('--disable-generatedssuper-lookup', '') -# ('--user-methods', 'ocrd_models/ocrd_page_user_methods.py') +# ('--user-methods', 'src/ocrd_page_user_methods.py') # # Command line arguments: -# ocrd_validators/ocrd_validators/page.xsd +# src/ocrd_validators/page.xsd # # Command line: -# /home/kba/monorepo/ocrd_all/venv/bin/generateDS -f --root-element="PcGts" -o "ocrd_models/ocrd_models/ocrd_page_generateds.py" --silence --export="write etree" --disable-generatedssuper-lookup --user-methods="ocrd_models/ocrd_page_user_methods.py" ocrd_validators/ocrd_validators/page.xsd +# /data/ocr-d/ocrd_all/venv38/bin/generateDS -f --root-element="PcGts" -o "src/ocrd_models/ocrd_page_generateds.py" --silence --export="write etree" --disable-generatedssuper-lookup --user-methods="src/ocrd_page_user_methods.py" src/ocrd_validators/page.xsd # # Current working directory (os.getcwd()): # core # -# type: ignore - from itertools import zip_longest import os import sys @@ -223,7 +221,7 @@ def gds_validate_integer_list( try: int(value) except (TypeError, ValueError): - raise_parse_error(node, 'Requires sequence of integer values') + raise_parse_error(node, 'Requires sequence of integer valuess') return values def gds_format_float(self, input_data, input_name=''): return ('%.15f' % input_data).rstrip('0') @@ -1230,9 +1228,10 @@ def __hash__(self): return hash(self.id) @property def id(self): + from ocrd_utils import make_xml_id if hasattr(self, 'pcGtsId'): return self.pcGtsId or '' - return self.imageFilename + return make_xml_id(self.imageFilename) def get_AllAlternativeImagePaths(self, page=True, region=True, line=True, word=True, glyph=True): """ Get all the ``pc:AlternativeImage/@filename`` paths referenced in the PAGE-XML document. @@ -3116,9 +3115,10 @@ def __hash__(self): return hash(self.id) @property def id(self): + from ocrd_utils import make_xml_id if hasattr(self, 'pcGtsId'): return self.pcGtsId or '' - return self.imageFilename + return make_xml_id(self.imageFilename) # pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring def _region_class(self, x): # pylint: disable=unused-argument return x.__class__.__name__.replace('RegionType', '') @@ -3314,6 +3314,39 @@ def get_AllTextLines(self, region_order='document', respect_textline_order=True) ret += lines if lo in ['top-to-bottom', 'left-to-right'] else list(reversed(lines)) return ret + def get_ReadingOrderGroups(self) -> dict: + """ + Aggregate recursive ReadingOrder into a dictionary, mapping each regionRef + (i.e. 
segment `@id`) to its referring group object (i.e one of + + \b + - :py:class:`.RegionRefType` + - :py:class:`.RegionRefIndexedType` + - :py:class:`.OrderedGroupType` + - :py:class:`.OrderedGroupIndexedType` + - :py:class:`.UnoderedGroupType` + - :py:class:`.UnoderedGroupIndexedType` + """ + def get_groupdict(group): + regionrefs = list() + if isinstance(group, (OrderedGroupType, OrderedGroupIndexedType)): + regionrefs = (group.get_RegionRefIndexed() + + group.get_OrderedGroupIndexed() + + group.get_UnorderedGroupIndexed()) + if isinstance(group, (UnorderedGroupType, UnorderedGroupIndexedType)): + regionrefs = (group.get_RegionRef() + + group.get_OrderedGroup() + + group.get_UnorderedGroup()) + refdict = {} + for elem in regionrefs: + refdict[elem.get_regionRef()] = elem + if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): + refdict = {**refdict, **get_groupdict(elem)} + return refdict + ro = self.get_ReadingOrder() + if ro is None: + return {} + return get_groupdict(ro.get_OrderedGroup() or ro.get_UnorderedGroup()) def set_orientation(self, orientation): """ Set deskewing angle to given `orientation` number. From 9641d4abc5436fb2925bc288790984cd0239f80b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 27 Sep 2024 04:35:07 +0200 Subject: [PATCH 042/119] OcrdMets.get_physical_pages: cover return_divs w/o for_fileIds for_pageIds --- src/ocrd_models/ocrd_mets.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/ocrd_models/ocrd_mets.py b/src/ocrd_models/ocrd_mets.py index 66251a54dc..9eedf9fa34 100644 --- a/src/ocrd_models/ocrd_mets.py +++ b/src/ocrd_models/ocrd_mets.py @@ -598,7 +598,16 @@ def get_physical_pages(self, for_fileIds : Optional[List[str]] = None, for_pageI If return_divs is set, returns div memory objects instead of strings of ids """ if for_fileIds is None and for_pageIds is None: + if return_divs: + if self._cache_flag: + return list(self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID].values()) + + return [x for x in self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', + namespaces=NS)] + return self.physical_pages + # log = getLogger('ocrd.models.ocrd_mets.get_physical_pages') if for_pageIds is not None: ret = [] From 19ce7d992f567129af74f858e9f0f1ccd8482fce Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 27 Sep 2024 04:37:03 +0200 Subject: [PATCH 043/119] ocrd.cli.workspace: use physical_pages if possible, fix default output_field --- src/ocrd/cli/workspace.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index 415b8e6e2f..f66a1e3360 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -683,19 +683,15 @@ def list_pages(ctx, output_field, output_format, chunk_number, chunk_index, page will be interpreted as a regular expression.) 
""" workspace = ctx.workspace() - find_kwargs = {} - if page_id_range and 'ID' in output_field: - find_kwargs['pageId'] = page_id_range - page_ids = sorted({x.pageId for x in workspace.mets.find_files(**find_kwargs) if x.pageId}) ret = [] - - if output_field == ['ID']: - ret = [[x] for x in page_ids] - else: - for i, page_div in enumerate(workspace.mets.get_physical_pages(for_pageIds=','.join(page_ids), return_divs=True)): + if page_id_range or list(output_field) != ['ID']: + for i, page_div in enumerate(workspace.mets.get_physical_pages(for_pageIds=page_id_range, return_divs=True)): ret.append([]) for k in output_field: ret[i].append(page_div.get(k, 'None')) + else: + for page_id in workspace.mets.physical_pages: + ret.append([page_id]) if numeric_range: start, end = map(int, numeric_range.split('..')) From 372f7259cc7a53d211a4ac072d91f335eeb41bf0 Mon Sep 17 00:00:00 2001 From: Markus Barth Date: Fri, 27 Sep 2024 09:48:42 +0200 Subject: [PATCH 044/119] Added space after %U in imagemagick identify format prameter. --- src/ocrd_models/ocrd_exif.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd_models/ocrd_exif.py b/src/ocrd_models/ocrd_exif.py index 406e60a85a..b5701771a5 100644 --- a/src/ocrd_models/ocrd_exif.py +++ b/src/ocrd_models/ocrd_exif.py @@ -48,11 +48,11 @@ def run_identify(self, img): for prop in ['compression', 'photometric_interpretation']: setattr(self, prop, img.info[prop] if prop in img.info else None) if img.filename: - ret = run(['identify', '-format', r'%[resolution.x] %[resolution.y] %U', img.filename], check=False, stderr=PIPE, stdout=PIPE) + ret = run(['identify', '-format', r'%[resolution.x] %[resolution.y] %U ', img.filename], check=False, stderr=PIPE, stdout=PIPE) else: with BytesIO() as bio: img.save(bio, format=img.format) - ret = run(['identify', '-format', r'%[resolution.x] %[resolution.y] %U', '/dev/stdin'], check=False, stderr=PIPE, stdout=PIPE, input=bio.getvalue()) + ret = run(['identify', '-format', r'%[resolution.x] %[resolution.y] %U ', '/dev/stdin'], check=False, stderr=PIPE, stdout=PIPE, input=bio.getvalue()) if ret.returncode: stderr = ret.stderr.decode('utf-8') if 'no decode delegate for this image format' in stderr: From 44deb80434dbcf40289f7ce451e416f5f021a54d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 27 Sep 2024 20:11:36 +0200 Subject: [PATCH 045/119] ocrd_exif: add multi-frame TIFF example --- tests/model/test_exif.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/model/test_exif.py b/tests/model/test_exif.py index f6771fb8ee..077247521c 100644 --- a/tests/model/test_exif.py +++ b/tests/model/test_exif.py @@ -24,7 +24,10 @@ ('leptonica_samples/data/OCR-D-IMG/OCR-D-IMG_1555_007.jpg', 944, 1472, 1, 1, 1, 'inches', 'RGB', None), ('kant_aufklaerung_1784-jp2/data/OCR-D-IMG/INPUT_0020.jp2', - 1457, 2084, 1, 1, 1, 'inches', 'RGB', None) + 1457, 2084, 1, 1, 1, 'inches', 'RGB', None), + # tolerate multi-frame TIFF: + ('gutachten/data/IMG/IMG_1.tif', + 2088, 2634, 300, 300, 300, 'inches', 'RGB', 'raw') ]) def test_ocrd_exif(path, width, height, xResolution, yResolution, resolution, resolutionUnit, photometricInterpretation, compression): """Check EXIF attributes for different input formats From 606915ba9e796b7e5642ac8f6cdf86ac8bcccbf3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Mon, 30 Sep 2024 16:02:56 +0200 Subject: [PATCH 046/119] disableLogging: clearer comment Co-authored-by: Konstantin Baierer --- 
src/ocrd_utils/logging.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index 8f45f9c7fc..ac2b3416a4 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -211,7 +211,7 @@ def disableLogging(silent=not config.OCRD_LOGGING_DEBUG): _initialized_flag = False # logging.basicConfig(level=logging.CRITICAL) # logging.disable(logging.ERROR) - # remove all handlers for the ocrd logger + # remove all handlers for the 'ocrd.' and root logger for logger_name in ROOT_OCRD_LOGGERS + ['']: for handler in logging.getLogger(logger_name).handlers[:]: logging.getLogger(logger_name).removeHandler(handler) From 3b908a678f524b37d406022bb05b76515d8303f6 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 30 Sep 2024 17:02:44 +0200 Subject: [PATCH 047/119] :memo: changelog --- CHANGELOG.md | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 351f5a56aa..0d759cb03d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,36 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +## [2.69.0] - 2024-09-30 + +Fixed: + - tests: ensure `ocrd_utils.config` gets reset whenever changing it globally + - `OcrdMetsServer.add_file`: pass on `force` kwarg + - `ocrd.cli.workspace`: consistently pass on `--mets-server-url` and `--backup` + - `ocrd.cli.validate "tasks"`: pass on `--mets-server-url` + - `ocrd.cli.bashlib "input-files"`: pass on `--mets-server-url` + - `lib.bash input-files`: pass on `--mets-server-url`, `--overwrite`, and parameters + - `lib.bash`: fix `errexit` handling + - `ocrd.cli.ocrd-tool "resolve-resource"`: forgot to actually print result + - `Workspace.reload_mets`: handle ClientSideOcrdMets as well + - `disableLogging`: also re-instate root logger to Python defaults + - actually apply CLI `--log-filename`, and show in `--help` + - adapt to Pillow changes + - `ocrd workspace clone`: do pass on `--file-grp` (for download filtering) + - :fire: `OcrdMets.add_agent` without positional arguments + +Changed: + - lib.bash `input-files`: do not try to validate tasks here (now covered by `Processor.verify()`) + - `run_processor`: be robust if `ocrd_tool` is missing `steps` + - `PcGtsType.PageType.id` via `make_xml_id`: replace `/` with `_` + +Added: + - `OcrdPage`: new `PageType.get_ReadingOrderGroups()` to retrieve recursive RO as dict + - ocrd.cli.workspace `server`: add subcommands `reload` and `save` + - METS Server: export and delegate `physical_pages` + - processor CLI: delegate `--resolve-resource`, too + * `OcrdConfig.reset_defaults` to reset config variables to their defaults + ## [2.68.0] - 2024-08-23 Changed: @@ -2164,6 +2194,7 @@ Fixed Initial Release +[2.69.0]: ../../compare/v2.69.0..v2.68.0 [2.68.0]: ../../compare/v2.68.0..v2.67.2 [2.67.2]: ../../compare/v2.67.2..v2.67.1 [2.67.1]: ../../compare/v2.67.1..v2.67.0 From 343a66afcb259d0cafaffdff3e050547f9f8d314 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 30 Sep 2024 17:16:54 +0200 Subject: [PATCH 048/119] :memo: changelog: remove spurious entries --- CHANGELOG.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0d759cb03d..88f6b6cadc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,8 +12,6 @@ Fixed: - `OcrdMetsServer.add_file`: pass on `force` kwarg - `ocrd.cli.workspace`: consistently pass on `--mets-server-url` and `--backup` - `ocrd.cli.validate "tasks"`: pass on `--mets-server-url` - - `ocrd.cli.bashlib "input-files"`: pass on 
`--mets-server-url` - - `lib.bash input-files`: pass on `--mets-server-url`, `--overwrite`, and parameters - `lib.bash`: fix `errexit` handling - `ocrd.cli.ocrd-tool "resolve-resource"`: forgot to actually print result - `Workspace.reload_mets`: handle ClientSideOcrdMets as well @@ -24,7 +22,6 @@ Fixed: - :fire: `OcrdMets.add_agent` without positional arguments Changed: - - lib.bash `input-files`: do not try to validate tasks here (now covered by `Processor.verify()`) - `run_processor`: be robust if `ocrd_tool` is missing `steps` - `PcGtsType.PageType.id` via `make_xml_id`: replace `/` with `_` From f808b726227d5502426b29dd7ab3a97af83a75e8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Mon, 30 Sep 2024 17:46:34 +0200 Subject: [PATCH 049/119] :memo: update changelog again --- CHANGELOG.md | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 88f6b6cadc..d058ebce96 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,28 +9,31 @@ Versioned according to [Semantic Versioning](http://semver.org/). Fixed: - tests: ensure `ocrd_utils.config` gets reset whenever changing it globally - - `OcrdMetsServer.add_file`: pass on `force` kwarg - `ocrd.cli.workspace`: consistently pass on `--mets-server-url` and `--backup` + - `ocrd.cli.workspace`: make `list-page` work w/ METS Server - `ocrd.cli.validate "tasks"`: pass on `--mets-server-url` - `lib.bash`: fix `errexit` handling - - `ocrd.cli.ocrd-tool "resolve-resource"`: forgot to actually print result - - `Workspace.reload_mets`: handle ClientSideOcrdMets as well - - `disableLogging`: also re-instate root logger to Python defaults - actually apply CLI `--log-filename`, and show in `--help` - adapt to Pillow changes - `ocrd workspace clone`: do pass on `--file-grp` (for download filtering) - - :fire: `OcrdMets.add_agent` without positional arguments + - `OcrdMetsServer.add_file`: pass on `force` kwarg + - `Workspace.reload_mets`: handle ClientSideOcrdMets as well + - `OcrdMets.get_physical_pages`: cover `return_divs` w/o `for_fileIds` and `for_pageIds` + - `disableLogging`: also re-instate root logger to Python defaults Changed: - `run_processor`: be robust if `ocrd_tool` is missing `steps` - `PcGtsType.PageType.id` via `make_xml_id`: replace `/` with `_` + - `ClientSideOcrdMets`: use same logger name prefix as METS Server + - `Processor.zip_input_files`: when `--page-id` yields empty list, just log instead of raise Added: - `OcrdPage`: new `PageType.get_ReadingOrderGroups()` to retrieve recursive RO as dict - - ocrd.cli.workspace `server`: add subcommands `reload` and `save` - METS Server: export and delegate `physical_pages` + - ocrd.cli.workspace `server`: add subcommands `reload` and `save` - processor CLI: delegate `--resolve-resource`, too - * `OcrdConfig.reset_defaults` to reset config variables to their defaults + - `OcrdConfig.reset_defaults` to reset config variables to their defaults + - `ocrd_utils.scale_coordinates` for resizing images ## [2.68.0] - 2024-08-23 From 4d25fcfa63c98dec7a66fcf5fdf7c959e6bb2713 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 30 Sep 2024 18:11:18 +0200 Subject: [PATCH 050/119] update assets --- repo/assets | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repo/assets b/repo/assets index 05568aaa2d..ca108faf0e 160000 --- a/repo/assets +++ b/repo/assets @@ -1 +1 @@ -Subproject commit 05568aaa2dc20678bf87ffec77f3baf2924d7c24 +Subproject commit ca108faf0e95cc823a9e84cd0a1602282ae006b1 From 
bdfb41080a8291f3f87280669c684f8a191cb7d5 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 30 Sep 2024 18:17:03 +0200 Subject: [PATCH 051/119] test_exif: add example provided by @mexthecat --- tests/model/test_exif.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/model/test_exif.py b/tests/model/test_exif.py index 077247521c..18c5e4c467 100644 --- a/tests/model/test_exif.py +++ b/tests/model/test_exif.py @@ -27,7 +27,10 @@ 1457, 2084, 1, 1, 1, 'inches', 'RGB', None), # tolerate multi-frame TIFF: ('gutachten/data/IMG/IMG_1.tif', - 2088, 2634, 300, 300, 300, 'inches', 'RGB', 'raw') + 2088, 2634, 300, 300, 300, 'inches', 'RGB', 'raw'), + # multi-frame TIFF with metric pixel density (is actually YCBCR not RGB but Pillow thinks otherwise...) + ('indian-ferns/data/OCR-D-IMG/0004.tif', + 2626, 3620, 28, 28, 28, 'cm', 'RGB', 'jpeg'), ]) def test_ocrd_exif(path, width, height, xResolution, yResolution, resolution, resolutionUnit, photometricInterpretation, compression): """Check EXIF attributes for different input formats From e6d1f857d4e53e5d9658e90dc87e761f9a13bc63 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 30 Sep 2024 18:32:14 +0200 Subject: [PATCH 052/119] :memo: changelog --- CHANGELOG.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d058ebce96..3b1036ab84 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,8 +5,6 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased -## [2.69.0] - 2024-09-30 - Fixed: - tests: ensure `ocrd_utils.config` gets reset whenever changing it globally - `ocrd.cli.workspace`: consistently pass on `--mets-server-url` and `--backup` @@ -20,6 +18,7 @@ Fixed: - `Workspace.reload_mets`: handle ClientSideOcrdMets as well - `OcrdMets.get_physical_pages`: cover `return_divs` w/o `for_fileIds` and `for_pageIds` - `disableLogging`: also re-instate root logger to Python defaults + - `OcrdExif`: handle multi-frame TIFFs gracefully in `identify` callout, #1276 Changed: - `run_processor`: be robust if `ocrd_tool` is missing `steps` From ff81c6b571852ed44523d305eb4a566e461be386 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 30 Sep 2024 18:32:30 +0200 Subject: [PATCH 053/119] :package: v2.69.0 --- CHANGELOG.md | 2 ++ VERSION | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b1036ab84..34ec973570 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ Versioned according to [Semantic Versioning](http://semver.org/). 
## Unreleased +## [2.69.0] - 2024-09-30 + Fixed: - tests: ensure `ocrd_utils.config` gets reset whenever changing it globally - `ocrd.cli.workspace`: consistently pass on `--mets-server-url` and `--backup` diff --git a/VERSION b/VERSION index 0f1ddc8105..a740b92f5e 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.68.0 +2.69.0 From f44e28b13328f8060f921a9686ebd47aef49cb1e Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 1 Oct 2024 15:32:47 +0200 Subject: [PATCH 054/119] introduce: OCRD_NETWORK_CLIENT_POLLING_PRINT --- src/ocrd_network/client.py | 10 +++++++--- src/ocrd_network/client_utils.py | 14 +++++++++----- src/ocrd_utils/config.py | 7 ++++++- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/src/ocrd_network/client.py b/src/ocrd_network/client.py index 8ec8e541ea..c45aa3ecf3 100644 --- a/src/ocrd_network/client.py +++ b/src/ocrd_network/client.py @@ -19,7 +19,8 @@ def __init__( self, server_addr_processing: Optional[str], timeout: int = config.OCRD_NETWORK_CLIENT_POLLING_TIMEOUT, - wait: int = config.OCRD_NETWORK_CLIENT_POLLING_SLEEP + wait: int = config.OCRD_NETWORK_CLIENT_POLLING_SLEEP, + print_output: bool = config.OCRD_NETWORK_CLIENT_POLLING_PRINT ): self.log = getLogger(f"ocrd_network.client") if not server_addr_processing: @@ -29,6 +30,7 @@ def __init__( self.polling_timeout = timeout self.polling_wait = wait self.polling_tries = int(timeout / wait) + self.polling_print_output = print_output def check_deployed_processors(self): return get_ps_deployed_processors(ps_server_host=self.server_addr_processing) @@ -48,11 +50,13 @@ def check_workflow_status(self, workflow_job_id: str): def poll_job_status(self, job_id: str) -> str: return poll_job_status_till_timeout_fail_or_success( - ps_server_host=self.server_addr_processing, job_id=job_id, tries=self.polling_tries, wait=self.polling_wait) + ps_server_host=self.server_addr_processing, job_id=job_id, tries=self.polling_tries, wait=self.polling_wait, + print_output=self.polling_print_output) def poll_workflow_status(self, job_id: str) -> str: return poll_wf_status_till_timeout_fail_or_success( - ps_server_host=self.server_addr_processing, job_id=job_id, tries=self.polling_tries, wait=self.polling_wait) + ps_server_host=self.server_addr_processing, job_id=job_id, tries=self.polling_tries, wait=self.polling_wait, + print_output=self.polling_print_output) def send_processing_job_request(self, processor_name: str, req_params: dict) -> str: return post_ps_processing_request( diff --git a/src/ocrd_network/client_utils.py b/src/ocrd_network/client_utils.py index 9b924c16a4..3ebe8d3b87 100644 --- a/src/ocrd_network/client_utils.py +++ b/src/ocrd_network/client_utils.py @@ -3,7 +3,7 @@ from .constants import JobState, NETWORK_PROTOCOLS -def _poll_endpoint_status(ps_server_host: str, job_id: str, job_type: str, tries: int, wait: int): +def _poll_endpoint_status(ps_server_host: str, job_id: str, job_type: str, tries: int, wait: int, print_output: bool): if job_type not in ["workflow", "processor"]: raise ValueError(f"Unknown job type '{job_type}', expected 'workflow' or 'processor'") job_state = JobState.unset @@ -13,18 +13,22 @@ def _poll_endpoint_status(ps_server_host: str, job_id: str, job_type: str, tries job_state = get_ps_processing_job_status(ps_server_host, job_id) if job_type == "workflow": job_state = get_ps_workflow_job_status(ps_server_host, job_id) + if print_output: + print(f"State of the {job_type} job {job_id}: {job_state}") if job_state == JobState.success or job_state == JobState.failed: break tries -= 1 
return job_state -def poll_job_status_till_timeout_fail_or_success(ps_server_host: str, job_id: str, tries: int, wait: int) -> JobState: - return _poll_endpoint_status(ps_server_host, job_id, "processor", tries, wait) +def poll_job_status_till_timeout_fail_or_success( + ps_server_host: str, job_id: str, tries: int, wait: int, print_output: bool) -> JobState: + return _poll_endpoint_status(ps_server_host, job_id, "processor", tries, wait, print_output) -def poll_wf_status_till_timeout_fail_or_success(ps_server_host: str, job_id: str, tries: int, wait: int) -> JobState: - return _poll_endpoint_status(ps_server_host, job_id, "workflow", tries, wait) +def poll_wf_status_till_timeout_fail_or_success( + ps_server_host: str, job_id: str, tries: int, wait: int, print_output: bool) -> JobState: + return _poll_endpoint_status(ps_server_host, job_id, "workflow", tries, wait, print_output) def get_ps_deployed_processors(ps_server_host: str): diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index 4182456435..ab058c7830 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -160,13 +160,18 @@ def _ocrd_download_timeout_parser(val): config.add("OCRD_NETWORK_CLIENT_POLLING_SLEEP", description="How many seconds to sleep before trying again.", parser=int, - default=(True, 30)) + default=(True, 10)) config.add("OCRD_NETWORK_CLIENT_POLLING_TIMEOUT", description="Timeout for a blocking ocrd network client (in seconds).", parser=int, default=(True, 3600)) +config.add("OCRD_NETWORK_CLIENT_POLLING_PRINT", + description="Timeout for a blocking ocrd network client (in seconds).", + parser=bool, + default=(True, False)) + config.add("OCRD_NETWORK_SERVER_ADDR_WORKFLOW", description="Default address of Workflow Server to connect to (for `ocrd network client workflow`).", default=(True, '')) From 7177eb147f6234417e20dbeeba7c0f707375cd02 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 1 Oct 2024 15:35:50 +0200 Subject: [PATCH 055/119] fix: config value description --- src/ocrd_utils/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index ab058c7830..03d654bc74 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -168,7 +168,7 @@ def _ocrd_download_timeout_parser(val): default=(True, 3600)) config.add("OCRD_NETWORK_CLIENT_POLLING_PRINT", - description="Timeout for a blocking ocrd network client (in seconds).", + description="Whether the blocking client commands should print status output each iteration.", parser=bool, default=(True, False)) From df8e8eede7548f74f195b884559a73b600de2f4a Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 1 Oct 2024 15:41:53 +0200 Subject: [PATCH 056/119] add default value param to preserver backwards compatibility --- src/ocrd_network/client_utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/ocrd_network/client_utils.py b/src/ocrd_network/client_utils.py index 3ebe8d3b87..d3534b4b3f 100644 --- a/src/ocrd_network/client_utils.py +++ b/src/ocrd_network/client_utils.py @@ -3,7 +3,8 @@ from .constants import JobState, NETWORK_PROTOCOLS -def _poll_endpoint_status(ps_server_host: str, job_id: str, job_type: str, tries: int, wait: int, print_output: bool): +def _poll_endpoint_status( + ps_server_host: str, job_id: str, job_type: str, tries: int, wait: int, print_output: bool = False): if job_type not in ["workflow", "processor"]: raise ValueError(f"Unknown job type '{job_type}', expected 'workflow' or 'processor'") 
job_state = JobState.unset @@ -22,12 +23,12 @@ def _poll_endpoint_status(ps_server_host: str, job_id: str, job_type: str, tries def poll_job_status_till_timeout_fail_or_success( - ps_server_host: str, job_id: str, tries: int, wait: int, print_output: bool) -> JobState: + ps_server_host: str, job_id: str, tries: int, wait: int, print_output: bool = False) -> JobState: return _poll_endpoint_status(ps_server_host, job_id, "processor", tries, wait, print_output) def poll_wf_status_till_timeout_fail_or_success( - ps_server_host: str, job_id: str, tries: int, wait: int, print_output: bool) -> JobState: + ps_server_host: str, job_id: str, tries: int, wait: int, print_output: bool = False) -> JobState: return _poll_endpoint_status(ps_server_host, job_id, "workflow", tries, wait, print_output) From b183cfcb007d627399b3a18e527c8a3ed298010d Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 1 Oct 2024 15:56:25 +0200 Subject: [PATCH 057/119] make -b/--block as flags --- src/ocrd_network/cli/client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd_network/cli/client.py b/src/ocrd_network/cli/client.py index 9c7f15c88f..39ef62c5fe 100644 --- a/src/ocrd_network/cli/client.py +++ b/src/ocrd_network/cli/client.py @@ -104,7 +104,7 @@ def check_processing_job_status(address: Optional[str], processing_job_id: str): @click.option('--result-queue-name') @click.option('--callback-url') @click.option('--agent-type', default='worker') -@click.option('-b', '--block', default=False, +@click.option('-b', '--block', default=False, is_flag=True, help='If set, the client will block till job timeout, fail or success.') def send_processing_job_request( address: Optional[str], @@ -176,7 +176,7 @@ def check_workflow_job_status(address: Optional[str], workflow_job_id: str): 'the "OCRD_NETWORK_SERVER_ADDR_PROCESSING" env variable is used by default') @click.option('-m', '--path-to-mets', required=True) @click.option('-w', '--path-to-workflow', required=True) -@click.option('-b', '--block', default=False, +@click.option('-b', '--block', default=False, is_flag=True, help='If set, the client will block till job timeout, fail or success.') def send_workflow_job_request( address: Optional[str], From 342ef3a78f3620ff3e63200b2a9bc4c11639c581 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 1 Oct 2024 16:00:12 +0200 Subject: [PATCH 058/119] implement feedback --- src/ocrd_network/cli/client.py | 8 ++++++-- src/ocrd_network/client.py | 12 +++++------- src/ocrd_network/client_utils.py | 12 ++++++------ src/ocrd_utils/config.py | 5 ----- 4 files changed, 17 insertions(+), 20 deletions(-) diff --git a/src/ocrd_network/cli/client.py b/src/ocrd_network/cli/client.py index 39ef62c5fe..5dd7fd0f78 100644 --- a/src/ocrd_network/cli/client.py +++ b/src/ocrd_network/cli/client.py @@ -106,6 +106,8 @@ def check_processing_job_status(address: Optional[str], processing_job_id: str): @click.option('--agent-type', default='worker') @click.option('-b', '--block', default=False, is_flag=True, help='If set, the client will block till job timeout, fail or success.') +@click.option('-p', '--print-state', default=False, is_flag=True, + help='If set, the client will print job states by each iteration.') def send_processing_job_request( address: Optional[str], processor_name: str, @@ -146,7 +148,7 @@ def send_processing_job_request( assert processing_job_id print(f"Processing job id: {processing_job_id}") if block: - client.poll_job_status(job_id=processing_job_id) + 
client.poll_job_status(job_id=processing_job_id, print_state=print_state) @client_cli.group('workflow') @@ -178,6 +180,8 @@ def check_workflow_job_status(address: Optional[str], workflow_job_id: str): @click.option('-w', '--path-to-workflow', required=True) @click.option('-b', '--block', default=False, is_flag=True, help='If set, the client will block till job timeout, fail or success.') +@click.option('-p', '--print-state', default=False, is_flag=True, + help='If set, the client will print job states by each iteration.') def send_workflow_job_request( address: Optional[str], path_to_mets: str, @@ -192,7 +196,7 @@ def send_workflow_job_request( assert workflow_job_id print(f"Workflow job id: {workflow_job_id}") if block: - client.poll_workflow_status(job_id=workflow_job_id) + client.poll_workflow_status(job_id=workflow_job_id, print_state=print_state) @client_cli.group('workspace') diff --git a/src/ocrd_network/client.py b/src/ocrd_network/client.py index c45aa3ecf3..5a6831bea7 100644 --- a/src/ocrd_network/client.py +++ b/src/ocrd_network/client.py @@ -19,8 +19,7 @@ def __init__( self, server_addr_processing: Optional[str], timeout: int = config.OCRD_NETWORK_CLIENT_POLLING_TIMEOUT, - wait: int = config.OCRD_NETWORK_CLIENT_POLLING_SLEEP, - print_output: bool = config.OCRD_NETWORK_CLIENT_POLLING_PRINT + wait: int = config.OCRD_NETWORK_CLIENT_POLLING_SLEEP ): self.log = getLogger(f"ocrd_network.client") if not server_addr_processing: @@ -30,7 +29,6 @@ def __init__( self.polling_timeout = timeout self.polling_wait = wait self.polling_tries = int(timeout / wait) - self.polling_print_output = print_output def check_deployed_processors(self): return get_ps_deployed_processors(ps_server_host=self.server_addr_processing) @@ -48,15 +46,15 @@ def check_job_status(self, job_id: str): def check_workflow_status(self, workflow_job_id: str): return get_ps_workflow_job_status(self.server_addr_processing, workflow_job_id=workflow_job_id) - def poll_job_status(self, job_id: str) -> str: + def poll_job_status(self, job_id: str, print_state: bool) -> str: return poll_job_status_till_timeout_fail_or_success( ps_server_host=self.server_addr_processing, job_id=job_id, tries=self.polling_tries, wait=self.polling_wait, - print_output=self.polling_print_output) + print_state=print_state) - def poll_workflow_status(self, job_id: str) -> str: + def poll_workflow_status(self, job_id: str, print_state: bool) -> str: return poll_wf_status_till_timeout_fail_or_success( ps_server_host=self.server_addr_processing, job_id=job_id, tries=self.polling_tries, wait=self.polling_wait, - print_output=self.polling_print_output) + print_state=print_state) def send_processing_job_request(self, processor_name: str, req_params: dict) -> str: return post_ps_processing_request( diff --git a/src/ocrd_network/client_utils.py b/src/ocrd_network/client_utils.py index d3534b4b3f..87649d5ad4 100644 --- a/src/ocrd_network/client_utils.py +++ b/src/ocrd_network/client_utils.py @@ -4,7 +4,7 @@ def _poll_endpoint_status( - ps_server_host: str, job_id: str, job_type: str, tries: int, wait: int, print_output: bool = False): + ps_server_host: str, job_id: str, job_type: str, tries: int, wait: int, print_state: bool = False): if job_type not in ["workflow", "processor"]: raise ValueError(f"Unknown job type '{job_type}', expected 'workflow' or 'processor'") job_state = JobState.unset @@ -14,7 +14,7 @@ def _poll_endpoint_status( job_state = get_ps_processing_job_status(ps_server_host, job_id) if job_type == "workflow": job_state = 
get_ps_workflow_job_status(ps_server_host, job_id) - if print_output: + if print_state: print(f"State of the {job_type} job {job_id}: {job_state}") if job_state == JobState.success or job_state == JobState.failed: break @@ -23,13 +23,13 @@ def _poll_endpoint_status( def poll_job_status_till_timeout_fail_or_success( - ps_server_host: str, job_id: str, tries: int, wait: int, print_output: bool = False) -> JobState: - return _poll_endpoint_status(ps_server_host, job_id, "processor", tries, wait, print_output) + ps_server_host: str, job_id: str, tries: int, wait: int, print_state: bool = False) -> JobState: + return _poll_endpoint_status(ps_server_host, job_id, "processor", tries, wait, print_state) def poll_wf_status_till_timeout_fail_or_success( - ps_server_host: str, job_id: str, tries: int, wait: int, print_output: bool = False) -> JobState: - return _poll_endpoint_status(ps_server_host, job_id, "workflow", tries, wait, print_output) + ps_server_host: str, job_id: str, tries: int, wait: int, print_state: bool = False) -> JobState: + return _poll_endpoint_status(ps_server_host, job_id, "workflow", tries, wait, print_state) def get_ps_deployed_processors(ps_server_host: str): diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index 03d654bc74..d2cc4efce1 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -167,11 +167,6 @@ def _ocrd_download_timeout_parser(val): parser=int, default=(True, 3600)) -config.add("OCRD_NETWORK_CLIENT_POLLING_PRINT", - description="Whether the blocking client commands should print status output each iteration.", - parser=bool, - default=(True, False)) - config.add("OCRD_NETWORK_SERVER_ADDR_WORKFLOW", description="Default address of Workflow Server to connect to (for `ocrd network client workflow`).", default=(True, '')) From 0e80a7cf84a5db1073ea5ba1363819ed40d16020 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 1 Oct 2024 16:02:30 +0200 Subject: [PATCH 059/119] fix: missed params --- src/ocrd_network/cli/client.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/ocrd_network/cli/client.py b/src/ocrd_network/cli/client.py index 5dd7fd0f78..fd28552866 100644 --- a/src/ocrd_network/cli/client.py +++ b/src/ocrd_network/cli/client.py @@ -122,7 +122,8 @@ def send_processing_job_request( # TODO: This is temporally available to toggle # between the ProcessingWorker/ProcessorServer agent_type: Optional[str], - block: Optional[bool] + block: Optional[bool], + print_state: Optional[bool] ): """ Submit a processing job to the processing server. @@ -186,7 +187,8 @@ def send_workflow_job_request( address: Optional[str], path_to_mets: str, path_to_workflow: str, - block: Optional[bool] + block: Optional[bool], + print_state: Optional[bool] ): """ Submit a workflow job to the processing server. 
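Taken together, patches 054 to 059 give the network client a blocking mode with optional per-iteration state printing. A minimal usage sketch of the resulting Python API as defined in the diffs above, assuming `Client` is importable from `ocrd_network.client`; the server address, processor name and the keys inside `req_params` are illustrative placeholders, not values prescribed by core:

    # Sketch only: address, processor name and req_params keys are assumptions.
    from ocrd_network.client import Client

    client = Client(server_addr_processing="http://localhost:8000")

    # Submit a single processing job, then block until success, failure or timeout,
    # printing the polled job state on every iteration via the new print_state flag.
    job_id = client.send_processing_job_request(
        processor_name="ocrd-dummy",
        req_params={"path_to_mets": "/data/mets.xml", "input_file_grps": "OCR-D-IMG"},
    )
    final_state = client.poll_job_status(job_id=job_id, print_state=True)
    print(f"Processing job {job_id} finished with state {final_state}")

    # The same pattern works for a whole workflow run.
    wf_job_id = client.send_workflow_job_request(path_to_wf="/data/workflow.txt", path_to_mets="/data/mets.xml")
    wf_state = client.poll_workflow_status(job_id=wf_job_id, print_state=True)
    print(f"Workflow job {wf_job_id} finished with state {wf_state}")

The CLI equivalents of these flags are the `-b/--block` and `-p/--print-state` options added to the `processing` and `workflow` subcommands in the same patches.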
From d7df20049fe3175e001a1feb60ec42b17ee3a2f0 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 1 Oct 2024 16:08:57 +0200 Subject: [PATCH 060/119] fix: integration client tests --- src/ocrd_network/client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd_network/client.py b/src/ocrd_network/client.py index 5a6831bea7..c4315ded4d 100644 --- a/src/ocrd_network/client.py +++ b/src/ocrd_network/client.py @@ -46,12 +46,12 @@ def check_job_status(self, job_id: str): def check_workflow_status(self, workflow_job_id: str): return get_ps_workflow_job_status(self.server_addr_processing, workflow_job_id=workflow_job_id) - def poll_job_status(self, job_id: str, print_state: bool) -> str: + def poll_job_status(self, job_id: str, print_state: bool = False) -> str: return poll_job_status_till_timeout_fail_or_success( ps_server_host=self.server_addr_processing, job_id=job_id, tries=self.polling_tries, wait=self.polling_wait, print_state=print_state) - def poll_workflow_status(self, job_id: str, print_state: bool) -> str: + def poll_workflow_status(self, job_id: str, print_state: bool = False) -> str: return poll_wf_status_till_timeout_fail_or_success( ps_server_host=self.server_addr_processing, job_id=job_id, tries=self.polling_tries, wait=self.polling_wait, print_state=print_state) From 0bfef64ec694e6695f1c95a5fab343c268b25ec0 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 1 Oct 2024 16:25:43 +0200 Subject: [PATCH 061/119] post_ps_workflow_request: pagewise configurable --- src/ocrd_network/cli/client.py | 20 +++++++++++++++++--- src/ocrd_network/client_utils.py | 26 +++++++++++++++++--------- 2 files changed, 34 insertions(+), 12 deletions(-) diff --git a/src/ocrd_network/cli/client.py b/src/ocrd_network/cli/client.py index 9c7f15c88f..a57cb88b82 100644 --- a/src/ocrd_network/cli/client.py +++ b/src/ocrd_network/cli/client.py @@ -2,6 +2,7 @@ from json import dumps from typing import List, Optional, Tuple from ocrd.decorators.parameter_option import parameter_option, parameter_override_option +from ocrd_network.constants import JobState from ocrd_utils import DEFAULT_METS_BASENAME from ocrd_utils.introspect import set_json_key_value_overrides from ocrd_utils.str import parse_json_string_or_file @@ -176,23 +177,36 @@ def check_workflow_job_status(address: Optional[str], workflow_job_id: str): 'the "OCRD_NETWORK_SERVER_ADDR_PROCESSING" env variable is used by default') @click.option('-m', '--path-to-mets', required=True) @click.option('-w', '--path-to-workflow', required=True) -@click.option('-b', '--block', default=False, +@click.option('-p/-P', '--page-wise/--no-page-wise', is_flag=True, default=False, help="Whether to generate per-page jobs") +@click.option('-b', '--block', is_flag=True, default=False, help='If set, the client will block till job timeout, fail or success.') def send_workflow_job_request( address: Optional[str], path_to_mets: str, path_to_workflow: str, + page_wise : bool, block: Optional[bool] ): """ Submit a workflow job to the processing server. 
""" client = Client(server_addr_processing=address) - workflow_job_id = client.send_workflow_job_request(path_to_wf=path_to_workflow, path_to_mets=path_to_mets) + workflow_job_id = client.send_workflow_job_request( + path_to_wf=path_to_workflow, + path_to_mets=path_to_mets, + page_wise=page_wise, + ) assert workflow_job_id print(f"Workflow job id: {workflow_job_id}") if block: - client.poll_workflow_status(job_id=workflow_job_id) + print(f"Polling state of workflow job {workflow_job_id}") + state = client.poll_workflow_status(job_id=workflow_job_id) + if state != JobState.success: + print(f"Workflow failed with {state}") + exit(1) + else: + print(f"Workflow succeeded") + exit(0) @client_cli.group('workspace') diff --git a/src/ocrd_network/client_utils.py b/src/ocrd_network/client_utils.py index 9b924c16a4..24f3da105c 100644 --- a/src/ocrd_network/client_utils.py +++ b/src/ocrd_network/client_utils.py @@ -1,9 +1,10 @@ +import json from requests import get as request_get, post as request_post from time import sleep from .constants import JobState, NETWORK_PROTOCOLS -def _poll_endpoint_status(ps_server_host: str, job_id: str, job_type: str, tries: int, wait: int): +def _poll_endpoint_status(ps_server_host: str, job_id: str, job_type: str, tries: int, wait: int) -> JobState: if job_type not in ["workflow", "processor"]: raise ValueError(f"Unknown job type '{job_type}', expected 'workflow' or 'processor'") job_state = JobState.unset @@ -47,22 +48,21 @@ def get_ps_processing_job_log(ps_server_host: str, processing_job_id: str): return response -def get_ps_processing_job_status(ps_server_host: str, processing_job_id: str) -> str: +def get_ps_processing_job_status(ps_server_host: str, processing_job_id: str) -> JobState: request_url = f"{ps_server_host}/processor/job/{processing_job_id}" response = request_get(url=request_url, headers={"accept": "application/json; charset=utf-8"}) assert response.status_code == 200, f"Processing server: {request_url}, {response.status_code}" job_state = response.json()["state"] assert job_state - return job_state - + return getattr(JobState, job_state.lower()) -def get_ps_workflow_job_status(ps_server_host: str, workflow_job_id: str) -> str: +def get_ps_workflow_job_status(ps_server_host: str, workflow_job_id: str) -> JobState: request_url = f"{ps_server_host}/workflow/job-simple/{workflow_job_id}" response = request_get(url=request_url, headers={"accept": "application/json; charset=utf-8"}) assert response.status_code == 200, f"Processing server: {request_url}, {response.status_code}" job_state = response.json()["state"] assert job_state - return job_state + return getattr(JobState, job_state.lower()) def post_ps_processing_request(ps_server_host: str, processor: str, job_input: dict) -> str: @@ -79,8 +79,13 @@ def post_ps_processing_request(ps_server_host: str, processor: str, job_input: d # TODO: Can be extended to include other parameters such as page_wise -def post_ps_workflow_request(ps_server_host: str, path_to_wf: str, path_to_mets: str) -> str: - request_url = f"{ps_server_host}/workflow/run?mets_path={path_to_mets}&page_wise=True" +def post_ps_workflow_request( + ps_server_host: str, + path_to_wf: str, + path_to_mets: str, + page_wise : bool, +) -> str: + request_url = f"{ps_server_host}/workflow/run?mets_path={path_to_mets}&page_wise={'True' if page_wise else 'False'}" response = request_post( url=request_url, headers={"accept": "application/json; charset=utf-8"}, @@ -88,8 +93,11 @@ def post_ps_workflow_request(ps_server_host: str, path_to_wf: str, 
path_to_mets: ) # print(response.json()) # print(response.__dict__) + json_resp_raw = response.text + # print(f'post_ps_workflow_request >> {response.status_code}') + # print(f'post_ps_workflow_request >> {json_resp_raw}') assert response.status_code == 200, f"Processing server: {request_url}, {response.status_code}" - wf_job_id = response.json()["job_id"] + wf_job_id = json.loads(json_resp_raw)["job_id"] assert wf_job_id return wf_job_id From 1f5c4bbb756d05c55758968a610aa810111cbf48 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Tue, 1 Oct 2024 17:16:17 +0200 Subject: [PATCH 062/119] Dockerfile.cuda-torch: do NOT rm /build/core since we installed core in editable mode! --- Dockerfile.cuda-torch | 2 -- 1 file changed, 2 deletions(-) diff --git a/Dockerfile.cuda-torch b/Dockerfile.cuda-torch index 8d6c3aa624..59ce1144be 100644 --- a/Dockerfile.cuda-torch +++ b/Dockerfile.cuda-torch @@ -9,7 +9,5 @@ RUN make deps-torch WORKDIR /data -RUN rm -fr /build - CMD ["/usr/local/bin/ocrd", "--help"] From 611b6b566e565873648c4a112adbb6d8bedc155d Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 1 Oct 2024 18:01:30 +0200 Subject: [PATCH 063/119] deployer: Remove any pre-existing socket file before starting the server (again) --- src/ocrd_network/runtime_data/deployer.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/ocrd_network/runtime_data/deployer.py b/src/ocrd_network/runtime_data/deployer.py index b956904d07..7b064961c5 100644 --- a/src/ocrd_network/runtime_data/deployer.py +++ b/src/ocrd_network/runtime_data/deployer.py @@ -146,6 +146,11 @@ def start_uds_mets_server(self, ws_dir_path: str) -> Path: if is_mets_server_running(mets_server_url=str(mets_server_url)): self.log.debug(f"The UDS mets server for {ws_dir_path} is already started: {mets_server_url}") return mets_server_url + elif Path(mets_server_url).is_socket(): + self.log.warning( + f"The UDS mets server for {ws_dir_path} is not running but the socket file exists: {mets_server_url}." 
+ "Removing to avoid any weird behavior before starting the server.") + Path(mets_server_url).unlink() self.log.info(f"Starting UDS mets server: {mets_server_url}") pid = OcrdMetsServer.create_process(mets_server_url=mets_server_url, ws_dir_path=ws_dir_path, log_file=log_file) self.mets_servers[mets_server_url] = pid From 9a71d048dd8ddc1dceba3fa24d34af719690eaf5 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 2 Oct 2024 10:11:07 +0200 Subject: [PATCH 064/119] remove UDS socket files --- src/ocrd/mets_server.py | 2 +- src/ocrd_network/runtime_data/deployer.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index c85368e305..a8f766289c 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -434,7 +434,7 @@ def kill_process(mets_server_pid: int): def shutdown(self): if self.is_uds: if Path(self.url).exists(): - self.log.debug(f'UDS socket {self.url} still exists, removing it') + self.log.warning(f"Due to a server shutdown, removing the existing UDS socket file: {self.url}") Path(self.url).unlink() # os._exit because uvicorn catches SystemExit raised by sys.exit _exit(0) diff --git a/src/ocrd_network/runtime_data/deployer.py b/src/ocrd_network/runtime_data/deployer.py index 7b064961c5..90f7c6d5c7 100644 --- a/src/ocrd_network/runtime_data/deployer.py +++ b/src/ocrd_network/runtime_data/deployer.py @@ -165,6 +165,9 @@ def stop_uds_mets_server(self, mets_server_url: str, stop_with_pid: bool = False raise Exception(message) mets_server_pid = self.mets_servers[Path(mets_server_url)] OcrdMetsServer.kill_process(mets_server_pid=mets_server_pid) + if Path(mets_server_url).exists(): + self.log.warning(f"Deployer is removing the existing UDS socket file: {mets_server_url}") + Path(mets_server_url).unlink() return # TODO: Reconsider this again # Not having this sleep here causes connection errors From 854403de6ea880c31b82463bba3850c07565327d Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 2 Oct 2024 10:38:07 +0200 Subject: [PATCH 065/119] remove shortcuts for page-wise --- src/ocrd_network/cli/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd_network/cli/client.py b/src/ocrd_network/cli/client.py index 6733f893aa..450cce43fb 100644 --- a/src/ocrd_network/cli/client.py +++ b/src/ocrd_network/cli/client.py @@ -180,7 +180,7 @@ def check_workflow_job_status(address: Optional[str], workflow_job_id: str): 'the "OCRD_NETWORK_SERVER_ADDR_PROCESSING" env variable is used by default') @click.option('-m', '--path-to-mets', required=True) @click.option('-w', '--path-to-workflow', required=True) -@click.option('-p/-P', '--page-wise/--no-page-wise', is_flag=True, default=False, help="Whether to generate per-page jobs") +@click.option('--page-wise/--no-page-wise', is_flag=True, default=False, help="Whether to generate per-page jobs") @click.option('-b', '--block', default=False, is_flag=True, help='If set, the client will block till job timeout, fail or success.') @click.option('-p', '--print-state', default=False, is_flag=True, From 4d01e66229bcd63872f4fd93699aa0084792c02c Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 2 Oct 2024 10:40:19 +0200 Subject: [PATCH 066/119] fix: pass page-wise argument to relevant methods --- src/ocrd_network/cli/client.py | 2 +- src/ocrd_network/client.py | 5 +++-- src/ocrd_network/client_utils.py | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/ocrd_network/cli/client.py b/src/ocrd_network/cli/client.py index 
450cce43fb..350cf64b90 100644 --- a/src/ocrd_network/cli/client.py +++ b/src/ocrd_network/cli/client.py @@ -189,7 +189,7 @@ def send_workflow_job_request( address: Optional[str], path_to_mets: str, path_to_workflow: str, - page_wise : bool, + page_wise: bool, block: bool, print_state: bool ): diff --git a/src/ocrd_network/client.py b/src/ocrd_network/client.py index c4315ded4d..1521997942 100644 --- a/src/ocrd_network/client.py +++ b/src/ocrd_network/client.py @@ -60,6 +60,7 @@ def send_processing_job_request(self, processor_name: str, req_params: dict) -> return post_ps_processing_request( ps_server_host=self.server_addr_processing, processor=processor_name, job_input=req_params) - def send_workflow_job_request(self, path_to_wf: str, path_to_mets: str): + def send_workflow_job_request(self, path_to_wf: str, path_to_mets: str, page_wise: bool): return post_ps_workflow_request( - ps_server_host=self.server_addr_processing, path_to_wf=path_to_wf, path_to_mets=path_to_mets) + ps_server_host=self.server_addr_processing, path_to_wf=path_to_wf, path_to_mets=path_to_mets, + page_wise=page_wise) diff --git a/src/ocrd_network/client_utils.py b/src/ocrd_network/client_utils.py index b23442e502..456398ecf8 100644 --- a/src/ocrd_network/client_utils.py +++ b/src/ocrd_network/client_utils.py @@ -87,7 +87,7 @@ def post_ps_workflow_request( ps_server_host: str, path_to_wf: str, path_to_mets: str, - page_wise : bool, + page_wise: bool, ) -> str: request_url = f"{ps_server_host}/workflow/run?mets_path={path_to_mets}&page_wise={'True' if page_wise else 'False'}" response = request_post( From 97427e07326bddc0ff83e4d1ed5eba4cb6631829 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 2 Oct 2024 10:42:00 +0200 Subject: [PATCH 067/119] Update src/ocrd_network/client_utils.py Co-authored-by: Konstantin Baierer --- src/ocrd_network/client_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/ocrd_network/client_utils.py b/src/ocrd_network/client_utils.py index 456398ecf8..51db2681a6 100644 --- a/src/ocrd_network/client_utils.py +++ b/src/ocrd_network/client_utils.py @@ -82,7 +82,6 @@ def post_ps_processing_request(ps_server_host: str, processor: str, job_input: d return processing_job_id -# TODO: Can be extended to include other parameters such as page_wise def post_ps_workflow_request( ps_server_host: str, path_to_wf: str, From 745484588ab9c77481397a9daaabee086f7790ee Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 2 Oct 2024 14:07:19 +0200 Subject: [PATCH 068/119] add endpoint DELETE /workflow/kill-mets-server-zombies to kill -SIGTERM METS servers with ctime > 60mins ago --- src/ocrd/mets_server.py | 5 ++-- src/ocrd_network/processing_server.py | 12 ++++++++++ src/ocrd_network/server_utils.py | 33 +++++++++++++++++++++++---- 3 files changed, 44 insertions(+), 6 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index c85368e305..c46a99a2d8 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -1,8 +1,10 @@ """ # METS server functionality """ +import os import re from os import _exit, chmod +import signal from typing import Dict, Optional, Union, List, Tuple from time import sleep from pathlib import Path @@ -428,8 +430,7 @@ def create_process(mets_server_url: str, ws_dir_path: str, log_file: str) -> int @staticmethod def kill_process(mets_server_pid: int): - subprocess_run(args=["kill", "-s", "SIGINT", f"{mets_server_pid}"], shell=False, universal_newlines=True) - return + return os.kill(mets_server_pid, signal.SIGTERM) def shutdown(self): if self.is_uds: 
diff --git a/src/ocrd_network/processing_server.py b/src/ocrd_network/processing_server.py index 34c22e5cf6..29061c5645 100644 --- a/src/ocrd_network/processing_server.py +++ b/src/ocrd_network/processing_server.py @@ -48,6 +48,7 @@ get_workflow_content, get_from_database_workspace, get_from_database_workflow_job, + kill_mets_server_zombies, parse_workflow_tasks, raise_http_exception, request_processor_server_tool_json, @@ -314,6 +315,14 @@ def add_api_routes_workflow(self): status_code=status.HTTP_200_OK, summary="Get information about a workflow run" ) + workflow_router.add_api_route( + path="/workflow/kill-mets-server-zombies", + endpoint=self.kill_mets_server_zombies, + methods=["DELETE"], + tags=[ServerApiTags.WORKFLOW, ServerApiTags.PROCESSING], + status_code=status.HTTP_200_OK, + summary="!! Workaround Do Not Use Unless You Have A Reason !! Kill all METS servers on this machine that have been created more than 60 minutes ago." + ) self.include_router(workflow_router) async def forward_tcp_request_to_uds_mets_server(self, request: Request) -> Dict: @@ -817,6 +826,9 @@ async def get_workflow_info(self, workflow_job_id) -> Dict: response = self._produce_workflow_status_response(processing_jobs=jobs) return response + async def kill_mets_server_zombies(self) -> None: + kill_mets_server_zombies(minutes_ago=60) + async def get_workflow_info_simple(self, workflow_job_id) -> Dict[str, JobState]: """ Simplified version of the `get_workflow_info` that returns a single state for the entire workflow. diff --git a/src/ocrd_network/server_utils.py b/src/ocrd_network/server_utils.py index 9d8628170c..1897f3a62e 100644 --- a/src/ocrd_network/server_utils.py +++ b/src/ocrd_network/server_utils.py @@ -1,12 +1,18 @@ +import os +import re +import signal +from pathlib import Path +from json import dumps, loads +from urllib.parse import urljoin +from typing import Dict, List, Union +from time import time + from fastapi import HTTPException, status, UploadFile from fastapi.responses import FileResponse from httpx import AsyncClient, Timeout -from json import dumps, loads from logging import Logger -from pathlib import Path from requests import get as requests_get -from typing import Dict, List, Union -from urllib.parse import urljoin +from requests_unixsocket import sys from ocrd.resolver import Resolver from ocrd.task_sequence import ProcessorTask @@ -241,3 +247,22 @@ def validate_first_task_input_file_groups_existence(logger: Logger, mets_path: s if group not in available_groups: message = f"Input file group '{group}' of the first processor not found: {input_file_grps}" raise_http_exception(logger, status.HTTP_422_UNPROCESSABLE_ENTITY, message) + + +def kill_mets_server_zombies(minutes_ago=60): + now = time() + cmdline_pat = r'.*ocrd workspace -U.*server start $' + for procdir in sorted(Path('/proc').glob('*'), key=os.path.getctime): + if not procdir.is_dir(): + continue + cmdline_file = procdir.joinpath('cmdline') + if not cmdline_file.is_file(): + continue + ctime_ago = int((now - procdir.stat().st_ctime) / 60) + if ctime_ago < minutes_ago: + continue + cmdline = cmdline_file.read_text().replace('\x00', ' ') + if re.match(cmdline_pat, cmdline): + pid = procdir.name + print(f'METS Server with PID {pid} was created {ctime_ago} minutes ago, more than {minutes_ago}, so killing (cmdline="{cmdline})', file=sys.stderr) + os.kill(int(pid), signal.SIGTERM) From 0506e9d5f5edca7e7f6198ad93c0ac4a04f0061d Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 2 Oct 2024 14:28:45 +0200 Subject: [PATCH 069/119] move 
mets-zombie killer to / and return list of killed PIDs --- src/ocrd_network/processing_server.py | 21 +++++++++++---------- src/ocrd_network/server_utils.py | 5 ++++- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/ocrd_network/processing_server.py b/src/ocrd_network/processing_server.py index 29061c5645..04305a6fbb 100644 --- a/src/ocrd_network/processing_server.py +++ b/src/ocrd_network/processing_server.py @@ -201,6 +201,14 @@ def add_api_routes_others(self): tags=[ServerApiTags.WORKSPACE], summary="Forward a TCP request to UDS mets server" ) + others_router.add_api_route( + path="/kill-mets-server-zombies", + endpoint=self.kill_mets_server_zombies, + methods=["DELETE"], + tags=[ServerApiTags.WORKFLOW, ServerApiTags.PROCESSING], + status_code=status.HTTP_200_OK, + summary="!! Workaround Do Not Use Unless You Have A Reason !! Kill all METS servers on this machine that have been created more than 60 minutes ago." + ) self.include_router(others_router) def add_api_routes_processing(self): @@ -315,14 +323,6 @@ def add_api_routes_workflow(self): status_code=status.HTTP_200_OK, summary="Get information about a workflow run" ) - workflow_router.add_api_route( - path="/workflow/kill-mets-server-zombies", - endpoint=self.kill_mets_server_zombies, - methods=["DELETE"], - tags=[ServerApiTags.WORKFLOW, ServerApiTags.PROCESSING], - status_code=status.HTTP_200_OK, - summary="!! Workaround Do Not Use Unless You Have A Reason !! Kill all METS servers on this machine that have been created more than 60 minutes ago." - ) self.include_router(workflow_router) async def forward_tcp_request_to_uds_mets_server(self, request: Request) -> Dict: @@ -826,8 +826,9 @@ async def get_workflow_info(self, workflow_job_id) -> Dict: response = self._produce_workflow_status_response(processing_jobs=jobs) return response - async def kill_mets_server_zombies(self) -> None: - kill_mets_server_zombies(minutes_ago=60) + async def kill_mets_server_zombies(self) -> List[int]: + pids_killed = kill_mets_server_zombies(minutes_ago=60) + return pids_killed async def get_workflow_info_simple(self, workflow_job_id) -> Dict[str, JobState]: """ diff --git a/src/ocrd_network/server_utils.py b/src/ocrd_network/server_utils.py index 1897f3a62e..b143e344af 100644 --- a/src/ocrd_network/server_utils.py +++ b/src/ocrd_network/server_utils.py @@ -249,9 +249,10 @@ def validate_first_task_input_file_groups_existence(logger: Logger, mets_path: s raise_http_exception(logger, status.HTTP_422_UNPROCESSABLE_ENTITY, message) -def kill_mets_server_zombies(minutes_ago=60): +def kill_mets_server_zombies(minutes_ago=60) -> list[int]: now = time() cmdline_pat = r'.*ocrd workspace -U.*server start $' + ret = [] for procdir in sorted(Path('/proc').glob('*'), key=os.path.getctime): if not procdir.is_dir(): continue @@ -264,5 +265,7 @@ def kill_mets_server_zombies(minutes_ago=60): cmdline = cmdline_file.read_text().replace('\x00', ' ') if re.match(cmdline_pat, cmdline): pid = procdir.name + ret.append(pid) print(f'METS Server with PID {pid} was created {ctime_ago} minutes ago, more than {minutes_ago}, so killing (cmdline="{cmdline})', file=sys.stderr) os.kill(int(pid), signal.SIGTERM) + return ret From ad81356d32178c53814ff1293f35d3dd7827b793 Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 2 Oct 2024 14:31:56 +0200 Subject: [PATCH 070/119] /kill_mets_server_zombies use underscores not slashes --- src/ocrd_network/processing_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd_network/processing_server.py 
b/src/ocrd_network/processing_server.py index 04305a6fbb..505e106ba2 100644 --- a/src/ocrd_network/processing_server.py +++ b/src/ocrd_network/processing_server.py @@ -202,7 +202,7 @@ def add_api_routes_others(self): summary="Forward a TCP request to UDS mets server" ) others_router.add_api_route( - path="/kill-mets-server-zombies", + path="/kill_mets_server_zombies", endpoint=self.kill_mets_server_zombies, methods=["DELETE"], tags=[ServerApiTags.WORKFLOW, ServerApiTags.PROCESSING], From 4862d72fe6f7149ff4ce97d56ac870837bafddc5 Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 2 Oct 2024 14:41:32 +0200 Subject: [PATCH 071/119] use 3.8 compatible typing --- src/ocrd_network/server_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd_network/server_utils.py b/src/ocrd_network/server_utils.py index b143e344af..773668f5b7 100644 --- a/src/ocrd_network/server_utils.py +++ b/src/ocrd_network/server_utils.py @@ -249,7 +249,7 @@ def validate_first_task_input_file_groups_existence(logger: Logger, mets_path: s raise_http_exception(logger, status.HTTP_422_UNPROCESSABLE_ENTITY, message) -def kill_mets_server_zombies(minutes_ago=60) -> list[int]: +def kill_mets_server_zombies(minutes_ago=60) -> List[int]: now = time() cmdline_pat = r'.*ocrd workspace -U.*server start $' ret = [] From 4f6775f358fdf0c7d3164d30e01ecb63106b4a6a Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 2 Oct 2024 15:13:38 +0200 Subject: [PATCH 072/119] OcrdMetsServer.kill_process: try the easy way (SIGINT) then the hard way (SIGKILL) --- src/ocrd/mets_server.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index b6a8f140ba..4b4ffa728f 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -430,7 +430,12 @@ def create_process(mets_server_url: str, ws_dir_path: str, log_file: str) -> int @staticmethod def kill_process(mets_server_pid: int): - return os.kill(mets_server_pid, signal.SIGTERM) + os.kill(mets_server_pid, signal.SIGINT) + sleep(3) + try: + os.kill(mets_server_pid, signal.SIGKILL) + except ProcessLookupError as e: + pass def shutdown(self): if self.is_uds: From 3882e7abf397650ece1e36798232cb148922a43d Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 2 Oct 2024 15:17:46 +0200 Subject: [PATCH 073/119] fix: add default to page_wise param --- src/ocrd_network/client.py | 2 +- src/ocrd_network/client_utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd_network/client.py b/src/ocrd_network/client.py index 1521997942..bb7cf4dbf2 100644 --- a/src/ocrd_network/client.py +++ b/src/ocrd_network/client.py @@ -60,7 +60,7 @@ def send_processing_job_request(self, processor_name: str, req_params: dict) -> return post_ps_processing_request( ps_server_host=self.server_addr_processing, processor=processor_name, job_input=req_params) - def send_workflow_job_request(self, path_to_wf: str, path_to_mets: str, page_wise: bool): + def send_workflow_job_request(self, path_to_wf: str, path_to_mets: str, page_wise: bool = False): return post_ps_workflow_request( ps_server_host=self.server_addr_processing, path_to_wf=path_to_wf, path_to_mets=path_to_mets, page_wise=page_wise) diff --git a/src/ocrd_network/client_utils.py b/src/ocrd_network/client_utils.py index 51db2681a6..4eaf4ea95b 100644 --- a/src/ocrd_network/client_utils.py +++ b/src/ocrd_network/client_utils.py @@ -86,7 +86,7 @@ def post_ps_workflow_request( ps_server_host: str, path_to_wf: str, path_to_mets: str, - page_wise: 
bool, + page_wise: bool = False, ) -> str: request_url = f"{ps_server_host}/workflow/run?mets_path={path_to_mets}&page_wise={'True' if page_wise else 'False'}" response = request_post( From 7b6552b0c7e213fcd0c4d6879c7e65d411445aca Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 4 Oct 2024 13:36:01 +0200 Subject: [PATCH 074/119] previous state --- src/ocrd/mets_server.py | 10 +++++-- src/ocrd_network/processing_server.py | 20 +++++++++++-- src/ocrd_network/runtime_data/deployer.py | 32 ++++++++++++-------- src/ocrd_network/server_utils.py | 36 ++++++++++++++++++++--- src/ocrd_network/utils.py | 4 +-- 5 files changed, 79 insertions(+), 23 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index a8f766289c..4b4ffa728f 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -1,8 +1,10 @@ """ # METS server functionality """ +import os import re from os import _exit, chmod +import signal from typing import Dict, Optional, Union, List, Tuple from time import sleep from pathlib import Path @@ -428,8 +430,12 @@ def create_process(mets_server_url: str, ws_dir_path: str, log_file: str) -> int @staticmethod def kill_process(mets_server_pid: int): - subprocess_run(args=["kill", "-s", "SIGINT", f"{mets_server_pid}"], shell=False, universal_newlines=True) - return + os.kill(mets_server_pid, signal.SIGINT) + sleep(3) + try: + os.kill(mets_server_pid, signal.SIGKILL) + except ProcessLookupError as e: + pass def shutdown(self): if self.is_uds: diff --git a/src/ocrd_network/processing_server.py b/src/ocrd_network/processing_server.py index 34c22e5cf6..50078be377 100644 --- a/src/ocrd_network/processing_server.py +++ b/src/ocrd_network/processing_server.py @@ -48,6 +48,7 @@ get_workflow_content, get_from_database_workspace, get_from_database_workflow_job, + kill_mets_server_zombies, parse_workflow_tasks, raise_http_exception, request_processor_server_tool_json, @@ -200,6 +201,14 @@ def add_api_routes_others(self): tags=[ServerApiTags.WORKSPACE], summary="Forward a TCP request to UDS mets server" ) + others_router.add_api_route( + path="/kill_mets_server_zombies", + endpoint=self.kill_mets_server_zombies, + methods=["DELETE"], + tags=[ServerApiTags.WORKFLOW, ServerApiTags.PROCESSING], + status_code=status.HTTP_200_OK, + summary="!! Workaround Do Not Use Unless You Have A Reason !! Kill all METS servers on this machine that have been created more than 60 minutes ago." 
+ ) self.include_router(others_router) def add_api_routes_processing(self): @@ -574,7 +583,7 @@ async def _cancel_cached_dependent_jobs(self, workspace_key: str, job_id: str) - ) async def _consume_cached_jobs_of_workspace( - self, workspace_key: str, mets_server_url: str + self, workspace_key: str, mets_server_url: str, path_to_mets: str ) -> List[PYJobInput]: # Check whether the internal queue for the workspace key still exists @@ -593,7 +602,8 @@ async def _consume_cached_jobs_of_workspace( # more internal callbacks are expected for that workspace self.log.debug(f"Stopping the mets server: {mets_server_url}") - self.deployer.stop_uds_mets_server(mets_server_url=mets_server_url) + self.deployer.stop_uds_mets_server( + mets_server_url=mets_server_url, path_to_mets=path_to_mets, stop_with_pid=True) try: # The queue is empty - delete it @@ -643,7 +653,7 @@ async def remove_job_from_request_cache(self, result_message: PYResultMessage): raise_http_exception(self.log, status.HTTP_404_NOT_FOUND, message, error) consumed_cached_jobs = await self._consume_cached_jobs_of_workspace( - workspace_key=workspace_key, mets_server_url=mets_server_url + workspace_key=workspace_key, mets_server_url=mets_server_url, path_to_mets=path_to_mets ) await self.push_cached_jobs_to_agents(processing_jobs=consumed_cached_jobs) @@ -817,6 +827,10 @@ async def get_workflow_info(self, workflow_job_id) -> Dict: response = self._produce_workflow_status_response(processing_jobs=jobs) return response + async def kill_mets_server_zombies(self) -> List[int]: + pids_killed = kill_mets_server_zombies(minutes_ago=60) + return pids_killed + async def get_workflow_info_simple(self, workflow_job_id) -> Dict[str, JobState]: """ Simplified version of the `get_workflow_info` that returns a single state for the entire workflow. diff --git a/src/ocrd_network/runtime_data/deployer.py b/src/ocrd_network/runtime_data/deployer.py index 90f7c6d5c7..f60194ce4e 100644 --- a/src/ocrd_network/runtime_data/deployer.py +++ b/src/ocrd_network/runtime_data/deployer.py @@ -8,7 +8,6 @@ """ from __future__ import annotations from pathlib import Path -from subprocess import Popen, run as subprocess_run from time import sleep from typing import Dict, List, Union @@ -30,6 +29,8 @@ def __init__(self, config_path: str) -> None: self.data_hosts: List[DataHost] = parse_hosts_data(ps_config["hosts"]) self.internal_callback_url = ps_config.get("internal_callback_url", None) self.mets_servers: Dict = {} # {"mets_server_url": "mets_server_pid"} + # This is required to store UDS urls that are multiplexed through the TCP proxy and are not preserved anywhere + self.mets_servers_paths: Dict = {} # {"ws_dir_path": "mets_server_url"} self.use_tcp_mets = ps_config.get("use_tcp_mets", False) # TODO: Reconsider this. 
@@ -153,26 +154,33 @@ def start_uds_mets_server(self, ws_dir_path: str) -> Path: Path(mets_server_url).unlink() self.log.info(f"Starting UDS mets server: {mets_server_url}") pid = OcrdMetsServer.create_process(mets_server_url=mets_server_url, ws_dir_path=ws_dir_path, log_file=log_file) - self.mets_servers[mets_server_url] = pid + self.mets_servers[str(mets_server_url)] = pid + self.mets_servers_paths[str(ws_dir_path)] = str(mets_server_url) return mets_server_url - def stop_uds_mets_server(self, mets_server_url: str, stop_with_pid: bool = False) -> None: + def stop_uds_mets_server(self, mets_server_url: str, path_to_mets: str, stop_with_pid: bool = False) -> None: self.log.info(f"Stopping UDS mets server: {mets_server_url}") + self.log.info(f"Path to the mets file: {path_to_mets}") + self.log.info(f"mets_server: {self.mets_servers}") + self.log.info(f"mets_server_paths: {self.mets_servers_paths}") if stop_with_pid: - if Path(mets_server_url) not in self.mets_servers: - message = f"UDS Mets server not found at URL: {mets_server_url}" - self.log.exception(message) - raise Exception(message) - mets_server_pid = self.mets_servers[Path(mets_server_url)] + mets_server_url_uds = self.mets_servers_paths[str(Path(path_to_mets).parent)] + if Path(mets_server_url_uds) not in self.mets_servers: + message = f"UDS Mets server not found at URL: {mets_server_url_uds}, mets path: {path_to_mets}" + self.log.warning(message) + mets_server_pid = self.mets_servers[str(mets_server_url_uds)] + self.log.info(f"Killing mets server pid: {mets_server_pid} of {mets_server_url_uds}") OcrdMetsServer.kill_process(mets_server_pid=mets_server_pid) - if Path(mets_server_url).exists(): - self.log.warning(f"Deployer is removing the existing UDS socket file: {mets_server_url}") - Path(mets_server_url).unlink() + self.log.info(f"Returning after the kill process") + if Path(mets_server_url_uds).exists(): + self.log.warning(f"Deployer is removing the existing UDS socket file: {mets_server_url_uds}") + Path(mets_server_url_uds).unlink() + self.log.info(f"Returning from the stop_uds_mets_server") return # TODO: Reconsider this again # Not having this sleep here causes connection errors # on the last request processed by the processing worker. # Sometimes 3 seconds is enough, sometimes not. 
sleep(5) - stop_mets_server(mets_server_url=mets_server_url) + stop_mets_server(mets_server_url=mets_server_url, ws_dir_path=Path(path_to_mets).parent) return diff --git a/src/ocrd_network/server_utils.py b/src/ocrd_network/server_utils.py index 9d8628170c..773668f5b7 100644 --- a/src/ocrd_network/server_utils.py +++ b/src/ocrd_network/server_utils.py @@ -1,12 +1,18 @@ +import os +import re +import signal +from pathlib import Path +from json import dumps, loads +from urllib.parse import urljoin +from typing import Dict, List, Union +from time import time + from fastapi import HTTPException, status, UploadFile from fastapi.responses import FileResponse from httpx import AsyncClient, Timeout -from json import dumps, loads from logging import Logger -from pathlib import Path from requests import get as requests_get -from typing import Dict, List, Union -from urllib.parse import urljoin +from requests_unixsocket import sys from ocrd.resolver import Resolver from ocrd.task_sequence import ProcessorTask @@ -241,3 +247,25 @@ def validate_first_task_input_file_groups_existence(logger: Logger, mets_path: s if group not in available_groups: message = f"Input file group '{group}' of the first processor not found: {input_file_grps}" raise_http_exception(logger, status.HTTP_422_UNPROCESSABLE_ENTITY, message) + + +def kill_mets_server_zombies(minutes_ago=60) -> List[int]: + now = time() + cmdline_pat = r'.*ocrd workspace -U.*server start $' + ret = [] + for procdir in sorted(Path('/proc').glob('*'), key=os.path.getctime): + if not procdir.is_dir(): + continue + cmdline_file = procdir.joinpath('cmdline') + if not cmdline_file.is_file(): + continue + ctime_ago = int((now - procdir.stat().st_ctime) / 60) + if ctime_ago < minutes_ago: + continue + cmdline = cmdline_file.read_text().replace('\x00', ' ') + if re.match(cmdline_pat, cmdline): + pid = procdir.name + ret.append(pid) + print(f'METS Server with PID {pid} was created {ctime_ago} minutes ago, more than {minutes_ago}, so killing (cmdline="{cmdline})', file=sys.stderr) + os.kill(int(pid), signal.SIGTERM) + return ret diff --git a/src/ocrd_network/utils.py b/src/ocrd_network/utils.py index a2f563de43..13bbee7dbb 100644 --- a/src/ocrd_network/utils.py +++ b/src/ocrd_network/utils.py @@ -151,7 +151,7 @@ def is_mets_server_running(mets_server_url: str, ws_dir_path: str = None) -> boo return False -def stop_mets_server(mets_server_url: str, ws_dir_path: str = None) -> bool: +def stop_mets_server(mets_server_url: str, ws_dir_path: Path = None) -> bool: protocol = "tcp" if (mets_server_url.startswith("http://") or mets_server_url.startswith("https://")) else "uds" session = Session_TCP() if protocol == "tcp" else Session_UDS() if protocol == "uds": @@ -160,7 +160,7 @@ def stop_mets_server(mets_server_url: str, ws_dir_path: str = None) -> bool: if 'tcp_mets' in mets_server_url: if not ws_dir_path: return False - response = session.post(url=f"{mets_server_url}", json=MpxReq.stop(ws_dir_path)) + response = session.post(url=f"{mets_server_url}", json=MpxReq.stop(str(ws_dir_path))) else: response = session.delete(url=f"{mets_server_url}/") except Exception: From 637a40e452b981d7cc8b74937bc149a568efcb68 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 4 Oct 2024 13:55:49 +0200 Subject: [PATCH 075/119] do not use pid killing --- src/ocrd_network/processing_server.py | 3 +-- src/ocrd_network/utils.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/ocrd_network/processing_server.py b/src/ocrd_network/processing_server.py index 
50078be377..edae6733c0 100644 --- a/src/ocrd_network/processing_server.py +++ b/src/ocrd_network/processing_server.py @@ -602,8 +602,7 @@ async def _consume_cached_jobs_of_workspace( # more internal callbacks are expected for that workspace self.log.debug(f"Stopping the mets server: {mets_server_url}") - self.deployer.stop_uds_mets_server( - mets_server_url=mets_server_url, path_to_mets=path_to_mets, stop_with_pid=True) + self.deployer.stop_uds_mets_server(mets_server_url=mets_server_url, path_to_mets=path_to_mets) try: # The queue is empty - delete it diff --git a/src/ocrd_network/utils.py b/src/ocrd_network/utils.py index 13bbee7dbb..a2f563de43 100644 --- a/src/ocrd_network/utils.py +++ b/src/ocrd_network/utils.py @@ -151,7 +151,7 @@ def is_mets_server_running(mets_server_url: str, ws_dir_path: str = None) -> boo return False -def stop_mets_server(mets_server_url: str, ws_dir_path: Path = None) -> bool: +def stop_mets_server(mets_server_url: str, ws_dir_path: str = None) -> bool: protocol = "tcp" if (mets_server_url.startswith("http://") or mets_server_url.startswith("https://")) else "uds" session = Session_TCP() if protocol == "tcp" else Session_UDS() if protocol == "uds": @@ -160,7 +160,7 @@ def stop_mets_server(mets_server_url: str, ws_dir_path: Path = None) -> bool: if 'tcp_mets' in mets_server_url: if not ws_dir_path: return False - response = session.post(url=f"{mets_server_url}", json=MpxReq.stop(str(ws_dir_path))) + response = session.post(url=f"{mets_server_url}", json=MpxReq.stop(ws_dir_path)) else: response = session.delete(url=f"{mets_server_url}/") except Exception: From 387dc3085ebe831fd1beb3937f7a8b4b60197123 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 4 Oct 2024 18:04:26 +0200 Subject: [PATCH 076/119] add logger param to stop mets server --- src/ocrd_network/processing_server.py | 1 - src/ocrd_network/runtime_data/deployer.py | 2 +- src/ocrd_network/utils.py | 11 +++++++++-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/ocrd_network/processing_server.py b/src/ocrd_network/processing_server.py index edae6733c0..59243d52fe 100644 --- a/src/ocrd_network/processing_server.py +++ b/src/ocrd_network/processing_server.py @@ -601,7 +601,6 @@ async def _consume_cached_jobs_of_workspace( # Shut down the Mets Server for the workspace_key since no # more internal callbacks are expected for that workspace self.log.debug(f"Stopping the mets server: {mets_server_url}") - self.deployer.stop_uds_mets_server(mets_server_url=mets_server_url, path_to_mets=path_to_mets) try: diff --git a/src/ocrd_network/runtime_data/deployer.py b/src/ocrd_network/runtime_data/deployer.py index f60194ce4e..16207154b3 100644 --- a/src/ocrd_network/runtime_data/deployer.py +++ b/src/ocrd_network/runtime_data/deployer.py @@ -182,5 +182,5 @@ def stop_uds_mets_server(self, mets_server_url: str, path_to_mets: str, stop_wit # on the last request processed by the processing worker. # Sometimes 3 seconds is enough, sometimes not. 
sleep(5) - stop_mets_server(mets_server_url=mets_server_url, ws_dir_path=Path(path_to_mets).parent) + stop_mets_server(self.log, mets_server_url=mets_server_url, ws_dir_path=str(Path(path_to_mets).parent)) return diff --git a/src/ocrd_network/utils.py b/src/ocrd_network/utils.py index a2f563de43..7747e5ea6f 100644 --- a/src/ocrd_network/utils.py +++ b/src/ocrd_network/utils.py @@ -4,6 +4,7 @@ from functools import wraps from hashlib import md5 from json import loads +from logging import Logger from pathlib import Path from re import compile as re_compile, split as re_split from requests import get as requests_get, Session as Session_TCP @@ -151,7 +152,7 @@ def is_mets_server_running(mets_server_url: str, ws_dir_path: str = None) -> boo return False -def stop_mets_server(mets_server_url: str, ws_dir_path: str = None) -> bool: +def stop_mets_server(logger: Logger, mets_server_url: str, ws_dir_path: str = None) -> bool: protocol = "tcp" if (mets_server_url.startswith("http://") or mets_server_url.startswith("https://")) else "uds" session = Session_TCP() if protocol == "tcp" else Session_UDS() if protocol == "uds": @@ -159,9 +160,15 @@ def stop_mets_server(mets_server_url: str, ws_dir_path: str = None) -> bool: try: if 'tcp_mets' in mets_server_url: if not ws_dir_path: + logger.warning("Multiplexing through the Processing Server to reach a mets server but no workspace " + "path is specified. There is no way for the Processing Server to know to which Mets " + "Server the incoming requests should be forwarded.") return False - response = session.post(url=f"{mets_server_url}", json=MpxReq.stop(ws_dir_path)) + request_json = MpxReq.stop(ws_dir_path) + logger.info(f"Sending POST request to: {mets_server_url}, request_json: {request_json}") + response = session.post(url=f"{mets_server_url}", json=request_json) else: + logger.info(f"Sending DELETE request to: {mets_server_url}/") response = session.delete(url=f"{mets_server_url}/") except Exception: return False From 07953f76042f977a9a60df70da8f30688357bde9 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 4 Oct 2024 18:06:21 +0200 Subject: [PATCH 077/119] add extensive logging to mets proxy --- src/ocrd_network/tcp_to_uds_mets_proxy.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/ocrd_network/tcp_to_uds_mets_proxy.py b/src/ocrd_network/tcp_to_uds_mets_proxy.py index 176f4f1442..4fa2f3ea70 100644 --- a/src/ocrd_network/tcp_to_uds_mets_proxy.py +++ b/src/ocrd_network/tcp_to_uds_mets_proxy.py @@ -34,6 +34,10 @@ def forward_tcp_request(self, request_body) -> Dict: ws_unix_socket_url = f'http+unix://{ws_socket_file.replace("/", "%2F")}' uds_request_url = f"{ws_unix_socket_url}/{request_url}" + self.log.info(f"Forwarding TCP mets server request to UDS url: {uds_request_url}") + self.log.info(f"Forwarding method type {method_type}, request data: {request_data}, " + f"expected response type: {response_type}") + if not request_data: response = self.session.request(method_type, uds_request_url) elif "params" in request_data: From 3a9e1479f722465452d70905466418d09ff2f4f7 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 4 Oct 2024 18:12:13 +0200 Subject: [PATCH 078/119] return empty response type earlier --- src/ocrd_network/tcp_to_uds_mets_proxy.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/ocrd_network/tcp_to_uds_mets_proxy.py b/src/ocrd_network/tcp_to_uds_mets_proxy.py index 4fa2f3ea70..e110978713 100644 --- a/src/ocrd_network/tcp_to_uds_mets_proxy.py +++ b/src/ocrd_network/tcp_to_uds_mets_proxy.py 
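# A short sketch of the forwarding step that MetsServerProxy.forward_tcp_request
# performs: an ordinary HTTP request sent over a Unix domain socket via
# requests_unixsocket. The socket path and endpoint in the usage comment are
# made-up examples; the real values come from the request body built by MpxReq.
import requests_unixsocket

def forward(method: str, socket_path: str, endpoint: str, **kwargs):
    session = requests_unixsocket.Session()
    url = "http+unix://" + socket_path.replace("/", "%2F") + "/" + endpoint
    # same shape as in the proxy: plain request, optionally with params/json/data
    return session.request(method, url, **kwargs)

# e.g. forward("GET", "/tmp/ocrd_network_sockets/ws1.sock", "file_groups").json()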
@@ -49,12 +49,11 @@ def forward_tcp_request(self, request_body) -> Dict: else: raise ValueError("Expecting request_data to be empty or containing single key: params," f"form, or class but not {request_data.keys}") - + if response_type == "empty": + return {} if not response: self.log.error(f"Uds-Mets-Server gives unexpected error. Response: {response.__dict__}") return {"error": response.text} - elif response_type == "empty": - return {} elif response_type == "text": return {"text": response.text} elif response_type == "class" or response_type == "dict": From 00655b82f0409b4811324cf40a788e83ae9dd6c8 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 4 Oct 2024 20:02:00 +0200 Subject: [PATCH 079/119] fix: change UDS file deletion place --- src/ocrd/mets_server.py | 11 +++--- src/ocrd_network/tcp_to_uds_mets_proxy.py | 4 +-- src/ocrd_network/utils.py | 42 +++++++++++++---------- 3 files changed, 30 insertions(+), 27 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 4b4ffa728f..f3dfd5ea64 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -437,11 +437,8 @@ def kill_process(mets_server_pid: int): except ProcessLookupError as e: pass - def shutdown(self): - if self.is_uds: - if Path(self.url).exists(): - self.log.warning(f"Due to a server shutdown, removing the existing UDS socket file: {self.url}") - Path(self.url).unlink() + @staticmethod + def shutdown(): # os._exit because uvicorn catches SystemExit raised by sys.exit _exit(0) @@ -472,7 +469,8 @@ def save(): """ Write current changes to the file system """ - return workspace.save_mets() + workspace.save_mets() + return Response(status_code=200, content="The Mets Server is writing changes to disk.") @app.delete(path='/') async def stop(): @@ -482,6 +480,7 @@ async def stop(): getLogger('ocrd.models.ocrd_mets').info(f'Shutting down METS Server {self.url}') workspace.save_mets() self.shutdown() + return Response(status_code=200, content="The Mets Server is shutting down...") @app.post(path='/reload') async def workspace_reload_mets(): diff --git a/src/ocrd_network/tcp_to_uds_mets_proxy.py b/src/ocrd_network/tcp_to_uds_mets_proxy.py index e110978713..3f335435ab 100644 --- a/src/ocrd_network/tcp_to_uds_mets_proxy.py +++ b/src/ocrd_network/tcp_to_uds_mets_proxy.py @@ -1,5 +1,5 @@ from requests_unixsocket import Session as requests_unixsocket_session -from .utils import get_uds_path +from .utils import get_uds_path, convert_url_to_uds_format from typing import Dict from ocrd_utils import getLogger @@ -31,7 +31,7 @@ def forward_tcp_request(self, request_body) -> Dict: if method_type not in SUPPORTED_METHOD_TYPES: raise NotImplementedError(f"Method type: {method_type} not recognized") ws_socket_file = str(get_uds_path(ws_dir_path=ws_dir_path)) - ws_unix_socket_url = f'http+unix://{ws_socket_file.replace("/", "%2F")}' + ws_unix_socket_url = convert_url_to_uds_format(ws_socket_file) uds_request_url = f"{ws_unix_socket_url}/{request_url}" self.log.info(f"Forwarding TCP mets server request to UDS url: {uds_request_url}") diff --git a/src/ocrd_network/utils.py b/src/ocrd_network/utils.py index 7747e5ea6f..eebb5a3ba1 100644 --- a/src/ocrd_network/utils.py +++ b/src/ocrd_network/utils.py @@ -152,28 +152,32 @@ def is_mets_server_running(mets_server_url: str, ws_dir_path: str = None) -> boo return False -def stop_mets_server(logger: Logger, mets_server_url: str, ws_dir_path: str = None) -> bool: +def stop_mets_server(logger: Logger, mets_server_url: str, ws_dir_path: str) -> bool: protocol = "tcp" if 
(mets_server_url.startswith("http://") or mets_server_url.startswith("https://")) else "uds" - session = Session_TCP() if protocol == "tcp" else Session_UDS() + # If the mets server URL is the proxy endpoint + if protocol == "tcp" and "tcp_mets" in mets_server_url: + # Convert the mets server url to UDS format + ws_socket_file = str(get_uds_path(ws_dir_path)) + mets_server_url = convert_url_to_uds_format(ws_socket_file) + protocol = "uds" + if protocol == "tcp": + request_json = MpxReq.stop(ws_dir_path) + logger.info(f"Sending POST request to: {mets_server_url}, request_json: {request_json}") + response = Session_TCP().post(url=f"{mets_server_url}", json=request_json) + return response.status_code == 200 + elif protocol == "uds": + logger.info(f"Sending DELETE request to: {mets_server_url}/") + response = Session_UDS().delete(url=f"{mets_server_url}/") + return response.status_code == 200 + else: + ValueError(f"Unexpected protocol type: {protocol}") if protocol == "uds": - mets_server_url = convert_url_to_uds_format(mets_server_url) - try: - if 'tcp_mets' in mets_server_url: - if not ws_dir_path: - logger.warning("Multiplexing through the Processing Server to reach a mets server but no workspace " - "path is specified. There is no way for the Processing Server to know to which Mets " - "Server the incoming requests should be forwarded.") - return False - request_json = MpxReq.stop(ws_dir_path) - logger.info(f"Sending POST request to: {mets_server_url}, request_json: {request_json}") - response = session.post(url=f"{mets_server_url}", json=request_json) + ws_socket_file = str(get_uds_path(ws_dir_path)) + if Path(ws_socket_file).exists(): + logger.info(f"Removing the inactive UDS file: {ws_socket_file}") + Path(ws_socket_file).unlink() else: - logger.info(f"Sending DELETE request to: {mets_server_url}/") - response = session.delete(url=f"{mets_server_url}/") - except Exception: - return False - return response.status_code == 200 - + logger.warning(f"The UDS file to be removed is not existing: {ws_socket_file}") def get_uds_path(ws_dir_path: str) -> Path: return Path(config.OCRD_NETWORK_SOCKETS_ROOT_DIR, f"{safe_filename(ws_dir_path)}.sock") From 810f8111a6a85db465a6becad0ca721d91ed4b73 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 4 Oct 2024 20:22:50 +0200 Subject: [PATCH 080/119] return response from mets server before dying --- src/ocrd/mets_server.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index f3dfd5ea64..b5773d978e 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -440,7 +440,8 @@ def kill_process(mets_server_pid: int): @staticmethod def shutdown(): # os._exit because uvicorn catches SystemExit raised by sys.exit - _exit(0) + # _exit(0) + os.kill(os.getpid(), signal.SIGTERM) def startup(self): self.log.info("Starting up METS server") From 4970e6238cd51d03abc358b82cb8b100175061f1 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 4 Oct 2024 20:23:25 +0200 Subject: [PATCH 081/119] fix: remove UDS file correctly --- src/ocrd_network/utils.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/ocrd_network/utils.py b/src/ocrd_network/utils.py index eebb5a3ba1..3dfa71e5f3 100644 --- a/src/ocrd_network/utils.py +++ b/src/ocrd_network/utils.py @@ -167,17 +167,19 @@ def stop_mets_server(logger: Logger, mets_server_url: str, ws_dir_path: str) -> return response.status_code == 200 elif protocol == "uds": logger.info(f"Sending DELETE request 
to: {mets_server_url}/") - response = Session_UDS().delete(url=f"{mets_server_url}/") - return response.status_code == 200 + try: + response = Session_UDS().delete(url=f"{mets_server_url}/") + return response.status_code == 200 + finally: + if protocol == "uds": + ws_socket_file = str(get_uds_path(ws_dir_path)) + if Path(ws_socket_file).exists(): + logger.info(f"Removing the inactive UDS file: {ws_socket_file}") + Path(ws_socket_file).unlink() + else: + logger.warning(f"The UDS file to be removed is not existing: {ws_socket_file}") else: ValueError(f"Unexpected protocol type: {protocol}") - if protocol == "uds": - ws_socket_file = str(get_uds_path(ws_dir_path)) - if Path(ws_socket_file).exists(): - logger.info(f"Removing the inactive UDS file: {ws_socket_file}") - Path(ws_socket_file).unlink() - else: - logger.warning(f"The UDS file to be removed is not existing: {ws_socket_file}") def get_uds_path(ws_dir_path: str) -> Path: return Path(config.OCRD_NETWORK_SOCKETS_ROOT_DIR, f"{safe_filename(ws_dir_path)}.sock") From 906766d38f4dcc583511f51fbe6d9b39b48ab74c Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 4 Oct 2024 20:33:52 +0200 Subject: [PATCH 082/119] comment out irrelevant code --- src/ocrd/mets_server.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index b5773d978e..b8bd99b6a3 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -444,7 +444,7 @@ def shutdown(): os.kill(os.getpid(), signal.SIGTERM) def startup(self): - self.log.info("Starting up METS server") + self.log.info(f"Starting up METS server: {self.url}") workspace = self.workspace @@ -564,9 +564,12 @@ async def add_file( # Create socket and change to world-readable and -writable to avoid permission errors self.log.debug(f"chmod 0o677 {self.url}") server = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + # TODO: Not required after #1284, consider removing + """ if Path(self.url).exists() and not is_socket_in_use(self.url): # remove leftover unused socket which blocks startup Path(self.url).unlink() + """ server.bind(self.url) # creates the socket file atexit.register(self.shutdown) server.close() @@ -581,7 +584,7 @@ async def add_file( self.log.debug("Starting uvicorn") uvicorn.run(app, **uvicorn_kwargs) - +# TODO: Not required after #1284, consider removing def is_socket_in_use(socket_path): if Path(socket_path).exists(): client = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) From a87a2e111a681ebed356401e75560edd5cd1ba7b Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 4 Oct 2024 22:04:20 +0200 Subject: [PATCH 083/119] fix: no more zombies, yay! --- src/ocrd_network/runtime_data/deployer.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/ocrd_network/runtime_data/deployer.py b/src/ocrd_network/runtime_data/deployer.py index 16207154b3..7aec568071 100644 --- a/src/ocrd_network/runtime_data/deployer.py +++ b/src/ocrd_network/runtime_data/deployer.py @@ -8,6 +8,7 @@ """ from __future__ import annotations from pathlib import Path +import psutil from time import sleep from typing import Dict, List, Union @@ -182,5 +183,13 @@ def stop_uds_mets_server(self, mets_server_url: str, path_to_mets: str, stop_wit # on the last request processed by the processing worker. # Sometimes 3 seconds is enough, sometimes not. 
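# A sketch of the shutdown pattern introduced in this commit: request the METS
# Server to stop over its socket, then use psutil to wait until the process has
# really exited, instead of sleeping for a fixed number of seconds.
# request_shutdown is a placeholder for the DELETE call that stop_mets_server() sends.
import psutil

def stop_and_wait(pid: int, request_shutdown, timeout: float = 30.0) -> None:
    proc = psutil.Process(pid)      # raises psutil.NoSuchProcess if already gone
    request_shutdown()              # e.g. HTTP DELETE against the server socket
    if proc.is_running():
        proc.wait(timeout=timeout)  # blocks until exit, raises TimeoutExpired otherwise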
sleep(5) + mets_server_pid = self.mets_servers[str(self.mets_servers_paths[str(Path(path_to_mets).parent)])] + self.log.info(f"Terminating mets server with pid: {mets_server_pid}") + p = psutil.Process(mets_server_pid) stop_mets_server(self.log, mets_server_url=mets_server_url, ws_dir_path=str(Path(path_to_mets).parent)) + if p.is_running(): + p.wait() + self.log.info(f"Terminated mets server with pid: {mets_server_pid}") + else: + self.log.info(f"Mets server has already terminated with pid: {mets_server_pid}") return From e0ff4ebd3ea200a73b200375e75a4886eb1941fc Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 4 Oct 2024 22:37:39 +0200 Subject: [PATCH 084/119] add: extensive logging of mets server to file --- src/ocrd/mets_server.py | 56 ++++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 17 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index b8bd99b6a3..c6448b1d81 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -437,14 +437,15 @@ def kill_process(mets_server_pid: int): except ProcessLookupError as e: pass - @staticmethod - def shutdown(): + def shutdown(self): # os._exit because uvicorn catches SystemExit raised by sys.exit # _exit(0) - os.kill(os.getpid(), signal.SIGTERM) + pid = os.getpid() + self.log.info(f"Shutdown method of mets server[{pid}] invoked, sending SIGTERM signal.") + os.kill(pid, signal.SIGTERM) def startup(self): - self.log.info(f"Starting up METS server: {self.url}") + self.log.info(f"Configuring up the Mets Server") workspace = self.workspace @@ -471,17 +472,20 @@ def save(): Write current changes to the file system """ workspace.save_mets() - return Response(status_code=200, content="The Mets Server is writing changes to disk.") + response = Response(content="The Mets Server is writing changes to disk.", media_type='text/plain') + self.log.info(f"PUT / -> {response.__dict__}") + return response @app.delete(path='/') async def stop(): """ Stop the mets server """ - getLogger('ocrd.models.ocrd_mets').info(f'Shutting down METS Server {self.url}') workspace.save_mets() + response = Response(content="The Mets Server will shut down soon...", media_type='text/plain') self.shutdown() - return Response(status_code=200, content="The Mets Server is shutting down...") + self.log.info(f"POST /reload -> {response.__dict__}") + return response @app.post(path='/reload') async def workspace_reload_mets(): @@ -489,34 +493,48 @@ async def workspace_reload_mets(): Reload mets file from the file system """ workspace.reload_mets() - return Response(content=f'Reloaded from {workspace.directory}', media_type="text/plain") + response = Response(content=f"Reloaded from {workspace.directory}", media_type='text/plain') + self.log.info(f"POST /reload -> {response.__dict__}") + return response @app.get(path='/unique_identifier', response_model=str) async def unique_identifier(): - return Response(content=workspace.mets.unique_identifier, media_type='text/plain') + response = Response(content=workspace.mets.unique_identifier, media_type='text/plain') + self.log.info(f"GET /unique_identifier -> {response.__dict__}") + return response @app.get(path='/workspace_path', response_model=str) async def workspace_path(): - return Response(content=workspace.directory, media_type="text/plain") + response = Response(content=workspace.directory, media_type="text/plain") + self.log.info(f"GET /workspace_path -> {response.__dict__}") + return response @app.get(path='/physical_pages', response_model=OcrdPageListModel) async def 
physical_pages(): - return {'physical_pages': workspace.mets.physical_pages} + response = {'physical_pages': workspace.mets.physical_pages} + self.log.info(f"GET /physical_pages -> {response.__dict__}") + return response @app.get(path='/file_groups', response_model=OcrdFileGroupListModel) async def file_groups(): - return {'file_groups': workspace.mets.file_groups} + response = {'file_groups': workspace.mets.file_groups} + self.log.info(f"GET /file_groups -> {response.__dict__}") + return response @app.get(path='/agent', response_model=OcrdAgentListModel) async def agents(): - return OcrdAgentListModel.create(workspace.mets.agents) + response = OcrdAgentListModel.create(workspace.mets.agents) + self.log.info(f"GET /agent -> {response.__dict__}") + return response @app.post(path='/agent', response_model=OcrdAgentModel) async def add_agent(agent: OcrdAgentModel): kwargs = agent.dict() kwargs['_type'] = kwargs.pop('type') workspace.mets.add_agent(**kwargs) - return agent + response = agent + self.log.info(f"POST /agent -> {response.__dict__}") + return response @app.get(path="/file", response_model=OcrdFileListModel) async def find_files( @@ -533,7 +551,9 @@ async def find_files( found = workspace.mets.find_all_files( fileGrp=file_grp, ID=file_id, pageId=page_id, mimetype=mimetype, local_filename=local_filename, url=url ) - return OcrdFileListModel.create(found) + response = OcrdFileListModel.create(found) + self.log.info(f"GET /file -> {response.__dict__}") + return response @app.post(path='/file', response_model=OcrdFileModel) async def add_file( @@ -556,7 +576,9 @@ async def add_file( # Add to workspace kwargs = file_resource.dict() workspace.add_file(**kwargs, force=force) - return file_resource + response = file_resource + self.log.info(f"POST /file -> {response.__dict__}") + return response # ------------- # @@ -581,7 +603,7 @@ async def add_file( uvicorn_kwargs['log_config'] = None uvicorn_kwargs['access_log'] = False - self.log.debug("Starting uvicorn") + self.log.info("Starting the uvicorn Mets Server") uvicorn.run(app, **uvicorn_kwargs) # TODO: Not required after #1284, consider removing From 53c8f3f5ed2f3acb4d63eee01c2570801a8178ee Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 4 Oct 2024 22:44:07 +0200 Subject: [PATCH 085/119] change cache debug -> info for extensive logging to file --- src/ocrd_network/server_cache.py | 45 ++++++++++++++++---------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/src/ocrd_network/server_cache.py b/src/ocrd_network/server_cache.py index b57f3fd235..78e53bd238 100644 --- a/src/ocrd_network/server_cache.py +++ b/src/ocrd_network/server_cache.py @@ -31,7 +31,7 @@ def check_if_locked_pages_for_output_file_grps( self, workspace_key: str, output_file_grps: List[str], page_ids: List[str] ) -> bool: if not self.locked_pages.get(workspace_key, None): - self.log.debug(f"No entry found in the locked pages cache for workspace key: {workspace_key}") + self.log.info(f"No entry found in the locked pages cache for workspace key: {workspace_key}") return False debug_message = f"Caching the received request due to locked output file grp pages." 
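# A reduced sketch of the bookkeeping behind this cache: per workspace, per output
# fileGrp, a list of locked page IDs, with a placeholder value meaning "all pages".
# Names are simplified; the real class additionally logs and caches job requests.
from typing import Dict, List

ALL_PAGES = "all_pages"  # stand-in for self.placeholder_all_pages

locked: Dict[str, Dict[str, List[str]]] = {}

def lock(workspace: str, file_grps: List[str], page_ids: List[str]) -> None:
    grps = locked.setdefault(workspace, {})
    for grp in file_grps:
        grps.setdefault(grp, []).extend(page_ids if page_ids else [ALL_PAGES])

def is_locked(workspace: str, file_grps: List[str], page_ids: List[str]) -> bool:
    grps = locked.get(workspace, {})
    for grp in file_grps:
        held = grps.get(grp, [])
        if ALL_PAGES in held or any(p in held for p in page_ids):
            return True
    return False

def unlock(workspace: str, file_grps: List[str], page_ids: List[str]) -> None:
    grps = locked.get(workspace, {})
    for grp in file_grps:
        if grp not in grps:
            continue
        if page_ids:
            grps[grp] = [p for p in grps[grp] if p not in page_ids]
        elif ALL_PAGES in grps[grp]:
            grps[grp].remove(ALL_PAGES)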
for file_group in output_file_grps: @@ -46,46 +46,45 @@ def check_if_locked_pages_for_output_file_grps( def get_locked_pages(self, workspace_key: str) -> Dict[str, List[str]]: if not self.locked_pages.get(workspace_key, None): - self.log.debug(f"No locked pages available for workspace key: {workspace_key}") + self.log.info(f"No locked pages available for workspace key: {workspace_key}") return {} return self.locked_pages[workspace_key] def lock_pages(self, workspace_key: str, output_file_grps: List[str], page_ids: List[str]) -> None: if not self.locked_pages.get(workspace_key, None): - self.log.debug(f"No entry found in the locked pages cache for workspace key: {workspace_key}") - self.log.debug(f"Creating an entry in the locked pages cache for workspace key: {workspace_key}") + self.log.info(f"No entry found in the locked pages cache for workspace key: {workspace_key}") + self.log.info(f"Creating an entry in the locked pages cache for workspace key: {workspace_key}") self.locked_pages[workspace_key] = {} for file_group in output_file_grps: if file_group not in self.locked_pages[workspace_key]: - self.log.debug(f"Creating an empty list for output file grp: {file_group}") + self.log.info(f"Creating an empty list for output file grp: {file_group}") self.locked_pages[workspace_key][file_group] = [] # The page id list is not empty - only some pages are in the request if page_ids: - self.log.debug(f"Locking pages for '{file_group}': {page_ids}") + self.log.info(f"Locking pages for '{file_group}': {page_ids}") self.locked_pages[workspace_key][file_group].extend(page_ids) - self.log.debug(f"Locked pages of '{file_group}': " - f"{self.locked_pages[workspace_key][file_group]}") + self.log.info(f"Locked pages of '{file_group}': {self.locked_pages[workspace_key][file_group]}") else: # Lock all pages with a single value - self.log.debug(f"Locking pages for '{file_group}': {self.placeholder_all_pages}") + self.log.info(f"Locking pages for '{file_group}': {self.placeholder_all_pages}") self.locked_pages[workspace_key][file_group].append(self.placeholder_all_pages) def unlock_pages(self, workspace_key: str, output_file_grps: List[str], page_ids: List[str]) -> None: if not self.locked_pages.get(workspace_key, None): - self.log.debug(f"No entry found in the locked pages cache for workspace key: {workspace_key}") + self.log.info(f"No entry found in the locked pages cache for workspace key: {workspace_key}") return for file_group in output_file_grps: if file_group in self.locked_pages[workspace_key]: if page_ids: # Unlock the previously locked pages - self.log.debug(f"Unlocking pages of '{file_group}': {page_ids}") + self.log.info(f"Unlocking pages of '{file_group}': {page_ids}") self.locked_pages[workspace_key][file_group] = \ [x for x in self.locked_pages[workspace_key][file_group] if x not in page_ids] - self.log.debug(f"Remaining locked pages of '{file_group}': " - f"{self.locked_pages[workspace_key][file_group]}") + self.log.info(f"Remaining locked pages of '{file_group}': " + f"{self.locked_pages[workspace_key][file_group]}") else: # Remove the single variable used to indicate all pages are locked - self.log.debug(f"Unlocking all pages for: {file_group}") + self.log.info(f"Unlocking all pages for: {file_group}") self.locked_pages[workspace_key][file_group].remove(self.placeholder_all_pages) @@ -127,11 +126,11 @@ def __print_job_input_debug_message(self, job_input: PYJobInput): debug_message += f", page ids: {job_input.page_id}" debug_message += f", job id: {job_input.job_id}" debug_message += f", job 
depends on: {job_input.depends_on}" - self.log.debug(debug_message) + self.log.info(debug_message) async def consume_cached_requests(self, workspace_key: str) -> List[PYJobInput]: if not self.has_workspace_cached_requests(workspace_key=workspace_key): - self.log.debug(f"No jobs to be consumed for workspace key: {workspace_key}") + self.log.info(f"No jobs to be consumed for workspace key: {workspace_key}") return [] found_consume_requests = [] for current_element in self.processing_requests[workspace_key]: @@ -165,7 +164,7 @@ def update_request_counter(self, workspace_key: str, by_value: int) -> int: # If a record counter of this workspace key does not exist # in the requests counter cache yet, create one and assign 0 if not self.processing_counter.get(workspace_key, None): - self.log.debug(f"Creating an internal request counter for workspace key: {workspace_key}") + self.log.info(f"Creating an internal request counter for workspace key: {workspace_key}") self.processing_counter[workspace_key] = 0 self.processing_counter[workspace_key] = self.processing_counter[workspace_key] + by_value return self.processing_counter[workspace_key] @@ -173,7 +172,7 @@ def update_request_counter(self, workspace_key: str, by_value: int) -> int: def cache_request(self, workspace_key: str, data: PYJobInput): # If a record queue of this workspace key does not exist in the requests cache if not self.processing_requests.get(workspace_key, None): - self.log.debug(f"Creating an internal request queue for workspace_key: {workspace_key}") + self.log.info(f"Creating an internal request queue for workspace_key: {workspace_key}") self.processing_requests[workspace_key] = [] self.__print_job_input_debug_message(job_input=data) # Add the processing request to the end of the internal queue @@ -181,9 +180,9 @@ def cache_request(self, workspace_key: str, data: PYJobInput): async def cancel_dependent_jobs(self, workspace_key: str, processing_job_id: str) -> List[PYJobInput]: if not self.has_workspace_cached_requests(workspace_key=workspace_key): - self.log.debug(f"No jobs to be cancelled for workspace key: {workspace_key}") + self.log.info(f"No jobs to be cancelled for workspace key: {workspace_key}") return [] - self.log.debug(f"Cancelling jobs dependent on job id: {processing_job_id}") + self.log.info(f"Cancelling jobs dependent on job id: {processing_job_id}") found_cancel_requests = [] for i, current_element in enumerate(self.processing_requests[workspace_key]): if processing_job_id in current_element.depends_on: @@ -192,7 +191,7 @@ async def cancel_dependent_jobs(self, workspace_key: str, processing_job_id: str for cancel_element in found_cancel_requests: try: self.processing_requests[workspace_key].remove(cancel_element) - self.log.debug(f"For job id: '{processing_job_id}', cancelling job id: '{cancel_element.job_id}'") + self.log.info(f"For job id: '{processing_job_id}', cancelling job id: '{cancel_element.job_id}'") cancelled_jobs.append(cancel_element) await db_update_processing_job(job_id=cancel_element.job_id, state=JobState.cancelled) # Recursively cancel dependent jobs for the cancelled job @@ -225,9 +224,9 @@ async def sync_is_caching_required(self, job_dependencies: List[str]) -> bool: def has_workspace_cached_requests(self, workspace_key: str) -> bool: if not self.processing_requests.get(workspace_key, None): - self.log.debug(f"In processing requests cache, no workspace key found: {workspace_key}") + self.log.info(f"In processing requests cache, no workspace key found: {workspace_key}") return False if not 
len(self.processing_requests[workspace_key]): - self.log.debug(f"The processing requests cache is empty for workspace key: {workspace_key}") + self.log.info(f"The processing requests cache is empty for workspace key: {workspace_key}") return False return True From fe41223efe29bfeb6bb7e58d3c69db4e14a6f248 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 4 Oct 2024 22:48:25 +0200 Subject: [PATCH 086/119] set log from info to debug --- src/ocrd_network/runtime_data/deployer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd_network/runtime_data/deployer.py b/src/ocrd_network/runtime_data/deployer.py index 7aec568071..aa7ff5eb05 100644 --- a/src/ocrd_network/runtime_data/deployer.py +++ b/src/ocrd_network/runtime_data/deployer.py @@ -162,8 +162,8 @@ def start_uds_mets_server(self, ws_dir_path: str) -> Path: def stop_uds_mets_server(self, mets_server_url: str, path_to_mets: str, stop_with_pid: bool = False) -> None: self.log.info(f"Stopping UDS mets server: {mets_server_url}") self.log.info(f"Path to the mets file: {path_to_mets}") - self.log.info(f"mets_server: {self.mets_servers}") - self.log.info(f"mets_server_paths: {self.mets_servers_paths}") + self.log.debug(f"mets_server: {self.mets_servers}") + self.log.debug(f"mets_server_paths: {self.mets_servers_paths}") if stop_with_pid: mets_server_url_uds = self.mets_servers_paths[str(Path(path_to_mets).parent)] if Path(mets_server_url_uds) not in self.mets_servers: From 55c2f6357f1b83508b3e2eb305bdb9e65afb4fa2 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 4 Oct 2024 22:56:36 +0200 Subject: [PATCH 087/119] fix: typo --- src/ocrd/mets_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index c6448b1d81..d2e0bb51e0 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -484,7 +484,7 @@ async def stop(): workspace.save_mets() response = Response(content="The Mets Server will shut down soon...", media_type='text/plain') self.shutdown() - self.log.info(f"POST /reload -> {response.__dict__}") + self.log.info(f"DELETE / -> {response.__dict__}") return response @app.post(path='/reload') From bf6616f1821e33fcda2376338d7419f8fba73a04 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 4 Oct 2024 23:04:53 +0200 Subject: [PATCH 088/119] improve: delete socket file more appropriately --- src/ocrd/mets_server.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index d2e0bb51e0..57db0e4653 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -443,6 +443,10 @@ def shutdown(self): pid = os.getpid() self.log.info(f"Shutdown method of mets server[{pid}] invoked, sending SIGTERM signal.") os.kill(pid, signal.SIGTERM) + if self.is_uds: + if Path(self.url).exists(): + self.log.warning(f"Due to a server shutdown, removing the existing UDS socket file: {self.url}") + Path(self.url).unlink() def startup(self): self.log.info(f"Configuring up the Mets Server") From bc8a03bd8f8771790d14e51ec054d70b476454f6 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 4 Oct 2024 23:07:54 +0200 Subject: [PATCH 089/119] remove: unnecessary code --- src/ocrd_network/utils.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/src/ocrd_network/utils.py b/src/ocrd_network/utils.py index 3dfa71e5f3..5abe2104fd 100644 --- a/src/ocrd_network/utils.py +++ b/src/ocrd_network/utils.py @@ -167,17 +167,8 @@ def stop_mets_server(logger: Logger, 
mets_server_url: str, ws_dir_path: str) -> return response.status_code == 200 elif protocol == "uds": logger.info(f"Sending DELETE request to: {mets_server_url}/") - try: - response = Session_UDS().delete(url=f"{mets_server_url}/") - return response.status_code == 200 - finally: - if protocol == "uds": - ws_socket_file = str(get_uds_path(ws_dir_path)) - if Path(ws_socket_file).exists(): - logger.info(f"Removing the inactive UDS file: {ws_socket_file}") - Path(ws_socket_file).unlink() - else: - logger.warning(f"The UDS file to be removed is not existing: {ws_socket_file}") + response = Session_UDS().delete(url=f"{mets_server_url}/") + return response.status_code == 200 else: ValueError(f"Unexpected protocol type: {protocol}") From 303488a5aa6d698f844e66107cb393be29ff1c14 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 4 Oct 2024 23:17:02 +0200 Subject: [PATCH 090/119] fix: .__dict__ of {} --- src/ocrd/mets_server.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 57db0e4653..b442e03bc1 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -516,13 +516,13 @@ async def workspace_path(): @app.get(path='/physical_pages', response_model=OcrdPageListModel) async def physical_pages(): response = {'physical_pages': workspace.mets.physical_pages} - self.log.info(f"GET /physical_pages -> {response.__dict__}") + self.log.info(f"GET /physical_pages -> {response}") return response @app.get(path='/file_groups', response_model=OcrdFileGroupListModel) async def file_groups(): response = {'file_groups': workspace.mets.file_groups} - self.log.info(f"GET /file_groups -> {response.__dict__}") + self.log.info(f"GET /file_groups -> {response}") return response @app.get(path='/agent', response_model=OcrdAgentListModel) From c8e0c731f9180bd7f9b939c21b6cb856a655cd3a Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 8 Oct 2024 10:23:49 +0200 Subject: [PATCH 091/119] Update src/ocrd/mets_server.py Co-authored-by: Konstantin Baierer --- src/ocrd/mets_server.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index b442e03bc1..d7b416af66 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -438,8 +438,6 @@ def kill_process(mets_server_pid: int): pass def shutdown(self): - # os._exit because uvicorn catches SystemExit raised by sys.exit - # _exit(0) pid = os.getpid() self.log.info(f"Shutdown method of mets server[{pid}] invoked, sending SIGTERM signal.") os.kill(pid, signal.SIGTERM) From 2cd4a64adc7103a1a996f686a01bcae23ccdd343 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 8 Oct 2024 10:24:08 +0200 Subject: [PATCH 092/119] Update src/ocrd/mets_server.py Co-authored-by: Konstantin Baierer --- src/ocrd/mets_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index d7b416af66..261b695a14 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -447,7 +447,7 @@ def shutdown(self): Path(self.url).unlink() def startup(self): - self.log.info(f"Configuring up the Mets Server") + self.log.info(f"Configuring the Mets Server") workspace = self.workspace From 44a8cebfb91de97fc4bc9ea9910ae7ba01243e5c Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 8 Oct 2024 10:24:26 +0200 Subject: [PATCH 093/119] Update src/ocrd/mets_server.py Co-authored-by: Konstantin Baierer --- src/ocrd/mets_server.py | 6 ------ 1 file changed, 6 deletions(-) diff --git 
a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 261b695a14..e45f48cef3 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -588,12 +588,6 @@ async def add_file( # Create socket and change to world-readable and -writable to avoid permission errors self.log.debug(f"chmod 0o677 {self.url}") server = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) - # TODO: Not required after #1284, consider removing - """ - if Path(self.url).exists() and not is_socket_in_use(self.url): - # remove leftover unused socket which blocks startup - Path(self.url).unlink() - """ server.bind(self.url) # creates the socket file atexit.register(self.shutdown) server.close() From 61c683f4c24330ae0397ad4baa7e21066473c9cb Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 8 Oct 2024 10:24:37 +0200 Subject: [PATCH 094/119] Update src/ocrd_network/runtime_data/deployer.py Co-authored-by: Konstantin Baierer --- src/ocrd_network/runtime_data/deployer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd_network/runtime_data/deployer.py b/src/ocrd_network/runtime_data/deployer.py index aa7ff5eb05..57b6d90819 100644 --- a/src/ocrd_network/runtime_data/deployer.py +++ b/src/ocrd_network/runtime_data/deployer.py @@ -191,5 +191,5 @@ def stop_uds_mets_server(self, mets_server_url: str, path_to_mets: str, stop_wit p.wait() self.log.info(f"Terminated mets server with pid: {mets_server_pid}") else: - self.log.info(f"Mets server has already terminated with pid: {mets_server_pid}") + self.log.info(f"Mets server with pid: {mets_server_pid} has already terminated.") return From 50553093180ac6b641273c08d559bbadf5e9b1d2 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 8 Oct 2024 10:25:41 +0200 Subject: [PATCH 095/119] remove unnecessary method --- src/ocrd/mets_server.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index e45f48cef3..9fb39861e3 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -601,14 +601,3 @@ async def add_file( self.log.info("Starting the uvicorn Mets Server") uvicorn.run(app, **uvicorn_kwargs) - -# TODO: Not required after #1284, consider removing -def is_socket_in_use(socket_path): - if Path(socket_path).exists(): - client = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) - try: - client.connect(socket_path) - except OSError: - return False - client.close() - return True From 34bfbf432d042fbdbc676aff233ed708cbcdab62 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 8 Oct 2024 14:40:58 +0200 Subject: [PATCH 096/119] fix: make stop() and ..reload..() sync --- src/ocrd/mets_server.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 9fb39861e3..774560a197 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -479,7 +479,7 @@ def save(): return response @app.delete(path='/') - async def stop(): + def stop(): """ Stop the mets server """ @@ -490,7 +490,7 @@ async def stop(): return response @app.post(path='/reload') - async def workspace_reload_mets(): + def workspace_reload_mets(): """ Reload mets file from the file system """ From ab660fbd0ff771c3e21af38185db331d2bf4121d Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 8 Oct 2024 15:11:29 +0200 Subject: [PATCH 097/119] fix: stop mets server when no cached requests --- src/ocrd_network/processing_server.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git 
a/src/ocrd_network/processing_server.py b/src/ocrd_network/processing_server.py index 59243d52fe..0431cf21f0 100644 --- a/src/ocrd_network/processing_server.py +++ b/src/ocrd_network/processing_server.py @@ -585,18 +585,13 @@ async def _cancel_cached_dependent_jobs(self, workspace_key: str, job_id: str) - async def _consume_cached_jobs_of_workspace( self, workspace_key: str, mets_server_url: str, path_to_mets: str ) -> List[PYJobInput]: - - # Check whether the internal queue for the workspace key still exists - if workspace_key not in self.cache_processing_requests.processing_requests: - self.log.debug(f"No internal queue available for workspace with key: {workspace_key}") - return [] - # decrease the internal cache counter by 1 request_counter = self.cache_processing_requests.update_request_counter( workspace_key=workspace_key, by_value=-1 ) self.log.debug(f"Internal processing job cache counter value: {request_counter}") - if not len(self.cache_processing_requests.processing_requests[workspace_key]): + if (workspace_key not in self.cache_processing_requests.processing_requests or + not len(self.cache_processing_requests.processing_requests[workspace_key])): if request_counter <= 0: # Shut down the Mets Server for the workspace_key since no # more internal callbacks are expected for that workspace @@ -617,6 +612,10 @@ async def _consume_cached_jobs_of_workspace( else: self.log.debug(f"Internal request cache is empty but waiting for {request_counter} result callbacks.") return [] + # Check whether the internal queue for the workspace key still exists + if workspace_key not in self.cache_processing_requests.processing_requests: + self.log.debug(f"No internal queue available for workspace with key: {workspace_key}") + return [] consumed_requests = await self.cache_processing_requests.consume_cached_requests(workspace_key=workspace_key) return consumed_requests From 148f8d42d2910547fd8397b5b2cfbab7e80853b8 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 8 Oct 2024 15:55:39 +0200 Subject: [PATCH 098/119] clean: remove pid kill flag in stop mets server --- src/ocrd_network/runtime_data/deployer.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/src/ocrd_network/runtime_data/deployer.py b/src/ocrd_network/runtime_data/deployer.py index 57b6d90819..2a01c2231b 100644 --- a/src/ocrd_network/runtime_data/deployer.py +++ b/src/ocrd_network/runtime_data/deployer.py @@ -159,25 +159,11 @@ def start_uds_mets_server(self, ws_dir_path: str) -> Path: self.mets_servers_paths[str(ws_dir_path)] = str(mets_server_url) return mets_server_url - def stop_uds_mets_server(self, mets_server_url: str, path_to_mets: str, stop_with_pid: bool = False) -> None: + def stop_uds_mets_server(self, mets_server_url: str, path_to_mets: str) -> None: self.log.info(f"Stopping UDS mets server: {mets_server_url}") self.log.info(f"Path to the mets file: {path_to_mets}") self.log.debug(f"mets_server: {self.mets_servers}") self.log.debug(f"mets_server_paths: {self.mets_servers_paths}") - if stop_with_pid: - mets_server_url_uds = self.mets_servers_paths[str(Path(path_to_mets).parent)] - if Path(mets_server_url_uds) not in self.mets_servers: - message = f"UDS Mets server not found at URL: {mets_server_url_uds}, mets path: {path_to_mets}" - self.log.warning(message) - mets_server_pid = self.mets_servers[str(mets_server_url_uds)] - self.log.info(f"Killing mets server pid: {mets_server_pid} of {mets_server_url_uds}") - OcrdMetsServer.kill_process(mets_server_pid=mets_server_pid) - 
self.log.info(f"Returning after the kill process") - if Path(mets_server_url_uds).exists(): - self.log.warning(f"Deployer is removing the existing UDS socket file: {mets_server_url_uds}") - Path(mets_server_url_uds).unlink() - self.log.info(f"Returning from the stop_uds_mets_server") - return # TODO: Reconsider this again # Not having this sleep here causes connection errors # on the last request processed by the processing worker. From dacd32517b7b2cdbc0417dffcab84b5d8710635c Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 8 Oct 2024 16:29:31 +0200 Subject: [PATCH 099/119] extend log: server cache requests --- src/ocrd_network/server_cache.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/ocrd_network/server_cache.py b/src/ocrd_network/server_cache.py index 78e53bd238..179a76139d 100644 --- a/src/ocrd_network/server_cache.py +++ b/src/ocrd_network/server_cache.py @@ -167,6 +167,7 @@ def update_request_counter(self, workspace_key: str, by_value: int) -> int: self.log.info(f"Creating an internal request counter for workspace key: {workspace_key}") self.processing_counter[workspace_key] = 0 self.processing_counter[workspace_key] = self.processing_counter[workspace_key] + by_value + self.log.info(f"The new request counter of {workspace_key}: {self.processing_counter[workspace_key]}") return self.processing_counter[workspace_key] def cache_request(self, workspace_key: str, data: PYJobInput): @@ -176,6 +177,7 @@ def cache_request(self, workspace_key: str, data: PYJobInput): self.processing_requests[workspace_key] = [] self.__print_job_input_debug_message(job_input=data) # Add the processing request to the end of the internal queue + self.log.info(f"Caching a processing request of {workspace_key}: {data.job_id}") self.processing_requests[workspace_key].append(data) async def cancel_dependent_jobs(self, workspace_key: str, processing_job_id: str) -> List[PYJobInput]: @@ -229,4 +231,6 @@ def has_workspace_cached_requests(self, workspace_key: str) -> bool: if not len(self.processing_requests[workspace_key]): self.log.info(f"The processing requests cache is empty for workspace key: {workspace_key}") return False + self.log.info(f"The processing requests cache has {len(self.processing_requests[workspace_key])} " + f"entries for workspace key: {workspace_key} ") return True From 05ded73dcff81aada33be32a0db976c33d0a84d1 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 8 Oct 2024 16:39:14 +0200 Subject: [PATCH 100/119] improve: sleep no longer needed --- src/ocrd_network/runtime_data/deployer.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/ocrd_network/runtime_data/deployer.py b/src/ocrd_network/runtime_data/deployer.py index 2a01c2231b..c35d94166b 100644 --- a/src/ocrd_network/runtime_data/deployer.py +++ b/src/ocrd_network/runtime_data/deployer.py @@ -164,11 +164,6 @@ def stop_uds_mets_server(self, mets_server_url: str, path_to_mets: str) -> None: self.log.info(f"Path to the mets file: {path_to_mets}") self.log.debug(f"mets_server: {self.mets_servers}") self.log.debug(f"mets_server_paths: {self.mets_servers_paths}") - # TODO: Reconsider this again - # Not having this sleep here causes connection errors - # on the last request processed by the processing worker. - # Sometimes 3 seconds is enough, sometimes not. 
- sleep(5) mets_server_pid = self.mets_servers[str(self.mets_servers_paths[str(Path(path_to_mets).parent)])] self.log.info(f"Terminating mets server with pid: {mets_server_pid}") p = psutil.Process(mets_server_pid) From 5d755a8fb7b77d94a052f10e73c7a98ecb098a0d Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 9 Oct 2024 09:17:41 +0200 Subject: [PATCH 101/119] add new env: OCRD_NETWORK_RABBITMQ_HEARTBEAT --- src/ocrd/cli/__init__.py | 2 ++ src/ocrd_network/rabbitmq_utils/connector.py | 4 ++-- src/ocrd_utils/config.py | 16 ++++++++++++++-- tests/network/config.py | 14 ++++++++++++-- 4 files changed, 30 insertions(+), 6 deletions(-) diff --git a/src/ocrd/cli/__init__.py b/src/ocrd/cli/__init__.py index 70d738f083..863b9af0d7 100644 --- a/src/ocrd/cli/__init__.py +++ b/src/ocrd/cli/__init__.py @@ -47,6 +47,8 @@ \b {config.describe('OCRD_NETWORK_RABBITMQ_CLIENT_CONNECT_ATTEMPTS')} \b +{config.describe('OCRD_NETWORK_RABBITMQ_HEARTBEAT')} +\b {config.describe('OCRD_PROFILE_FILE')} \b {config.describe('OCRD_PROFILE', wrap_text=False)} diff --git a/src/ocrd_network/rabbitmq_utils/connector.py b/src/ocrd_network/rabbitmq_utils/connector.py index 893d55a219..8fbbc84ab9 100644 --- a/src/ocrd_network/rabbitmq_utils/connector.py +++ b/src/ocrd_network/rabbitmq_utils/connector.py @@ -6,6 +6,7 @@ from typing import Any, Optional, Union from pika import BasicProperties, BlockingConnection, ConnectionParameters, PlainCredentials from pika.adapters.blocking_connection import BlockingChannel +from ocrd_utils import config from .constants import ( DEFAULT_EXCHANGER_NAME, DEFAULT_EXCHANGER_TYPE, @@ -69,8 +70,7 @@ def open_blocking_connection( port=port, virtual_host=vhost, credentials=credentials, - # TODO: The heartbeat should not be disabled (0)! - heartbeat=0 + heartbeat=config.OCRD_NETWORK_RABBITMQ_HEARTBEAT ), ) return blocking_connection diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index d2cc4efce1..86f3200dd0 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -176,9 +176,21 @@ def _ocrd_download_timeout_parser(val): default=(True, '')) config.add("OCRD_NETWORK_RABBITMQ_CLIENT_CONNECT_ATTEMPTS", - description="Number of attempts for a RabbitMQ client to connect before failing.", + description="Number of attempts for a RabbitMQ client to connect before failing.", + parser=int, + default=(True, 3)) + +config.add( + name="OCRD_NETWORK_RABBITMQ_HEARTBEAT", + description=""" + Controls AMQP heartbeat timeout negotiation during connection tuning. An integer value always overrides the value + proposed by broker. Use 0 to deactivate heartbeats and None to always accept the broker's proposal. If a callable + is given, it will be called with the connection instance and the heartbeat timeout proposed by broker as its + arguments. The callback should return a non-negative integer that will be used to override the broker's proposal. 
+ """, parser=int, - default=(True, 3)) + default=(True, 0) +) config.add(name="OCRD_NETWORK_SOCKETS_ROOT_DIR", description="The root directory where all mets server related socket files are created", diff --git a/tests/network/config.py b/tests/network/config.py index e22cc6ce9d..c316202f1c 100644 --- a/tests/network/config.py +++ b/tests/network/config.py @@ -89,11 +89,21 @@ test_config.add( name="OCRD_NETWORK_RABBITMQ_CLIENT_CONNECT_ATTEMPTS", + description="Number of attempts for a RabbitMQ client to connect before failing", + parser=int, + default=(True, 3) +) + +test_config.add( + name="OCRD_NETWORK_RABBITMQ_HEARTBEAT", description=""" - Number of attempts for a RabbitMQ client to connect before failing + Controls AMQP heartbeat timeout negotiation during connection tuning. An integer value always overrides the value + proposed by broker. Use 0 to deactivate heartbeats and None to always accept the broker's proposal. If a callable + is given, it will be called with the connection instance and the heartbeat timeout proposed by broker as its + arguments. The callback should return a non-negative integer that will be used to override the broker's proposal. """, parser=int, - default=(True, 3) + default=(True, 0) ) test_config.add( From a295b0c29d2951c4e5f1a0bb572fd060fc12a5a7 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 9 Oct 2024 16:32:57 +0200 Subject: [PATCH 102/119] deps-torch: also install torchvision --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b5cd2f276e..26524a9fa6 100644 --- a/Makefile +++ b/Makefile @@ -158,7 +158,7 @@ deps-tf2: fi deps-torch: - $(PIP) install -i https://download.pytorch.org/whl/cu118 torch + $(PIP) install -i https://download.pytorch.org/whl/cu118 torch torchvision # Dependencies for deployment in an ubuntu/debian linux deps-ubuntu: From c5c60fde3c3879a3572772843ba583af4b22065d Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 9 Oct 2024 17:08:26 +0200 Subject: [PATCH 103/119] fix: empty -> text --- src/ocrd/mets_server.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 774560a197..f54d0672c6 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -157,13 +157,13 @@ def save(self): Request writing the changes to the file system """ if not self.multiplexing_mode: - self.session.request("PUT", url=self.url) + return self.session.request("PUT", url=self.url).text else: - self.session.request( + return self.session.request( "POST", self.url, json=MpxReq.save(self.ws_dir_path) - ) + ).json()["text"] def stop(self): """ @@ -171,14 +171,13 @@ def stop(self): """ try: if not self.multiplexing_mode: - self.session.request("DELETE", self.url) - return + return self.session.request("DELETE", self.url).text else: - self.session.request( + return self.session.request( "POST", self.url, json=MpxReq.stop(self.ws_dir_path) - ) + ).json()["text"] except ConnectionError: # Expected because we exit the process without returning pass @@ -348,12 +347,12 @@ def __args_wrapper( @staticmethod def save(ws_dir_path: str) -> Dict: return MpxReq.__args_wrapper( - ws_dir_path, method_type="PUT", response_type="empty", request_url="", request_data={}) + ws_dir_path, method_type="PUT", response_type="text", request_url="", request_data={}) @staticmethod def stop(ws_dir_path: str) -> Dict: return MpxReq.__args_wrapper( - ws_dir_path, method_type="DELETE", 
response_type="empty", request_url="", request_data={}) + ws_dir_path, method_type="DELETE", response_type="text", request_url="", request_data={}) @staticmethod def reload(ws_dir_path: str) -> Dict: From e1b97840a6a7d45b4d5b70501d04349ed975e612 Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 9 Oct 2024 17:36:31 +0200 Subject: [PATCH 104/119] deployer: remove METS Server path and url from their resp. caches on stopping --- src/ocrd_network/runtime_data/deployer.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/ocrd_network/runtime_data/deployer.py b/src/ocrd_network/runtime_data/deployer.py index 57b6d90819..eae0cd21d3 100644 --- a/src/ocrd_network/runtime_data/deployer.py +++ b/src/ocrd_network/runtime_data/deployer.py @@ -154,7 +154,7 @@ def start_uds_mets_server(self, ws_dir_path: str) -> Path: "Removing to avoid any weird behavior before starting the server.") Path(mets_server_url).unlink() self.log.info(f"Starting UDS mets server: {mets_server_url}") - pid = OcrdMetsServer.create_process(mets_server_url=mets_server_url, ws_dir_path=ws_dir_path, log_file=log_file) + pid = OcrdMetsServer.create_process(mets_server_url=str(mets_server_url), ws_dir_path=str(ws_dir_path), log_file=str(log_file)) self.mets_servers[str(mets_server_url)] = pid self.mets_servers_paths[str(ws_dir_path)] = str(mets_server_url) return mets_server_url @@ -164,8 +164,9 @@ def stop_uds_mets_server(self, mets_server_url: str, path_to_mets: str, stop_wit self.log.info(f"Path to the mets file: {path_to_mets}") self.log.debug(f"mets_server: {self.mets_servers}") self.log.debug(f"mets_server_paths: {self.mets_servers_paths}") + workspace_path = str(Path(path_to_mets).parent) + mets_server_url_uds = self.mets_servers_paths[workspace_path] if stop_with_pid: - mets_server_url_uds = self.mets_servers_paths[str(Path(path_to_mets).parent)] if Path(mets_server_url_uds) not in self.mets_servers: message = f"UDS Mets server not found at URL: {mets_server_url_uds}, mets path: {path_to_mets}" self.log.warning(message) @@ -176,6 +177,8 @@ def stop_uds_mets_server(self, mets_server_url: str, path_to_mets: str, stop_wit if Path(mets_server_url_uds).exists(): self.log.warning(f"Deployer is removing the existing UDS socket file: {mets_server_url_uds}") Path(mets_server_url_uds).unlink() + del self.mets_servers_paths[workspace_path] + del self.mets_servers[mets_server_url_uds] self.log.info(f"Returning from the stop_uds_mets_server") return # TODO: Reconsider this again @@ -183,13 +186,15 @@ def stop_uds_mets_server(self, mets_server_url: str, path_to_mets: str, stop_wit # on the last request processed by the processing worker. # Sometimes 3 seconds is enough, sometimes not. 
sleep(5) - mets_server_pid = self.mets_servers[str(self.mets_servers_paths[str(Path(path_to_mets).parent)])] + mets_server_pid = self.mets_servers[mets_server_url_uds] self.log.info(f"Terminating mets server with pid: {mets_server_pid}") p = psutil.Process(mets_server_pid) - stop_mets_server(self.log, mets_server_url=mets_server_url, ws_dir_path=str(Path(path_to_mets).parent)) + stop_mets_server(self.log, mets_server_url=mets_server_url, ws_dir_path=workspace_path) if p.is_running(): p.wait() self.log.info(f"Terminated mets server with pid: {mets_server_pid}") else: self.log.info(f"Mets server with pid: {mets_server_pid} has already terminated.") + del self.mets_servers_paths[workspace_path] + del self.mets_servers[mets_server_url_uds] return From d39c3d716917239f2db25550f0be3f5c48ae2768 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 10 Oct 2024 12:15:43 +0200 Subject: [PATCH 105/119] kill_mets_server_zombies: actually return List[int] --- src/ocrd_network/server_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd_network/server_utils.py b/src/ocrd_network/server_utils.py index 773668f5b7..2560dbbb03 100644 --- a/src/ocrd_network/server_utils.py +++ b/src/ocrd_network/server_utils.py @@ -264,8 +264,8 @@ def kill_mets_server_zombies(minutes_ago=60) -> List[int]: continue cmdline = cmdline_file.read_text().replace('\x00', ' ') if re.match(cmdline_pat, cmdline): - pid = procdir.name + pid = int(procdir.name) ret.append(pid) print(f'METS Server with PID {pid} was created {ctime_ago} minutes ago, more than {minutes_ago}, so killing (cmdline="{cmdline})', file=sys.stderr) - os.kill(int(pid), signal.SIGTERM) + os.kill(pid, signal.SIGTERM) return ret From 7512bd68f1b2e06ad8a62603c222b10624988a7f Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 10 Oct 2024 12:16:21 +0200 Subject: [PATCH 106/119] kill_mets_server_zombies: allow dry_run to test --- src/ocrd_network/processing_server.py | 6 +++--- src/ocrd_network/server_utils.py | 14 +++++++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/ocrd_network/processing_server.py b/src/ocrd_network/processing_server.py index 505e106ba2..336d04f0d9 100644 --- a/src/ocrd_network/processing_server.py +++ b/src/ocrd_network/processing_server.py @@ -1,7 +1,7 @@ from datetime import datetime from os import getpid from pathlib import Path -from typing import Dict, List, Union +from typing import Dict, List, Optional, Union from uvicorn import run as uvicorn_run from fastapi import APIRouter, FastAPI, File, HTTPException, Request, status, UploadFile @@ -826,8 +826,8 @@ async def get_workflow_info(self, workflow_job_id) -> Dict: response = self._produce_workflow_status_response(processing_jobs=jobs) return response - async def kill_mets_server_zombies(self) -> List[int]: - pids_killed = kill_mets_server_zombies(minutes_ago=60) + async def kill_mets_server_zombies(self, minutes_ago : Optional[int] = None, dry_run : Optional[bool] = None) -> List[int]: + pids_killed = kill_mets_server_zombies(minutes_ago=minutes_ago, dry_run=dry_run) return pids_killed async def get_workflow_info_simple(self, workflow_job_id) -> Dict[str, JobState]: diff --git a/src/ocrd_network/server_utils.py b/src/ocrd_network/server_utils.py index 2560dbbb03..6e485f261f 100644 --- a/src/ocrd_network/server_utils.py +++ b/src/ocrd_network/server_utils.py @@ -4,7 +4,7 @@ from pathlib import Path from json import dumps, loads from urllib.parse import urljoin -from typing import Dict, List, Union +from typing import Dict, List, Optional, Union 
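# A self-contained sketch of the /proc scan that kill_mets_server_zombies performs,
# including the dry_run switch added in this commit: read each process's cmdline
# (NUL-separated), keep those matching a METS-Server-like pattern that are older
# than the threshold, and SIGTERM them unless dry_run is set. The pattern and the
# default age below are illustrative, not the exact values used in server_utils.
import os
import re
import signal
from pathlib import Path
from time import time
from typing import List

def find_stale(pattern: str, older_than_minutes: int = 90, dry_run: bool = True) -> List[int]:
    stale = []
    now = time()
    for procdir in Path("/proc").glob("[0-9]*"):
        try:
            age_minutes = (now - procdir.stat().st_ctime) / 60
            if age_minutes < older_than_minutes:
                continue
            cmdline = (procdir / "cmdline").read_text().replace("\x00", " ")
        except OSError:
            continue  # process vanished while we were looking at it
        if re.match(pattern, cmdline):
            pid = int(procdir.name)
            stale.append(pid)
            if not dry_run:
                os.kill(pid, signal.SIGTERM)
    return stale

# e.g. find_stale(r".*ocrd workspace -U.*server start", older_than_minutes=90, dry_run=True)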
from time import time from fastapi import HTTPException, status, UploadFile @@ -249,7 +249,12 @@ def validate_first_task_input_file_groups_existence(logger: Logger, mets_path: s raise_http_exception(logger, status.HTTP_422_UNPROCESSABLE_ENTITY, message) -def kill_mets_server_zombies(minutes_ago=60) -> List[int]: +def kill_mets_server_zombies(minutes_ago : Optional[int], dry_run : Optional[bool]) -> List[int]: + if minutes_ago == None: + minutes_ago = 90 + if dry_run == None: + dry_run = False + now = time() cmdline_pat = r'.*ocrd workspace -U.*server start $' ret = [] @@ -267,5 +272,8 @@ def kill_mets_server_zombies(minutes_ago=60) -> List[int]: pid = int(procdir.name) ret.append(pid) print(f'METS Server with PID {pid} was created {ctime_ago} minutes ago, more than {minutes_ago}, so killing (cmdline="{cmdline})', file=sys.stderr) - os.kill(pid, signal.SIGTERM) + if dry_run: + print(f'[dry_run is active] kill {pid}') + else: + os.kill(pid, signal.SIGTERM) return ret From e40ed798fe462468635161433ee4cb55574c9d5a Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 10 Oct 2024 12:35:14 +0200 Subject: [PATCH 107/119] :memo: changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 34ec973570..80868a6eb5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,11 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +Added: + + - `ocrd network client workflow run`: Add `--print-status` flag to periodically print the job status, #1277 + - Processing Server: `DELETE /mets_server_zombies` to kill any renegade METS servers, #1277 + ## [2.69.0] - 2024-09-30 Fixed: From 7f605591ac373664cc225634e22877797fcffb40 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 10 Oct 2024 12:41:57 +0200 Subject: [PATCH 108/119] Simplify description for OCRD_NETWORK_RABBITMQ_HEARTBEAT --- src/ocrd_utils/config.py | 6 ++---- tests/network/config.py | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index 86f3200dd0..f191389799 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -183,10 +183,8 @@ def _ocrd_download_timeout_parser(val): config.add( name="OCRD_NETWORK_RABBITMQ_HEARTBEAT", description=""" - Controls AMQP heartbeat timeout negotiation during connection tuning. An integer value always overrides the value - proposed by broker. Use 0 to deactivate heartbeats and None to always accept the broker's proposal. If a callable - is given, it will be called with the connection instance and the heartbeat timeout proposed by broker as its - arguments. The callback should return a non-negative integer that will be used to override the broker's proposal. + Controls AMQP heartbeat timeout (in seconds) negotiation during connection tuning. An integer value always overrides the value + proposed by broker. Use 0 to deactivate heartbeat. """, parser=int, default=(True, 0) diff --git a/tests/network/config.py b/tests/network/config.py index c316202f1c..611ad63821 100644 --- a/tests/network/config.py +++ b/tests/network/config.py @@ -97,10 +97,8 @@ test_config.add( name="OCRD_NETWORK_RABBITMQ_HEARTBEAT", description=""" - Controls AMQP heartbeat timeout negotiation during connection tuning. An integer value always overrides the value - proposed by broker. Use 0 to deactivate heartbeats and None to always accept the broker's proposal. 
If a callable
-    is given, it will be called with the connection instance and the heartbeat timeout proposed by broker as its
-    arguments. The callback should return a non-negative integer that will be used to override the broker's proposal.
+    Controls AMQP heartbeat timeout (in seconds) negotiation during connection tuning. An integer value always overrides the value
+    proposed by broker. Use 0 to deactivate heartbeat.
     """,
     parser=int,
     default=(True, 0)

From 02c6effb2626ac585760dee4fbaa998fbdb01df1 Mon Sep 17 00:00:00 2001
From: kba
Date: Thu, 10 Oct 2024 12:58:36 +0200
Subject: [PATCH 109/119] :memo: changelog

---
 CHANGELOG.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 80868a6eb5..fe8b5508d1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,10 @@ Added:
  - `ocrd network client workflow run`: Add `--print-status` flag to periodically print the job status, #1277
  - Processing Server: `DELETE /mets_server_zombies` to kill any renegade METS servers, #1277
 
+Fixed:
+
+ - `ocrd/core-cuda-torch`: Install torchvision as well, #1286
+
 ## [2.69.0] - 2024-09-30

From 88707ca9a8646cddcc23d10e3eee5a9fcde38280 Mon Sep 17 00:00:00 2001
From: kba
Date: Thu, 10 Oct 2024 13:09:23 +0200
Subject: [PATCH 110/119] :memo: changelog

---
 CHANGELOG.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index fe8b5508d1..8b8e66fd0c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,10 +9,17 @@ Added:
 
  - `ocrd network client workflow run`: Add `--print-status` flag to periodically print the job status, #1277
  - Processing Server: `DELETE /mets_server_zombies` to kill any renegade METS servers, #1277
+ - No more zombie METS Servers by properly shutting them down, #1284
+ - `OCRD_NETWORK_RABBITMQ_HEARTBEAT` to allow overriding the [heartbeat](https://pika.readthedocs.io/en/stable/examples/heartbeat_and_blocked_timeouts.html) behavior of RabbitMQ, #1285
+
+Changed:
+
+ - significantly more detailed logging for the METS Server and Processing Server, #1284
 
 Fixed:
 
  - `ocrd/core-cuda-torch`: Install torchvision as well, #1286
+ - Processing Server: remove shut down METS servers from deployer's cache, #1287
 
 ## [2.69.0] - 2024-09-30

From cb8d7874806b489deca04f2bbe7215bc82cbb974 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Thu, 10 Oct 2024 01:03:46 +0000
Subject: [PATCH 111/119] CLI decorator: only import ocrd_network when needed

---
 src/ocrd/decorators/__init__.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py
index 464bb67ed8..bc969b3279 100644
--- a/src/ocrd/decorators/__init__.py
+++ b/src/ocrd/decorators/__init__.py
@@ -13,7 +13,6 @@
     redirect_stderr_and_stdout_to_file,
 )
 from ocrd_validators import WorkspaceValidator
-from ocrd_network import ProcessingWorker, ProcessorServer, AgentType
 
 from ..resolver import Resolver
 from ..processor.base import ResourceNotFoundError, run_processor
@@ -23,8 +22,6 @@
 from .ocrd_cli_options import ocrd_cli_options
 from .mets_find_options import mets_find_options
 
-SUBCOMMANDS = [AgentType.PROCESSING_WORKER, AgentType.PROCESSOR_SERVER]
-
 
 def ocrd_cli_wrap_processor(
     processorClass,
@@ -66,11 +63,9 @@ def ocrd_cli_wrap_processor(
             list_resources=list_resources
         )
         sys.exit()
-    if subcommand:
+    if subcommand or address or queue or database:
         # Used for checking/starting network agents for the WebAPI architecture
         check_and_run_network_agent(processorClass, subcommand, address, database, queue)
-    elif address
or queue or database: - raise ValueError(f"Subcommand options --address --queue and --database are only valid for subcommands: {SUBCOMMANDS}") initLogging() @@ -164,6 +159,11 @@ def goexit(): def check_and_run_network_agent(ProcessorClass, subcommand: str, address: str, database: str, queue: str): """ """ + from ocrd_network import ProcessingWorker, ProcessorServer, AgentType + SUBCOMMANDS = [AgentType.PROCESSING_WORKER, AgentType.PROCESSOR_SERVER] + + if not subcommand: + raise ValueError(f"Subcommand options --address --queue and --database are only valid for subcommands: {SUBCOMMANDS}") if subcommand not in SUBCOMMANDS: raise ValueError(f"SUBCOMMAND can only be one of {SUBCOMMANDS}") From 94e6d2c63351b55ee7c44bfa68ed6b36ef958ab8 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 10 Oct 2024 13:10:08 +0200 Subject: [PATCH 112/119] :memo: changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8b8e66fd0c..6fc7bbb1aa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ Fixed: - `ocrd/core-cuda-torch`: Install torchvision as well, #1286 - Processing Server: remove shut down METS servers from deployer's cache, #1287 + - typos, #1274 ## [2.69.0] - 2024-09-30 From e5cdbe930dc6e7d4e8873e838617c913d2ab1ed2 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Thu, 10 Oct 2024 14:22:05 +0200 Subject: [PATCH 113/119] deps-cuda: retry if micromamba is unresponsive --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 9ad35e1c20..23a6b438a5 100644 --- a/Makefile +++ b/Makefile @@ -63,7 +63,7 @@ deps-cuda: CONDA_EXE ?= /usr/local/bin/conda deps-cuda: export CONDA_PREFIX ?= /conda deps-cuda: PYTHON_PREFIX != $(PYTHON) -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])' deps-cuda: - curl -Ls https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj bin/micromamba + curl --retry 3 -Ls https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj bin/micromamba mv bin/micromamba $(CONDA_EXE) # Install Conda system-wide (for interactive / login shells) echo 'export MAMBA_EXE=$(CONDA_EXE) MAMBA_ROOT_PREFIX=$(CONDA_PREFIX) CONDA_PREFIX=$(CONDA_PREFIX) PATH=$(CONDA_PREFIX)/bin:$$PATH' >> /etc/profile.d/98-conda.sh From 80c0c6f7e7c5c5de8807fe641e4bb452c68cd501 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 10 Oct 2024 15:17:58 +0200 Subject: [PATCH 114/119] :memo: changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6fc7bbb1aa..d2912a303d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,9 @@ Added: Changed: - significantly more detailed logging for the METS Server and Processing Server, #1284 + - Only import `ocrd_network` in src/ocrd/decorators/__init__.py once needed, #1289 + + Fixed: From 7b1d17296231a14fb160f9638c7d5da05217298d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Thu, 10 Oct 2024 16:05:55 +0200 Subject: [PATCH 115/119] create PyPI CD --- .github/workflows/publish-pypi.yml | 31 ++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 .github/workflows/publish-pypi.yml diff --git a/.github/workflows/publish-pypi.yml b/.github/workflows/publish-pypi.yml new file mode 100644 index 0000000000..9228685ffe --- /dev/null +++ b/.github/workflows/publish-pypi.yml @@ -0,0 +1,31 @@ +# This workflow will upload a Python Package using Twine when a release is created +# For more information 
see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries + +name: Upload Python Package + +on: + release: + types: [published] + workflow_dispatch: + +jobs: + deploy: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.8' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install setuptools wheel build twine + pip install -r requirements.txt + - name: Build and publish + env: + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + run: make pypi pypi-workaround From 7750f3f04b99cc488ee1764c9939885c0ae84d14 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 10 Oct 2024 16:14:42 +0200 Subject: [PATCH 116/119] :memo: changelog --- CHANGELOG.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d2912a303d..dcf7bbfba5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,8 +16,7 @@ Changed: - significantly more detailed logging for the METS Server and Processing Server, #1284 - Only import `ocrd_network` in src/ocrd/decorators/__init__.py once needed, #1289 - - + - Automate release via GitHub Actions, #1290 Fixed: From 012ccf6af1cbc7cd377c4564b436e71dceff2fa5 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 10 Oct 2024 16:16:00 +0200 Subject: [PATCH 117/119] :package: v2.70.0 --- CHANGELOG.md | 3 +++ VERSION | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dcf7bbfba5..4b90a57a24 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +## [2.70.0] - 2024-10-10 + Added: - `ocrd network client workflow run`: Add `--print-status` flag to periodically print the job status, #1277 @@ -2214,6 +2216,7 @@ Fixed Initial Release +[2.70.0]: ../../compare/v2.70.0..v2.69.0 [2.69.0]: ../../compare/v2.69.0..v2.68.0 [2.68.0]: ../../compare/v2.68.0..v2.67.2 [2.67.2]: ../../compare/v2.67.2..v2.67.1 diff --git a/VERSION b/VERSION index a740b92f5e..38a7743781 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.69.0 +2.70.0 From a8e2c6488b819c08dc96092da04a103bc77b0593 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Thu, 10 Oct 2024 16:38:39 +0200 Subject: [PATCH 118/119] deps-cuda: retry micro.mamba.pm even more --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 23a6b438a5..1708caa129 100644 --- a/Makefile +++ b/Makefile @@ -63,7 +63,7 @@ deps-cuda: CONDA_EXE ?= /usr/local/bin/conda deps-cuda: export CONDA_PREFIX ?= /conda deps-cuda: PYTHON_PREFIX != $(PYTHON) -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])' deps-cuda: - curl --retry 3 -Ls https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj bin/micromamba + curl --retry 6 -Ls https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj bin/micromamba mv bin/micromamba $(CONDA_EXE) # Install Conda system-wide (for interactive / login shells) echo 'export MAMBA_EXE=$(CONDA_EXE) MAMBA_ROOT_PREFIX=$(CONDA_PREFIX) CONDA_PREFIX=$(CONDA_PREFIX) PATH=$(CONDA_PREFIX)/bin:$$PATH' >> /etc/profile.d/98-conda.sh From 85bde1574293ea8b7ba29255fbb8e07312c28eb1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Thu, 10 Oct 2024 16:41:26 +0200 Subject: [PATCH 119/119] PyPI: do not 
upload deprecated distribution aliases anymore --- .github/workflows/publish-pypi.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish-pypi.yml b/.github/workflows/publish-pypi.yml index 9228685ffe..e811c958ab 100644 --- a/.github/workflows/publish-pypi.yml +++ b/.github/workflows/publish-pypi.yml @@ -28,4 +28,4 @@ jobs: env: TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} - run: make pypi pypi-workaround + run: make pypi