From a87e5dd5aa1b329307d1425b7ac38a9ea71ee972 Mon Sep 17 00:00:00 2001 From: Michael Wood Date: Tue, 21 Oct 2025 15:06:11 +0100 Subject: [PATCH] manage_entities_data: Implement publisher output This requires making certain fields optional while grantnav transitions away from using publisher data tacked on to the funder object. It removes the need for grantnav to use a provenance file for publisher data and, once grantnav is updated to use publishers.jl, fixes the issue of duplicate publishers from different datagetter runs, which was caused by tacking the publisher data on to funders. Note: this includes a database migration that moves a further common field (non_primary_org_ids) into the Entity class. --- .../commands/create_data_package.py | 4 +++ .../commands/manage_entities_data.py | 28 +++++++++++---- ..._publisher_non_primary_org_ids_and_more.py | 35 +++++++++++++++++++ datastore/db/models.py | 5 +-- 4 files changed, 62 insertions(+), 10 deletions(-) create mode 100644 datastore/db/migrations/0025_publisher_non_primary_org_ids_and_more.py diff --git a/datastore/db/management/commands/create_data_package.py b/datastore/db/management/commands/create_data_package.py index 57833eae..1cc48c8a 100644 --- a/datastore/db/management/commands/create_data_package.py +++ b/datastore/db/management/commands/create_data_package.py @@ -55,6 +55,7 @@ def handle(self, *args, **options): data_all_file = "%s/data_all.json" % options["dir"] recipients_file = "%s/recipients.jl" % options["dir"] funders_file = "%s/funders.jl" % options["dir"] + publishers_file = "%s/publishers.jl" % options["dir"] with open(funders_file, "w") as funders_fp: create_orgs_list("funder", funders_fp) @@ -62,6 +63,9 @@ def handle(self, *args, **options): with open(recipients_file, "w") as recipients_fp: create_orgs_list("recipient", recipients_fp) + with open(publishers_file, "w") as publishers_fp: + create_orgs_list("publisher", publishers_fp) + def flatten_grant(in_grant): """Add the additional_data inside grant object""" out_grant = {} diff --git 
a/datastore/db/management/commands/manage_entities_data.py b/datastore/db/management/commands/manage_entities_data.py index 3ab0c654..83821c96 100644 --- a/datastore/db/management/commands/manage_entities_data.py +++ b/datastore/db/management/commands/manage_entities_data.py @@ -87,19 +87,35 @@ def create_orgs_list(entity_type, output=sys.stdout): entity_type: publisher, recipient, funder output: io """ + + extra_select = "" + end_clause = "" + + if entity_type == "publisher": + extra_select = "db_publisher.prefix," + # Limit the publisher entities to only the latest ones from the datagetter + end_clause = "WHERE db_publisher.getter_run_id = (SELECT id FROM db_getterrun ORDER BY datetime desc LIMIT 1)" + + # TODO To be removed when GN switch over to publisher org data + if entity_type == "funder": + extra_select = """ + db_publisher.name as "publisherName", + db_publisher.prefix as "publisherPrefix", + """ + end_clause = "LEFT OUTER JOIN db_publisher on db_funder.org_id = db_publisher.org_id OR db_publisher.org_id = ANY(db_funder.non_primary_org_ids)" + query = f""" SELECT DISTINCT db_{entity_type}.org_id as "id", + {extra_select} db_{entity_type}.non_primary_org_ids as "non_primary_org_ids", db_{entity_type}.name as name, db_{entity_type}."aggregate" as "aggregate", db_{entity_type}.additional_data as "additionalData", - additional_data_orginfocache.data as "ftcData", - db_publisher.name as "publisherName", - db_publisher.prefix as "publisherPrefix" + additional_data_orginfocache.data as "ftcData" FROM db_{entity_type} LEFT OUTER JOIN additional_data_orginfocache on db_{entity_type}.org_id = additional_data_orginfocache.org_id - LEFT OUTER JOIN db_publisher on db_{entity_type}.org_id = db_publisher.org_id OR db_publisher.org_id = ANY(db_{entity_type}.non_primary_org_ids) + {end_clause} """ def parse_data_in_result(result, col_types): @@ -132,7 +148,7 @@ def add_arguments(self, parser): nargs="+", action="store", dest="entity_type", - help="The entity type to 
output. One of: recipient, funder", + help="The entity type to output. One of: recipient, funder or publisher", ) parser.add_argument( @@ -149,7 +165,7 @@ def handle(self, *args, **options): if options.get("entity_type"): for entity_type in options["entity_type"]: - if entity_type != "recipient" and entity_type != "funder": + if entity_type not in ["recipient", "funder", "publisher"]: raise CommandError(f"{entity_type} is an unknown entity type") create_orgs_list(entity_type) diff --git a/datastore/db/migrations/0025_publisher_non_primary_org_ids_and_more.py b/datastore/db/migrations/0025_publisher_non_primary_org_ids_and_more.py new file mode 100644 index 00000000..df357fa0 --- /dev/null +++ b/datastore/db/migrations/0025_publisher_non_primary_org_ids_and_more.py @@ -0,0 +1,35 @@ +# Generated by Django 4.2.21 on 2025-10-21 13:36 + +import django.contrib.postgres.fields +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("db", "0024_auto_20240610_1847"), + ] + + operations = [ + migrations.AddField( + model_name="publisher", + name="non_primary_org_ids", + field=django.contrib.postgres.fields.ArrayField( + base_field=models.TextField(), default=list, size=None + ), + ), + migrations.AlterField( + model_name="funder", + name="non_primary_org_ids", + field=django.contrib.postgres.fields.ArrayField( + base_field=models.TextField(), default=list, size=None + ), + ), + migrations.AlterField( + model_name="recipient", + name="non_primary_org_ids", + field=django.contrib.postgres.fields.ArrayField( + base_field=models.TextField(), default=list, size=None + ), + ), + ] diff --git a/datastore/db/models.py b/datastore/db/models.py index 1af78ac5..a0c50686 100644 --- a/datastore/db/models.py +++ b/datastore/db/models.py @@ -252,6 +252,7 @@ class Meta: PUBLISHER = "PUBLISHER" SOURCES_CHOICES = [(GRANT, "Grant"), (PUBLISHER, "Publisher")] source = models.TextField(choices=SOURCES_CHOICES) + non_primary_org_ids = 
ArrayField(models.TextField(), default=list) def __str__(self): return "%s %s)" % (self.org_id, self.name) @@ -400,8 +401,6 @@ class Meta: Index(fields=["org_id", "name"]), ] - non_primary_org_ids = ArrayField(models.TextField()) - class Funder(Entity): class Meta: @@ -413,8 +412,6 @@ class Meta: Index(fields=["org_id", "name"]), ] - non_primary_org_ids = ArrayField(models.TextField()) - class Grant(models.Model): grant_id = models.CharField(max_length=300)