From c9f728bc4bfa941e3d0d59d86ae1523daf6d9608 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Sat, 6 Dec 2025 18:26:03 -0500 Subject: [PATCH 01/25] add checksum URI values and methods --- .../edu/harvard/iq/dataverse/DataFile.java | 33 +++++++++++++++---- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DataFile.java b/src/main/java/edu/harvard/iq/dataverse/DataFile.java index 45604a5472b..8a08cd15029 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DataFile.java +++ b/src/main/java/edu/harvard/iq/dataverse/DataFile.java @@ -109,18 +109,22 @@ public class DataFile extends DvObject implements Comparable { * The list of types should be limited to the list above in the technote * because the string gets passed into MessageDigest.getInstance() and you * can't just pass in any old string. + * + * The URIs are used in the OAI_ORE export. They are taken from the associated XML Digital Signature standards. */ public enum ChecksumType { - MD5("MD5"), - SHA1("SHA-1"), - SHA256("SHA-256"), - SHA512("SHA-512"); + MD5("MD5", "http://www.w3.org/2001/04/xmldsig-more#md5"), + SHA1("SHA-1", "http://www.w3.org/2000/09/xmldsig#sha1"), + SHA256("SHA-256", "http://www.w3.org/2001/04/xmlenc#sha256"), + SHA512("SHA-512", "http://www.w3.org/2001/04/xmlenc#sha512"); private final String text; + private final String uri; - private ChecksumType(final String text) { + private ChecksumType(final String text, final String uri) { this.text = text; + this.uri = uri; } public static ChecksumType fromString(String text) { @@ -131,13 +135,30 @@ public static ChecksumType fromString(String text) { } } } - throw new IllegalArgumentException("ChecksumType must be one of these values: " + Arrays.asList(ChecksumType.values()) + "."); + throw new IllegalArgumentException( + "ChecksumType must be one of these values: " + Arrays.asList(ChecksumType.values()) + "."); + } + + public static ChecksumType fromUri(String uri) { + if (uri != null) { + for 
(ChecksumType checksumType : ChecksumType.values()) { + if (uri.equals(checksumType.uri)) { + return checksumType; + } + } + } + throw new IllegalArgumentException( + "ChecksumType must be one of these values: " + Arrays.asList(ChecksumType.values()) + "."); } @Override public String toString() { return text; } + + public String toUri() { + return uri; + } } //@Expose From a25e47b12cdd4fcb0050a69f0119e9abf4c59183 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Sat, 6 Dec 2025 18:26:24 -0500 Subject: [PATCH 02/25] update version and use checksum URIs --- src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java index 4cbc2aa7b9a..aa011e2c70a 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java @@ -49,7 +49,7 @@ public class OREMap { public static final String NAME = "OREMap"; //NOTE: Update this value whenever the output of this class is changed - private static final String DATAVERSE_ORE_FORMAT_VERSION = "Dataverse OREMap Format v1.0.1"; + private static final String DATAVERSE_ORE_FORMAT_VERSION = "Dataverse OREMap Format v1.0.2"; //v1.0.1 - added versionNote private static final String DATAVERSE_SOFTWARE_NAME = "Dataverse"; private static final String DATAVERSE_SOFTWARE_URL = "https://github.com/iqss/dataverse"; @@ -280,7 +280,7 @@ public JsonObjectBuilder getOREMapBuilder(boolean aggregationOnly) { JsonObject checksum = null; // Add checksum. 
RDA recommends SHA-512 if (df.getChecksumType() != null && df.getChecksumValue() != null) { - checksum = Json.createObjectBuilder().add("@type", df.getChecksumType().toString()) + checksum = Json.createObjectBuilder().add("@type", df.getChecksumType().toUri()) .add("@value", df.getChecksumValue()).build(); aggRes.add(JsonLDTerm.checksum.getLabel(), checksum); } From 6c0cb49513f7748cf6cf026d0b9892005820fbb5 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Sat, 6 Dec 2025 18:26:48 -0500 Subject: [PATCH 03/25] handle multiline descriptions and org names --- .../iq/dataverse/util/bagit/BagGenerator.java | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index f24ebdb8655..69e9c686133 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -548,7 +548,7 @@ private void processContainer(JsonObject item, String currentPath) throws IOExce String childHash = null; if (child.has(JsonLDTerm.checksum.getLabel())) { - ChecksumType childHashType = ChecksumType.fromString( + ChecksumType childHashType = ChecksumType.fromUri( child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@type").getAsString()); if (hashtype == null) { //If one wasn't set as a default, pick up what the first child with one uses @@ -828,7 +828,7 @@ private String generateInfoFile() { // ToDo - make configurable info.append(CRLF); - info.append("Organization-Address: " + WordUtils.wrap(orgAddress, 78, CRLF + " ", true)); + info.append("Organization-Address: " + multilineWrap(orgAddress)); info.append(CRLF); @@ -846,10 +846,8 @@ private String generateInfoFile() { if (descriptionTerm == null) { logger.warning("No description available for BagIt Info file"); } else { - info.append( - // FixMe - handle description having subfields better - 
WordUtils.wrap(getSingleValue(aggregation.get(descriptionTerm.getLabel()), - descriptionTextTerm.getLabel()), 78, CRLF + " ", true)); + info.append(multilineWrap(getSingleValue(aggregation.get(descriptionTerm.getLabel()), + descriptionTextTerm.getLabel()))); info.append(CRLF); } @@ -883,6 +881,20 @@ private String generateInfoFile() { } + private String multilineWrap(String value) { + // Normalize line breaks and ensure all lines after the first are indented + String[] lines =value.split("\\r?\\n"); + StringBuilder wrappedValue = new StringBuilder(); + for (int i = 0; i < lines.length; i++) { + String wrapped = WordUtils.wrap(lines[i].trim(), 78, CRLF + " ", true); + wrappedValue.append(wrapped); + if (i < lines.length - 1) { + wrappedValue.append(CRLF).append(" "); + } + } + return wrappedValue.toString(); + } + /** * Kludge - compound values (e.g. for descriptions) are sent as an array of * objects containing key/values whereas a single value is sent as one object. From 7a34db8078b4f1605968163bf839267bdd9e5d19 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Tue, 9 Dec 2025 10:01:09 -0500 Subject: [PATCH 04/25] drop blank lines in multiline values Spec doesn't allow empty lines, dropping whitespace-only lines seems reasonable as well (users can't see from the Dataverse display whether an empty line would appear in bag-info.txt or not if we all whotespace only lines (or whitespace beyond the 78 char wrap limit) --- .../iq/dataverse/util/bagit/BagGenerator.java | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 69e9c686133..cf5bea08d99 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -886,10 +886,15 @@ private String multilineWrap(String value) { String[] lines =value.split("\\r?\\n"); 
StringBuilder wrappedValue = new StringBuilder(); for (int i = 0; i < lines.length; i++) { - String wrapped = WordUtils.wrap(lines[i].trim(), 78, CRLF + " ", true); - wrappedValue.append(wrapped); - if (i < lines.length - 1) { - wrappedValue.append(CRLF).append(" "); + // Skip empty lines - RFC8493 (section 7.3) doesn't allow truly empty lines, + // While trailing whitespace or whitespace-only lines appear to be allowed, it's not clear that handling them adds value (visually identical entries in Dataverse could result in entries w/ or w/o extra lines in the bag-info.txt file + String line = lines[i].trim(); + if (line.length() > 0) { + String wrapped = WordUtils.wrap(line, 78, CRLF + " ", true); + wrappedValue.append(wrapped); + if (i < lines.length - 1) { + wrappedValue.append(CRLF).append(" "); + } } } return wrappedValue.toString(); From b0daad7393a5663b5244ac89e04b0de9c630f9bf Mon Sep 17 00:00:00 2001 From: qqmyers Date: Tue, 9 Dec 2025 10:02:01 -0500 Subject: [PATCH 05/25] remove title as a folder affects manifest and pid-mapping files as well as data file placement --- .../edu/harvard/iq/dataverse/util/bagit/BagGenerator.java | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index cf5bea08d99..31ae06677c3 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -482,14 +482,6 @@ public static String getValidName(String bagName) { private void processContainer(JsonObject item, String currentPath) throws IOException { JsonArray children = getChildren(item); HashSet titles = new HashSet(); - String title = null; - if (item.has(JsonLDTerm.dcTerms("Title").getLabel())) { - title = item.get("Title").getAsString(); - } else if (item.has(JsonLDTerm.schemaOrg("name").getLabel())) { - title = 
item.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(); - } - logger.fine("Adding " + title + "/ to path " + currentPath); - currentPath = currentPath + title + "/"; int containerIndex = -1; try { createDir(currentPath); From e5457a8026f4e2e311b2ef84bea7d60f9f8020b4 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Tue, 9 Dec 2025 10:02:19 -0500 Subject: [PATCH 06/25] handle null deaccession reason --- src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java index aa011e2c70a..426d5c9aa5f 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java @@ -130,7 +130,8 @@ public JsonObjectBuilder getOREMapBuilder(boolean aggregationOnly) { if(vs.equals(VersionState.DEACCESSIONED)) { JsonObjectBuilder deaccBuilder = Json.createObjectBuilder(); deaccBuilder.add(JsonLDTerm.schemaOrg("name").getLabel(), vs.name()); - deaccBuilder.add(JsonLDTerm.DVCore("reason").getLabel(), version.getDeaccessionNote()); + // Reason is supposed to not be null, but historically this has not been enforced (in the API) + addIfNotNull(deaccBuilder, JsonLDTerm.DVCore("reason"), version.getDeaccessionNote()); addIfNotNull(deaccBuilder, JsonLDTerm.DVCore("forwardUrl"), version.getDeaccessionLink()); aggBuilder.add(JsonLDTerm.schemaOrg("creativeWorkStatus").getLabel(), deaccBuilder); From 10b0556e1de1c52a9a9cf9a32c9a3c07582ce60a Mon Sep 17 00:00:00 2001 From: qqmyers Date: Wed, 10 Dec 2025 09:55:50 -0500 Subject: [PATCH 07/25] use static to simplify testing --- .../edu/harvard/iq/dataverse/util/bagit/BagGenerator.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java 
b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 31ae06677c3..4f3d0e00280 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -35,7 +35,6 @@ import java.util.logging.Logger; import java.util.zip.ZipEntry; -import edu.harvard.iq.dataverse.util.BundleUtil; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.compress.archivers.zip.ParallelScatterZipCreator; import org.apache.commons.compress.archivers.zip.ScatterZipOutputStream; @@ -77,7 +76,6 @@ import edu.harvard.iq.dataverse.settings.JvmSettings; import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.BagGeneratorThreads; import edu.harvard.iq.dataverse.util.json.JsonLDTerm; -import java.util.Optional; public class BagGenerator { @@ -873,7 +871,7 @@ private String generateInfoFile() { } - private String multilineWrap(String value) { + static private String multilineWrap(String value) { // Normalize line breaks and ensure all lines after the first are indented String[] lines =value.split("\\r?\\n"); StringBuilder wrappedValue = new StringBuilder(); From 6d241851d8860ddde6d6b1aac952c12ea426eb62 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Wed, 10 Dec 2025 13:49:17 -0500 Subject: [PATCH 08/25] Sanitize/split multiline catalog entry, add Dataverse-Bag-Version --- .../iq/dataverse/util/bagit/BagGenerator.java | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 4f3d0e00280..122ca0b6aba 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -77,6 +77,15 @@ import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.BagGeneratorThreads; import 
edu.harvard.iq.dataverse.util.json.JsonLDTerm; +/** + * Creates an archival zipped Bag for long-term storage. It is intended to + * include all the information needed to reconstruct the dataset version in a + * new Dataverse instance. + * + * Note that the Dataverse-Bag-Version written in the generateInfoFile() method + * should be updated any time the content/structure of the bag is changed. + * + */ public class BagGenerator { private static final Logger logger = Logger.getLogger(BagGenerator.class.getCanonicalName()); @@ -864,9 +873,13 @@ private String generateInfoFile() { if (aggregation.has(JsonLDTerm.schemaOrg("includedInDataCatalog").getLabel())) { catalog = aggregation.get(JsonLDTerm.schemaOrg("includedInDataCatalog").getLabel()).getAsString(); } - info.append(catalog + ":" + aggregation.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString()); + catalog=catalog.trim().replaceAll("[\\r\\n:]","_"); + info.append(catalog + ":" + multilineWrap(aggregation.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString())); info.append(CRLF); + //Add a version number for our bag type - should be updated with any change to the bag content/structure + info.append("Dataverse-Bag-Version: 1.0"); + info.append(CRLF); return info.toString(); } From c4daf28099d4f91705edbe94efcaeecf229ff274 Mon Sep 17 00:00:00 2001 From: Jan van Mansum Date: Thu, 11 Dec 2025 09:00:07 +0100 Subject: [PATCH 09/25] Added unit tests for multilineWrap --- .../bagit/BagGeneratorMultilineWrapTest.java | 102 ++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java diff --git a/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java new file mode 100644 index 00000000000..39a713c14e4 --- /dev/null +++ 
b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java @@ -0,0 +1,102 @@ +package edu.harvard.iq.dataverse.util.bagit; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +/** + * Tests adapted for DD-2093: verify the behavior of BagGenerator.multilineWrap. + */ +public class BagGeneratorMultilineWrapTest { + + private static Method multilineWrap; + + @BeforeAll + static void setUp() throws NoSuchMethodException { + // Access the private static method via reflection + multilineWrap = BagGenerator.class.getDeclaredMethod("multilineWrap", String.class); + multilineWrap.setAccessible(true); + } + + private String callMultilineWrap(String input) { + try { + return (String) multilineWrap.invoke(null, input); + } catch (IllegalAccessException | InvocationTargetException e) { + throw new RuntimeException(e); + } + } + + @Test + void shortLine_noWrap() { + String input = "Hello world"; + String out = callMultilineWrap(input); + assertThat(out).isEqualTo("Hello world"); + } + + @Test + void exactBoundary_78chars_noWrap() { + String input = repeat('a', 78); + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(input); + } + + @Test + void longSingleWord_wrapsAt78WithIndent() { + String input = repeat('a', 100); + String expected = repeat('a', 78) + "\r\n " + repeat('a', 22); + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void multiline_input_indentsSecondAndSubsequentOriginalLines() { + String input = "Line1\nLine2"; + String expected = "Line1\r\n Line2"; + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void multiline_withCRLF_normalizedAndIndented() { + String input = "First line\r\nSecond line"; + String expected = "First line\r\n Second line"; + String out 
= callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void emptyLines_trimmedAndSkipped() { + String input = "Line1\n\nLine3"; + String expected = "Line1\r\n Line3"; + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void whitespaceOnlyLines_ignored() { + String input = "Line1\n \n\t\t\nLine3"; + String expected = "Line1\r\n Line3"; + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void longSecondLine_preservesIndentOnWraps() { + String line1 = "Header"; + String line2 = repeat('b', 90); + String input = line1 + "\n" + line2; + String expected = "Header\r\n " + repeat('b', 78) + "\r\n " + repeat('b', 12); + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + private static String repeat(char c, int n) { + StringBuilder sb = new StringBuilder(n); + for (int i = 0; i < n; i++) sb.append(c); + return sb.toString(); + } +} From e76bc9135fabbbdd4cb79f8fea7ed98e518f57f8 Mon Sep 17 00:00:00 2001 From: Jan van Mansum Date: Thu, 11 Dec 2025 09:09:00 +0100 Subject: [PATCH 10/25] Removed unnecessary repeat helper method --- .../bagit/BagGeneratorMultilineWrapTest.java | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java index 39a713c14e4..a212cac6316 100644 --- a/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java @@ -39,15 +39,15 @@ void shortLine_noWrap() { @Test void exactBoundary_78chars_noWrap() { - String input = repeat('a', 78); + String input = "a".repeat(78); String out = callMultilineWrap(input); assertThat(out).isEqualTo(input); } @Test void longSingleWord_wrapsAt78WithIndent() { - String 
input = repeat('a', 100); - String expected = repeat('a', 78) + "\r\n " + repeat('a', 22); + String input = "a".repeat(100); + String expected = "a".repeat(78) + "\r\n " + "a".repeat(22); String out = callMultilineWrap(input); assertThat(out).isEqualTo(expected); } @@ -87,16 +87,10 @@ void whitespaceOnlyLines_ignored() { @Test void longSecondLine_preservesIndentOnWraps() { String line1 = "Header"; - String line2 = repeat('b', 90); + String line2 = "b".repeat(90); String input = line1 + "\n" + line2; - String expected = "Header\r\n " + repeat('b', 78) + "\r\n " + repeat('b', 12); + String expected = "Header\r\n " + "b".repeat(78) + "\r\n " + "b".repeat(12); String out = callMultilineWrap(input); assertThat(out).isEqualTo(expected); } - - private static String repeat(char c, int n) { - StringBuilder sb = new StringBuilder(n); - for (int i = 0; i < n; i++) sb.append(c); - return sb.toString(); - } } From 108c912ee037d23456650e6d5c49c5a943d5ef42 Mon Sep 17 00:00:00 2001 From: Jan van Mansum Date: Thu, 11 Dec 2025 09:17:42 +0100 Subject: [PATCH 11/25] Alined test names with actual test being done --- .../util/bagit/BagGeneratorMultilineWrapTest.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java index a212cac6316..71ceec61adf 100644 --- a/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java @@ -54,15 +54,15 @@ void longSingleWord_wrapsAt78WithIndent() { @Test void multiline_input_indentsSecondAndSubsequentOriginalLines() { - String input = "Line1\nLine2"; - String expected = "Line1\r\n Line2"; + String input = "Line1\nLine2\nLine3"; + String expected = "Line1\r\n Line2\r\n Line3"; String out = callMultilineWrap(input); 
assertThat(out).isEqualTo(expected); } @Test - void multiline_withCRLF_normalizedAndIndented() { - String input = "First line\r\nSecond line"; + void multiline_withLF_normalizedAndIndented() { + String input = "First line\nSecond line"; String expected = "First line\r\n Second line"; String out = callMultilineWrap(input); assertThat(out).isEqualTo(expected); From 884b81b2f0f4aa951d38b18ce8f832643275c542 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Tue, 16 Dec 2025 09:25:50 -0500 Subject: [PATCH 12/25] DD-2098 - allow archivalstatus calls on deaccessioned versions --- src/main/java/edu/harvard/iq/dataverse/api/Datasets.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java index 2378388c540..12dd984775d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java @@ -5006,7 +5006,7 @@ public Response getDatasetVersionArchivalStatus(@Context ContainerRequestContext } DataverseRequest req = createDataverseRequest(au); DatasetVersion dsv = getDatasetVersionOrDie(req, versionNumber, findDatasetOrDie(datasetId), uriInfo, - headers); + headers, true); if (dsv.getArchivalCopyLocation() == null) { return error(Status.NOT_FOUND, "This dataset version has not been archived"); @@ -5048,7 +5048,7 @@ public Response setDatasetVersionArchivalStatus(@Context ContainerRequestContext DataverseRequest req = createDataverseRequest(au); DatasetVersion dsv = getDatasetVersionOrDie(req, versionNumber, findDatasetOrDie(datasetId), - uriInfo, headers); + uriInfo, headers, true); if (dsv == null) { return error(Status.NOT_FOUND, "Dataset version not found"); @@ -5095,7 +5095,7 @@ public Response deleteDatasetVersionArchivalStatus(@Context ContainerRequestCont DataverseRequest req = createDataverseRequest(au); DatasetVersion dsv = getDatasetVersionOrDie(req, versionNumber, 
findDatasetOrDie(datasetId), uriInfo, - headers); + headers, true); if (dsv == null) { return error(Status.NOT_FOUND, "Dataset version not found"); } From 3076d69b2074326aee55d5d050b8c7628bdaee92 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Wed, 17 Dec 2025 15:36:16 -0500 Subject: [PATCH 13/25] set array properly --- .../java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 122ca0b6aba..473e2bab034 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -763,7 +763,6 @@ private String generateInfoFile() { logger.fine("Generating info file"); StringBuffer info = new StringBuffer(); - JsonArray contactsArray = new JsonArray(); /* Contact, and it's subfields, are terms from citation.tsv whose mapping to a formal vocabulary and label in the oremap may change * so we need to find the labels used. 
*/ @@ -775,6 +774,7 @@ private String generateInfoFile() { JsonLDTerm contactEmailTerm = oremap.getContactEmailTerm(); if (contacts.isJsonArray()) { + JsonArray contactsArray = contacts.getAsJsonArray(); for (int i = 0; i < contactsArray.size(); i++) { info.append("Contact-Name: "); JsonElement person = contactsArray.get(i); From 1a7dafa9bb71412361890d519af21a9549b7f4da Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 19 Dec 2025 09:59:26 -0500 Subject: [PATCH 14/25] DD-2212 - use configured checksum when no files are present --- .../iq/dataverse/util/bagit/BagGenerator.java | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 473e2bab034..b9de58dce90 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -75,7 +75,10 @@ import edu.harvard.iq.dataverse.pidproviders.PidUtil; import edu.harvard.iq.dataverse.settings.JvmSettings; import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.BagGeneratorThreads; + +import edu.harvard.iq.dataverse.util.SystemConfig; import edu.harvard.iq.dataverse.util.json.JsonLDTerm; +import jakarta.enterprise.inject.spi.CDI; /** * Creates an archival zipped Bag for long-term storage. 
It is intended to @@ -153,7 +156,6 @@ public class BagGenerator { public BagGenerator(OREMap oreMap, String dataciteXml) throws JsonSyntaxException, Exception { this.oremap = oreMap; this.oremapObject = oreMap.getOREMap(); - //(JsonObject) new JsonParser().parse(oreMap.getOREMap().toString()); this.dataciteXml = dataciteXml; try { @@ -189,10 +191,6 @@ public void setIgnoreHashes(boolean val) { ignorehashes = val; } - public void setDefaultCheckSumType(ChecksumType type) { - hashtype=type; - } - public static void println(String s) { System.out.println(s); System.out.flush(); @@ -278,6 +276,15 @@ public boolean generateBag(OutputStream outputStream) throws Exception { String path = sha1Entry.getKey(); sha1StringBuffer.append(sha1Entry.getValue() + " " + path); } + if(hashtype == null) { // No files - still want to send an empty manifest to nominally comply with BagIT specification requirement. + try { + //Use the current type if we can retrieve it + hashtype = CDI.current().select(SystemConfig.class).get().getFileFixityChecksumAlgorithm(); + } catch (Exception e) { + // Default to MD5 if we can't + hashtype=DataFile.ChecksumType.MD5; + } + } if (!(hashtype == null)) { String manifestName = "manifest-"; if (hashtype.equals(DataFile.ChecksumType.SHA1)) { From 7eea57c648f462e58fe1d776dfa7fdcee6c3dc68 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 19 Dec 2025 10:37:37 -0500 Subject: [PATCH 15/25] Revert "DD-2098 - allow archivalstatus calls on deaccessioned versions" This reverts commit 884b81b2f0f4aa951d38b18ce8f832643275c542. 
--- src/main/java/edu/harvard/iq/dataverse/api/Datasets.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java index 12dd984775d..2378388c540 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java @@ -5006,7 +5006,7 @@ public Response getDatasetVersionArchivalStatus(@Context ContainerRequestContext } DataverseRequest req = createDataverseRequest(au); DatasetVersion dsv = getDatasetVersionOrDie(req, versionNumber, findDatasetOrDie(datasetId), uriInfo, - headers, true); + headers); if (dsv.getArchivalCopyLocation() == null) { return error(Status.NOT_FOUND, "This dataset version has not been archived"); @@ -5048,7 +5048,7 @@ public Response setDatasetVersionArchivalStatus(@Context ContainerRequestContext DataverseRequest req = createDataverseRequest(au); DatasetVersion dsv = getDatasetVersionOrDie(req, versionNumber, findDatasetOrDie(datasetId), - uriInfo, headers, true); + uriInfo, headers); if (dsv == null) { return error(Status.NOT_FOUND, "Dataset version not found"); @@ -5095,7 +5095,7 @@ public Response deleteDatasetVersionArchivalStatus(@Context ContainerRequestCont DataverseRequest req = createDataverseRequest(au); DatasetVersion dsv = getDatasetVersionOrDie(req, versionNumber, findDatasetOrDie(datasetId), uriInfo, - headers, true); + headers); if (dsv == null) { return error(Status.NOT_FOUND, "Dataset version not found"); } From 2477cf97a2232ca68f8702dcc3706d25fa7216ec Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 19 Dec 2025 12:01:50 -0500 Subject: [PATCH 16/25] add Source-Org as a potential multiline case, remove change to Int Id --- .../edu/harvard/iq/dataverse/util/bagit/BagGenerator.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java 
b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index b9de58dce90..e78d1f3edf7 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -830,7 +830,7 @@ private String generateInfoFile() { String orgAddress = JvmSettings.BAGIT_SOURCEORG_ADDRESS.lookupOptional(String.class).orElse(""); String orgEmail = JvmSettings.BAGIT_SOURCEORG_EMAIL.lookupOptional(String.class).orElse(""); - info.append("Source-Organization: " + orgName); + info.append("Source-Organization: " + multilineWrap(orgName)); // ToDo - make configurable info.append(CRLF); @@ -880,8 +880,7 @@ private String generateInfoFile() { if (aggregation.has(JsonLDTerm.schemaOrg("includedInDataCatalog").getLabel())) { catalog = aggregation.get(JsonLDTerm.schemaOrg("includedInDataCatalog").getLabel()).getAsString(); } - catalog=catalog.trim().replaceAll("[\\r\\n:]","_"); - info.append(catalog + ":" + multilineWrap(aggregation.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString())); + info.append(multilineWrap(catalog + ":" + aggregation.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString())); info.append(CRLF); //Add a version number for our bag type - should be updated with any change to the bag content/structure From 3f3908f7ccaed5c961b6bcce057b71f4208bc656 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 19 Dec 2025 12:08:05 -0500 Subject: [PATCH 17/25] release note --- doc/release-notes/12063-ORE-and-Bag-updates.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 doc/release-notes/12063-ORE-and-Bag-updates.md diff --git a/doc/release-notes/12063-ORE-and-Bag-updates.md b/doc/release-notes/12063-ORE-and-Bag-updates.md new file mode 100644 index 00000000000..e276232f33a --- /dev/null +++ b/doc/release-notes/12063-ORE-and-Bag-updates.md @@ -0,0 +1,13 @@ +This release contains multiple updates to the OAI-ORE metadata export and archival Bag output: + +OAI-ORE +- 
now uses URIs for checksum algorithms +- a bug causing failures with deaccessioned versions when the deaccession note ("Deaccession Reason" in the UI) was null (which has been allowed via the API) has been fixed. +- the "https://schema.org/additionalType" is updated to "Dataverse OREMap Format v1.0.2" to indicate that the output has changed + +Archival Bag +- for dataset versions with no files, the (empty) manifest-.txt file created will now use the default algorithm defined by the "FileFixityChecksumAlgorithm" setting rather than always defaulting to "md5" +- a bug causing the bag-info.txt to not have information on contacts when the dataset version has more than one contact has been fixed +- values used in the bag-info.txt file that may be multi-line (with embedded CR or LF characters) are now properly indented/formatted per the BagIt specification (i.e. Internal-Sender-Identifier, External-Description, Source-Organization, Organization-Address). +- the name of the dataset is no longer used as a subdirectory under the data directory (dataset names can be long enough to cause failures when unzipping) +- a new key, "Dataverse-Bag-Version" has been added to bag-info.txt with a value "1.0", allowing tracking of changes to Dataverse's archival bag generation \ No newline at end of file From aa44c0895f4cba1dbc6b145b721f2d8b79406440 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 19 Dec 2025 13:58:34 -0500 Subject: [PATCH 18/25] use constants, pass labelLength to wrapping, start custom lineWrap --- .../iq/dataverse/util/bagit/BagGenerator.java | 284 +++++++++++++----- 1 file changed, 205 insertions(+), 79 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index e78d1f3edf7..b253f961b8c 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -33,6 +33,8 @@ import 
java.util.concurrent.TimeUnit; import java.util.logging.Level; import java.util.logging.Logger; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import java.util.zip.ZipEntry; import org.apache.commons.codec.digest.DigestUtils; @@ -44,7 +46,6 @@ import org.apache.commons.compress.archivers.zip.ZipFile; import org.apache.commons.compress.parallel.InputStreamSupplier; import org.apache.commons.compress.utils.IOUtils; -import org.apache.commons.text.WordUtils; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.config.CookieSpecs; import org.apache.http.client.config.RequestConfig; @@ -137,6 +138,20 @@ public class BagGenerator { static PrintWriter pw = null; + // Bag-info.txt field labels + private static final String CONTACT_NAME = "Contact-Name: "; + private static final String CONTACT_EMAIL = "Contact-Email: "; + private static final String SOURCE_ORGANIZATION = "Source-Organization: "; + private static final String ORGANIZATION_ADDRESS = "Organization-Address: "; + private static final String ORGANIZATION_EMAIL = "Organization-Email: "; + private static final String EXTERNAL_DESCRIPTION = "External-Description: "; + private static final String BAGGING_DATE = "Bagging-Date: "; + private static final String EXTERNAL_IDENTIFIER = "External-Identifier: "; + private static final String BAG_SIZE = "Bag-Size: "; + private static final String PAYLOAD_OXUM = "Payload-Oxum: "; + private static final String INTERNAL_SENDER_IDENTIFIER = "Internal-Sender-Identifier: "; + private static final String DATAVERSE_BAG_VERSION = "Dataverse-Bag-Version: "; + /** * This BagGenerator creates a BagIt version 1.0 * (https://tools.ietf.org/html/draft-kunze-bagit-16) compliant bag that is also @@ -149,8 +164,9 @@ public class BagGenerator { * and zipping are done in parallel, using a connection pool. The required space * on disk is ~ n+1/n of the final bag size, e.g. 125% of the bag size for a * 4-way parallel zip operation. 
- * @throws Exception - * @throws JsonSyntaxException + * + * @throws Exception + * @throws JsonSyntaxException */ public BagGenerator(OREMap oreMap, String dataciteXml) throws JsonSyntaxException, Exception { @@ -159,8 +175,13 @@ public BagGenerator(OREMap oreMap, String dataciteXml) throws JsonSyntaxExceptio this.dataciteXml = dataciteXml; try { - // Using Dataverse, all the URLs to be retrieved should be on the current server, so allowing self-signed certs and not verifying hostnames are useful in testing and - // shouldn't be a significant security issue. This should not be allowed for arbitrary OREMap sources. + /* + * Using Dataverse, all the URLs to be retrieved should be on the current + * server, so allowing self-signed certs and not verifying hostnames are useful + * in testing and shouldn't be a significant security issue. This should not be + * allowed for arbitrary OREMap sources. + * + */ SSLContextBuilder builder = new SSLContextBuilder(); try { builder.loadTrustMaterial(null, new TrustSelfSignedStrategy()); @@ -168,10 +189,11 @@ public BagGenerator(OREMap oreMap, String dataciteXml) throws JsonSyntaxExceptio e.printStackTrace(); } - SSLConnectionSocketFactory sslConnectionFactory = new SSLConnectionSocketFactory(builder.build(), NoopHostnameVerifier.INSTANCE); + SSLConnectionSocketFactory sslConnectionFactory = new SSLConnectionSocketFactory(builder.build(), + NoopHostnameVerifier.INSTANCE); Registry registry = RegistryBuilder.create() - .register("http", PlainConnectionSocketFactory.getSocketFactory()) + .register("http", PlainConnectionSocketFactory.getSocketFactory()) .register("https", sslConnectionFactory).build(); cm = new PoolingHttpClientConnectionManager(registry); @@ -190,7 +212,7 @@ public BagGenerator(OREMap oreMap, String dataciteXml) throws JsonSyntaxExceptio public void setIgnoreHashes(boolean val) { ignorehashes = val; } - + public static void println(String s) { System.out.println(s); System.out.flush(); @@ -208,18 +230,18 @@ public 
static void println(String s) { * @return success true/false */ public boolean generateBag(OutputStream outputStream) throws Exception { - File tmp = File.createTempFile("qdr-scatter-dirs", "tmp"); dirs = ScatterZipOutputStream.fileBased(tmp); - // The oremapObject is javax.json.JsonObject and we need com.google.gson.JsonObject for the aggregation object - aggregation = (JsonObject) new JsonParser().parse(oremapObject.getJsonObject(JsonLDTerm.ore("describes").getLabel()).toString()); + // The oremapObject is javax.json.JsonObject and we need + // com.google.gson.JsonObject for the aggregation object + aggregation = (JsonObject) new JsonParser() + .parse(oremapObject.getJsonObject(JsonLDTerm.ore("describes").getLabel()).toString()); String pidUrlString = aggregation.get("@id").getAsString(); - String pidString=PidUtil.parseAsGlobalID(pidUrlString).asString(); - bagID = pidString + "v." - + aggregation.get(JsonLDTerm.schemaOrg("version").getLabel()).getAsString(); - + String pidString = PidUtil.parseAsGlobalID(pidUrlString).asString(); + bagID = pidString + "v." + aggregation.get(JsonLDTerm.schemaOrg("version").getLabel()).getAsString(); + logger.info("Generating Bag: " + bagID); try { // Create valid filename from identifier and extend path with @@ -278,11 +300,11 @@ public boolean generateBag(OutputStream outputStream) throws Exception { } if(hashtype == null) { // No files - still want to send an empty manifest to nominally comply with BagIT specification requirement. 
try { - //Use the current type if we can retrieve it + // Use the current type if we can retrieve it hashtype = CDI.current().select(SystemConfig.class).get().getFileFixityChecksumAlgorithm(); } catch (Exception e) { // Default to MD5 if we can't - hashtype=DataFile.ChecksumType.MD5; + hashtype = DataFile.ChecksumType.MD5; } } if (!(hashtype == null)) { @@ -300,7 +322,7 @@ public boolean generateBag(OutputStream outputStream) throws Exception { } createFileFromString(manifestName, sha1StringBuffer.toString()); } else { - logger.warning("No Hash values (no files?) sending empty manifest to nominally comply with BagIT specification requirement"); + logger.warning("No Hash value defined sending empty manifest-md5 to nominally comply with BagIT specification requirement"); createFileFromString("manifest-md5.txt", ""); } // bagit.txt - Required by spec @@ -383,7 +405,7 @@ public boolean generateBag(String bagName, boolean temp) { // Create an output stream backed by the file bagFileOS = new FileOutputStream(bagFile); if (generateBag(bagFileOS)) { - //The generateBag call sets this.bagName to the correct value + // The generateBag call sets this.bagName to the correct value validateBagFile(bagFile); if (usetemp) { logger.fine("Moving tmp zip"); @@ -395,7 +417,7 @@ public boolean generateBag(String bagName, boolean temp) { return false; } } catch (Exception e) { - logger.log(Level.SEVERE,"Bag Exception: ", e); + logger.log(Level.SEVERE, "Bag Exception: ", e); e.printStackTrace(); logger.warning("Failure: Processing failure during Bagit file creation"); return false; @@ -452,9 +474,9 @@ public void validateBag(String bagId) { logger.info("HashMap Map contains: " + checksumMap.size() + " entries"); checkFiles(checksumMap, bagFile); } catch (IOException io) { - logger.log(Level.SEVERE,"Could not validate Hashes", io); + logger.log(Level.SEVERE, "Could not validate Hashes", io); } catch (Exception e) { - logger.log(Level.SEVERE,"Could not validate Hashes", e); + 
logger.log(Level.SEVERE, "Could not validate Hashes", e); } finally { IOUtils.closeQuietly(zf); } @@ -479,7 +501,7 @@ public File getBagFile(String bagID) throws Exception { private void validateBagFile(File bagFile) throws IOException { // Run a confirmation test - should verify all files and hashes - + // Check files calculates the hashes and file sizes and reports on // whether hashes are correct checkFiles(checksumMap, bagFile); @@ -547,28 +569,27 @@ private void processContainer(JsonObject item, String currentPath) throws IOExce } String childPath = currentPath + childTitle; JsonElement directoryLabel = child.get(JsonLDTerm.DVCore("directoryLabel").getLabel()); - if(directoryLabel!=null) { - childPath=currentPath + directoryLabel.getAsString() + "/" + childTitle; + if (directoryLabel != null) { + childPath = currentPath + directoryLabel.getAsString() + "/" + childTitle; } - String childHash = null; if (child.has(JsonLDTerm.checksum.getLabel())) { - ChecksumType childHashType = ChecksumType.fromUri( - child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@type").getAsString()); + ChecksumType childHashType = ChecksumType + .fromUri(child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@type").getAsString()); if (hashtype == null) { - //If one wasn't set as a default, pick up what the first child with one uses + // If one wasn't set as a default, pick up what the first child with one uses hashtype = childHashType; } if (hashtype != null && !hashtype.equals(childHashType)) { logger.warning("Multiple hash values in use - will calculate " + hashtype.toString() - + " hashes for " + childTitle); + + " hashes for " + childTitle); } else { childHash = child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@value").getAsString(); if (checksumMap.containsValue(childHash)) { // Something else has this hash logger.warning("Duplicate/Collision: " + child.get("@id").getAsString() + " has SHA1 Hash: " - + childHash + " in: " + bagID); + + childHash + " in: " + 
bagID); } logger.fine("Adding " + childPath + " with hash " + childHash + " to checksumMap"); checksumMap.put(childPath, childHash); @@ -736,7 +757,7 @@ private void checkFiles(HashMap shaMap, File bagFile) { } } catch (InterruptedException e) { logger.log(Level.SEVERE, "Hash Calculations interrupted", e); - } + } } catch (IOException e1) { // TODO Auto-generated catch block e1.printStackTrace(); @@ -770,39 +791,41 @@ private String generateInfoFile() { logger.fine("Generating info file"); StringBuffer info = new StringBuffer(); - /* Contact, and it's subfields, are terms from citation.tsv whose mapping to a formal vocabulary and label in the oremap may change - * so we need to find the labels used. - */ + /* + * Contact, and it's subfields, are terms from citation.tsv whose mapping to a + * formal vocabulary and label in the oremap may change so we need to find the + * labels used. + */ JsonLDTerm contactTerm = oremap.getContactTerm(); if ((contactTerm != null) && aggregation.has(contactTerm.getLabel())) { JsonElement contacts = aggregation.get(contactTerm.getLabel()); JsonLDTerm contactNameTerm = oremap.getContactNameTerm(); JsonLDTerm contactEmailTerm = oremap.getContactEmailTerm(); - + if (contacts.isJsonArray()) { JsonArray contactsArray = contacts.getAsJsonArray(); for (int i = 0; i < contactsArray.size(); i++) { - info.append("Contact-Name: "); + info.append(CONTACT_NAME); JsonElement person = contactsArray.get(i); if (person.isJsonPrimitive()) { info.append(person.getAsString()); info.append(CRLF); } else { - if(contactNameTerm != null) { - info.append(((JsonObject) person).get(contactNameTerm.getLabel()).getAsString()); - info.append(CRLF); + if (contactNameTerm != null) { + info.append(((JsonObject) person).get(contactNameTerm.getLabel()).getAsString()); + info.append(CRLF); } - if ((contactEmailTerm!=null) &&((JsonObject) person).has(contactEmailTerm.getLabel())) { - info.append("Contact-Email: "); + if ((contactEmailTerm != null) && ((JsonObject) 
person).has(contactEmailTerm.getLabel())) { + info.append(CONTACT_EMAIL); info.append(((JsonObject) person).get(contactEmailTerm.getLabel()).getAsString()); info.append(CRLF); } } } } else { - info.append("Contact-Name: "); + info.append(CONTACT_NAME); if (contacts.isJsonPrimitive()) { info.append((String) contacts.getAsString()); @@ -810,12 +833,12 @@ private String generateInfoFile() { } else { JsonObject person = contacts.getAsJsonObject(); - if(contactNameTerm != null) { - info.append(person.get(contactNameTerm.getLabel()).getAsString()); - info.append(CRLF); + if (contactNameTerm != null) { + info.append(person.get(contactNameTerm.getLabel()).getAsString()); + info.append(CRLF); } - if ((contactEmailTerm!=null) && (person.has(contactEmailTerm.getLabel()))) { - info.append("Contact-Email: "); + if ((contactEmailTerm != null) && (person.has(contactEmailTerm.getLabel()))) { + info.append(CONTACT_EMAIL); info.append(person.get(contactEmailTerm.getLabel()).getAsString()); info.append(CRLF); } @@ -826,80 +849,92 @@ private String generateInfoFile() { logger.warning("No contact info available for BagIt Info file"); } - String orgName = JvmSettings.BAGIT_SOURCE_ORG_NAME.lookupOptional(String.class).orElse("Dataverse Installation ()"); + String orgName = JvmSettings.BAGIT_SOURCE_ORG_NAME.lookupOptional(String.class) + .orElse("Dataverse Installation ()"); String orgAddress = JvmSettings.BAGIT_SOURCEORG_ADDRESS.lookupOptional(String.class).orElse(""); String orgEmail = JvmSettings.BAGIT_SOURCEORG_EMAIL.lookupOptional(String.class).orElse(""); - info.append("Source-Organization: " + multilineWrap(orgName)); + info.append(SOURCE_ORGANIZATION + multilineWrap(orgName, SOURCE_ORGANIZATION.length())); // ToDo - make configurable info.append(CRLF); - info.append("Organization-Address: " + multilineWrap(orgAddress)); + info.append(ORGANIZATION_ADDRESS + multilineWrap(orgAddress, ORGANIZATION_ADDRESS.length())); info.append(CRLF); // Not a BagIt standard name - 
info.append("Organization-Email: " + orgEmail); + info.append(ORGANIZATION_EMAIL + multilineWrap(orgEmail, ORGANIZATION_EMAIL.length())); info.append(CRLF); - info.append("External-Description: "); - - /* Description, and it's subfields, are terms from citation.tsv whose mapping to a formal vocabulary and label in the oremap may change - * so we need to find the labels used. + info.append(EXTERNAL_DESCRIPTION); + + /* + * Description, and it's subfields, are terms from citation.tsv whose mapping to + * a formal vocabulary and label in the oremap may change so we need to find the + * labels used. */ JsonLDTerm descriptionTerm = oremap.getDescriptionTerm(); JsonLDTerm descriptionTextTerm = oremap.getDescriptionTextTerm(); if (descriptionTerm == null) { logger.warning("No description available for BagIt Info file"); } else { - info.append(multilineWrap(getSingleValue(aggregation.get(descriptionTerm.getLabel()), - descriptionTextTerm.getLabel()))); + info.append(multilineWrap( + getSingleValue(aggregation.get(descriptionTerm.getLabel()), descriptionTextTerm.getLabel()), + EXTERNAL_DESCRIPTION.length())); info.append(CRLF); } - info.append("Bagging-Date: "); + info.append(BAGGING_DATE); info.append((new SimpleDateFormat("yyyy-MM-dd").format(Calendar.getInstance().getTime()))); info.append(CRLF); - info.append("External-Identifier: "); + info.append(EXTERNAL_IDENTIFIER); info.append(aggregation.get("@id").getAsString()); info.append(CRLF); - info.append("Bag-Size: "); + info.append(BAG_SIZE); info.append(byteCountToDisplaySize(totalDataSize)); info.append(CRLF); - info.append("Payload-Oxum: "); + info.append(PAYLOAD_OXUM); info.append(Long.toString(totalDataSize)); info.append("."); info.append(Long.toString(dataCount)); info.append(CRLF); - info.append("Internal-Sender-Identifier: "); + info.append(INTERNAL_SENDER_IDENTIFIER); String catalog = orgName + " Catalog"; if (aggregation.has(JsonLDTerm.schemaOrg("includedInDataCatalog").getLabel())) { catalog = 
aggregation.get(JsonLDTerm.schemaOrg("includedInDataCatalog").getLabel()).getAsString(); } - info.append(multilineWrap(catalog + ":" + aggregation.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString())); + info.append( + multilineWrap(catalog + ":" + aggregation.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(), + INTERNAL_SENDER_IDENTIFIER.length())); info.append(CRLF); - //Add a version number for our bag type - should be updated with any change to the bag content/structure - info.append("Dataverse-Bag-Version: 1.0"); + // Add a version number for our bag type - should be updated with any change to + // the bag content/structure + info.append(DATAVERSE_BAG_VERSION + "1.0"); info.append(CRLF); return info.toString(); } - static private String multilineWrap(String value) { + static private String multilineWrap(String value, int labelLength) { // Normalize line breaks and ensure all lines after the first are indented - String[] lines =value.split("\\r?\\n"); + String[] lines = value.split("\\r?\\n"); StringBuilder wrappedValue = new StringBuilder(); for (int i = 0; i < lines.length; i++) { // Skip empty lines - RFC8493 (section 7.3) doesn't allow truly empty lines, - // While trailing whitespace or whitespace-only lines appear to be allowed, it's not clear that handling them adds value (visually identical entries in Dataverse could result in entries w/ or w/o extra lines in the bag-info.txt file + // While trailing whitespace or whitespace-only lines appear to be allowed, it's + // not clear that handling them adds value (visually identical entries in + // Dataverse could result in entries w/ or w/o extra lines in the bag-info.txt + // file String line = lines[i].trim(); if (line.length() > 0) { - String wrapped = WordUtils.wrap(line, 78, CRLF + " ", true); + // Recommended line length, including the label or indents is 79, so we'll wrap + // at 78 to assure subsequent lines with a space are still < 79 total + String wrapped = lineWrap(line, 79, CRLF + 
" ", true); wrappedValue.append(wrapped); if (i < lines.length - 1) { wrappedValue.append(CRLF).append(" "); @@ -909,25 +944,117 @@ static private String multilineWrap(String value) { return wrappedValue.toString(); } + public static String lineWrap(final String str, int wrapLength, String newLineStr, final boolean wrapLongWords) { + if (str == null) { + return null; + } + if (newLineStr == null) { + newLineStr = System.lineSeparator(); + } + if (wrapLength < 1) { + wrapLength = 1; + } + String wrapOn = " "; + final Pattern patternToWrapOn = Pattern.compile(wrapOn); + final int inputLineLength = str.length(); + int offset = 0; + final StringBuilder wrappedLine = new StringBuilder(inputLineLength + 32); + int matcherSize = -1; + + while (offset < inputLineLength) { + int spaceToWrapAt = -1; + Matcher matcher = patternToWrapOn.matcher(str.substring(offset, + Math.min((int) Math.min(Integer.MAX_VALUE, offset + wrapLength + 1L), inputLineLength))); + if (matcher.find()) { + if (matcher.start() == 0) { + matcherSize = matcher.end(); + if (matcherSize != 0) { + offset += matcher.end(); + continue; + } + offset += 1; + } + spaceToWrapAt = matcher.start() + offset; + } + + // only last line without leading spaces is left + if (inputLineLength - offset <= wrapLength) { + break; + } + + while (matcher.find()) { + spaceToWrapAt = matcher.start() + offset; + } + + if (spaceToWrapAt >= offset) { + // normal case + wrappedLine.append(str, offset, spaceToWrapAt); + wrappedLine.append(newLineStr); + offset = spaceToWrapAt + 1; + + } else // really long word or URL + if (wrapLongWords) { + if (matcherSize == 0) { + offset--; + } + // wrap really long word one line at a time + wrappedLine.append(str, offset, wrapLength + offset); + wrappedLine.append(newLineStr); + offset += wrapLength; + matcherSize = -1; + } else { + // do not wrap really long word, just extend beyond limit + matcher = patternToWrapOn.matcher(str.substring(offset + wrapLength)); + if (matcher.find()) { + 
matcherSize = matcher.end() - matcher.start(); + spaceToWrapAt = matcher.start() + offset + wrapLength; + } + + if (spaceToWrapAt >= 0) { + if (matcherSize == 0 && offset != 0) { + offset--; + } + wrappedLine.append(str, offset, spaceToWrapAt); + wrappedLine.append(newLineStr); + offset = spaceToWrapAt + 1; + } else { + if (matcherSize == 0 && offset != 0) { + offset--; + } + wrappedLine.append(str, offset, str.length()); + offset = inputLineLength; + matcherSize = -1; + } + } + } + + if (matcherSize == 0 && offset < inputLineLength) { + offset--; + } + + // Whatever is left in line is short enough to just pass through + wrappedLine.append(str, offset, str.length()); + + return wrappedLine.toString(); + } + /** * Kludge - compound values (e.g. for descriptions) are sent as an array of * objects containing key/values whereas a single value is sent as one object. * For cases where multiple values are sent, create a concatenated string so * that information is not lost. * - * @param jsonElement - * - the root json object - * @param key - * - the key to find a value(s) for + * @param jsonElement - the root json object + * @param key - the key to find a value(s) for * @return - a single string */ String getSingleValue(JsonElement jsonElement, String key) { String val = ""; - if(jsonElement.isJsonObject()) { - JsonObject jsonObject=jsonElement.getAsJsonObject(); + if (jsonElement.isJsonObject()) { + JsonObject jsonObject = jsonElement.getAsJsonObject(); val = jsonObject.get(key).getAsString(); } else if (jsonElement.isJsonArray()) { - + Iterator iter = jsonElement.getAsJsonArray().iterator(); ArrayList stringArray = new ArrayList(); while (iter.hasNext()) { @@ -1127,8 +1254,7 @@ public InputStream get() { * Returns a human-readable version of the file size, where the input represents * a specific number of bytes. 
* - * @param size - * the number of bytes + * @param size the number of bytes * @return a human-readable display value (includes units) */ public static String byteCountToDisplaySize(long size) { From 8227edff5601ec95ea4f8f2851d630265f23cfd4 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 19 Dec 2025 14:28:19 -0500 Subject: [PATCH 19/25] update to handle overall 79 char length --- .../iq/dataverse/util/bagit/BagGenerator.java | 53 +++++++------ .../bagit/BagGeneratorMultilineWrapTest.java | 74 +++++++++++++++++-- 2 files changed, 101 insertions(+), 26 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index b253f961b8c..847bcc08141 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -854,20 +854,18 @@ private String generateInfoFile() { String orgAddress = JvmSettings.BAGIT_SOURCEORG_ADDRESS.lookupOptional(String.class).orElse(""); String orgEmail = JvmSettings.BAGIT_SOURCEORG_EMAIL.lookupOptional(String.class).orElse(""); - info.append(SOURCE_ORGANIZATION + multilineWrap(orgName, SOURCE_ORGANIZATION.length())); + info.append(multilineWrap(SOURCE_ORGANIZATION + orgName)); // ToDo - make configurable info.append(CRLF); - info.append(ORGANIZATION_ADDRESS + multilineWrap(orgAddress, ORGANIZATION_ADDRESS.length())); + info.append(multilineWrap(ORGANIZATION_ADDRESS + orgAddress)); info.append(CRLF); // Not a BagIt standard name - info.append(ORGANIZATION_EMAIL + multilineWrap(orgEmail, ORGANIZATION_EMAIL.length())); + info.append(multilineWrap(ORGANIZATION_EMAIL + orgEmail)); info.append(CRLF); - info.append(EXTERNAL_DESCRIPTION); - /* * Description, and it's subfields, are terms from citation.tsv whose mapping to * a formal vocabulary and label in the oremap may change so we need to find the @@ -878,9 +876,8 @@ private String generateInfoFile() { if 
(descriptionTerm == null) { logger.warning("No description available for BagIt Info file"); } else { - info.append(multilineWrap( - getSingleValue(aggregation.get(descriptionTerm.getLabel()), descriptionTextTerm.getLabel()), - EXTERNAL_DESCRIPTION.length())); + info.append(multilineWrap(EXTERNAL_DESCRIPTION + + getSingleValue(aggregation.get(descriptionTerm.getLabel()), descriptionTextTerm.getLabel()))); info.append(CRLF); } @@ -902,14 +899,12 @@ private String generateInfoFile() { info.append(Long.toString(dataCount)); info.append(CRLF); - info.append(INTERNAL_SENDER_IDENTIFIER); String catalog = orgName + " Catalog"; if (aggregation.has(JsonLDTerm.schemaOrg("includedInDataCatalog").getLabel())) { catalog = aggregation.get(JsonLDTerm.schemaOrg("includedInDataCatalog").getLabel()).getAsString(); } - info.append( - multilineWrap(catalog + ":" + aggregation.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(), - INTERNAL_SENDER_IDENTIFIER.length())); + info.append(multilineWrap(INTERNAL_SENDER_IDENTIFIER + catalog + ":" + + aggregation.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString())); info.append(CRLF); // Add a version number for our bag type - should be updated with any change to @@ -920,7 +915,7 @@ private String generateInfoFile() { } - static private String multilineWrap(String value, int labelLength) { + static private String multilineWrap(String value) { // Normalize line breaks and ensure all lines after the first are indented String[] lines = value.split("\\r?\\n"); StringBuilder wrappedValue = new StringBuilder(); @@ -932,8 +927,7 @@ static private String multilineWrap(String value, int labelLength) { // file String line = lines[i].trim(); if (line.length() > 0) { - // Recommended line length, including the label or indents is 79, so we'll wrap - // at 78 to assure subsequent lines with a space are still < 79 total + // Recommended line length, including the label or indents is 79 String wrapped = lineWrap(line, 79, CRLF + " ", true); 
wrappedValue.append(wrapped); if (i < lines.length - 1) { @@ -944,6 +938,7 @@ static private String multilineWrap(String value, int labelLength) { return wrappedValue.toString(); } + /** Adapted from Apache WordUtils.wrap() - make subsequent lines shorter by the length of any spaces in newLineStr*/ public static String lineWrap(final String str, int wrapLength, String newLineStr, final boolean wrapLongWords) { if (str == null) { return null; @@ -954,17 +949,30 @@ public static String lineWrap(final String str, int wrapLength, String newLineSt if (wrapLength < 1) { wrapLength = 1; } + + // Calculate the indent length (characters after CRLF in newLineStr) + int indentLength = 0; + int crlfIndex = newLineStr.lastIndexOf("\n"); + if (crlfIndex != -1) { + indentLength = newLineStr.length() - crlfIndex -1; + } + String wrapOn = " "; final Pattern patternToWrapOn = Pattern.compile(wrapOn); final int inputLineLength = str.length(); int offset = 0; final StringBuilder wrappedLine = new StringBuilder(inputLineLength + 32); int matcherSize = -1; + boolean isFirstLine = true; while (offset < inputLineLength) { + // Adjust wrap length based on whether this is the first line or subsequent + // lines + int currentWrapLength = isFirstLine ? 
wrapLength : (wrapLength - indentLength); + int spaceToWrapAt = -1; Matcher matcher = patternToWrapOn.matcher(str.substring(offset, - Math.min((int) Math.min(Integer.MAX_VALUE, offset + wrapLength + 1L), inputLineLength))); + Math.min((int) Math.min(Integer.MAX_VALUE, offset + currentWrapLength + 1L), inputLineLength))); if (matcher.find()) { if (matcher.start() == 0) { matcherSize = matcher.end(); @@ -978,7 +986,7 @@ public static String lineWrap(final String str, int wrapLength, String newLineSt } // only last line without leading spaces is left - if (inputLineLength - offset <= wrapLength) { + if (inputLineLength - offset <= currentWrapLength) { break; } @@ -991,6 +999,7 @@ public static String lineWrap(final String str, int wrapLength, String newLineSt wrappedLine.append(str, offset, spaceToWrapAt); wrappedLine.append(newLineStr); offset = spaceToWrapAt + 1; + isFirstLine = false; } else // really long word or URL if (wrapLongWords) { @@ -998,16 +1007,17 @@ public static String lineWrap(final String str, int wrapLength, String newLineSt offset--; } // wrap really long word one line at a time - wrappedLine.append(str, offset, wrapLength + offset); + wrappedLine.append(str, offset, currentWrapLength + offset); wrappedLine.append(newLineStr); - offset += wrapLength; + offset += currentWrapLength; matcherSize = -1; + isFirstLine = false; } else { // do not wrap really long word, just extend beyond limit - matcher = patternToWrapOn.matcher(str.substring(offset + wrapLength)); + matcher = patternToWrapOn.matcher(str.substring(offset + currentWrapLength)); if (matcher.find()) { matcherSize = matcher.end() - matcher.start(); - spaceToWrapAt = matcher.start() + offset + wrapLength; + spaceToWrapAt = matcher.start() + offset + currentWrapLength; } if (spaceToWrapAt >= 0) { @@ -1017,6 +1027,7 @@ public static String lineWrap(final String str, int wrapLength, String newLineSt wrappedLine.append(str, offset, spaceToWrapAt); wrappedLine.append(newLineStr); offset = 
spaceToWrapAt + 1; + isFirstLine = false; } else { if (matcherSize == 0 && offset != 0) { offset--; diff --git a/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java index 71ceec61adf..19d478f4b0d 100644 --- a/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java @@ -1,3 +1,4 @@ + package edu.harvard.iq.dataverse.util.bagit; import static org.assertj.core.api.Assertions.assertThat; @@ -47,7 +48,7 @@ void exactBoundary_78chars_noWrap() { @Test void longSingleWord_wrapsAt78WithIndent() { String input = "a".repeat(100); - String expected = "a".repeat(78) + "\r\n " + "a".repeat(22); + String expected = "a".repeat(79) + "\r\n " + "a".repeat(21); String out = callMultilineWrap(input); assertThat(out).isEqualTo(expected); } @@ -62,8 +63,8 @@ void multiline_input_indentsSecondAndSubsequentOriginalLines() { @Test void multiline_withLF_normalizedAndIndented() { - String input = "First line\nSecond line"; - String expected = "First line\r\n Second line"; + String input = "a".repeat(200); + String expected = "a".repeat(79) + "\r\n " + "a".repeat(78) + "\r\n " + "a".repeat(43); String out = callMultilineWrap(input); assertThat(out).isEqualTo(expected); } @@ -89,8 +90,71 @@ void longSecondLine_preservesIndentOnWraps() { String line1 = "Header"; String line2 = "b".repeat(90); String input = line1 + "\n" + line2; - String expected = "Header\r\n " + "b".repeat(78) + "\r\n " + "b".repeat(12); + String expected = "Header\r\n " + "b".repeat(79) + "\r\n " + "b".repeat(11); + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void labelLength_reducesFirstLineMaxLength() { + // With a label of length 20, first line should wrap at 78-20=58 chars + String label = "l".repeat(20); + String input = label + 
"a".repeat(150); + // First line: 58 chars, subsequent lines: 78 + String expected = label + "a".repeat(59) + "\r\n " + "a".repeat(78) + "\r\n " + "a".repeat(13); + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void labelLength_zero_behavesAsDefault() { + String input = "a".repeat(100); + String expected = "a".repeat(79) + "\r\n " + "a".repeat(21); + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void labelLength_withMultipleLines_onlyAffectsFirstLine() { + String label = "l".repeat(15); + String input = label + "a".repeat(100) + "\nSecond line content"; + // First line wraps at 79-15=64, then continues at 78 per line + // Second line starts fresh and wraps normally + String expected = label + "a".repeat(64) + "\r\n " + "a".repeat(36) + "\r\n Second line content"; + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void wrapsAtWordBoundary_notMidWord() { + // Create a string with a word boundary at position 75 + // "a" repeated 75 times, then a space, then more characters + String input = "a".repeat(75) + " " + "b".repeat(20); + // Should wrap at the space (position 75), not at position 79 + String expected = "a".repeat(75) + "\r\n " + "b".repeat(20); + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void wrapsAtWordBoundary_multipleSpaces() { + // Test with word boundary closer to the limit + String input = "a".repeat(70) + " word " + "b".repeat(20); + // Should wrap after "word" (at position 76) + String expected = "a".repeat(70) + " word\r\n " + "b".repeat(20); + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void wrapsAtWordBoundary_withLabelLength() { + String label = "l".repeat(20); + // With label length=20, first line wraps at 78-20=58 + // Create string with word boundary at position 55 + String input = label + "a".repeat(55) + 
" " + "b".repeat(30); + // Should wrap at the space (position 55) + String expected = label + "a".repeat(55) + "\r\n " + "b".repeat(30); String out = callMultilineWrap(input); assertThat(out).isEqualTo(expected); } -} +} \ No newline at end of file From d0749fcd39abefcf0ee13c6fcb042d235f6119dd Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 19 Dec 2025 14:33:41 -0500 Subject: [PATCH 20/25] wrap any other potentially long values --- .../iq/dataverse/util/bagit/BagGenerator.java | 21 +++++++------------ 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 847bcc08141..b4a80d4d9a9 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -806,40 +806,36 @@ private String generateInfoFile() { if (contacts.isJsonArray()) { JsonArray contactsArray = contacts.getAsJsonArray(); for (int i = 0; i < contactsArray.size(); i++) { - info.append(CONTACT_NAME); + JsonElement person = contactsArray.get(i); if (person.isJsonPrimitive()) { - info.append(person.getAsString()); + info.append(multilineWrap(CONTACT_NAME + person.getAsString())); info.append(CRLF); } else { if (contactNameTerm != null) { - info.append(((JsonObject) person).get(contactNameTerm.getLabel()).getAsString()); + info.append(multilineWrap(CONTACT_NAME + ((JsonObject) person).get(contactNameTerm.getLabel()).getAsString())); info.append(CRLF); } if ((contactEmailTerm != null) && ((JsonObject) person).has(contactEmailTerm.getLabel())) { - info.append(CONTACT_EMAIL); - info.append(((JsonObject) person).get(contactEmailTerm.getLabel()).getAsString()); + info.append(multilineWrap(CONTACT_EMAIL + ((JsonObject) person).get(contactEmailTerm.getLabel()).getAsString())); info.append(CRLF); } } } } else { - info.append(CONTACT_NAME); - if (contacts.isJsonPrimitive()) { - 
info.append((String) contacts.getAsString()); + info.append(multilineWrap(CONTACT_NAME + (String) contacts.getAsString())); info.append(CRLF); } else { JsonObject person = contacts.getAsJsonObject(); if (contactNameTerm != null) { - info.append(person.get(contactNameTerm.getLabel()).getAsString()); + info.append(multilineWrap(CONTACT_NAME + person.get(contactNameTerm.getLabel()).getAsString())); info.append(CRLF); } if ((contactEmailTerm != null) && (person.has(contactEmailTerm.getLabel()))) { - info.append(CONTACT_EMAIL); - info.append(person.get(contactEmailTerm.getLabel()).getAsString()); + info.append(multilineWrap(CONTACT_EMAIL + person.get(contactEmailTerm.getLabel()).getAsString())); info.append(CRLF); } } @@ -885,8 +881,7 @@ private String generateInfoFile() { info.append((new SimpleDateFormat("yyyy-MM-dd").format(Calendar.getInstance().getTime()))); info.append(CRLF); - info.append(EXTERNAL_IDENTIFIER); - info.append(aggregation.get("@id").getAsString()); + info.append(multilineWrap(EXTERNAL_IDENTIFIER + aggregation.get("@id").getAsString())); info.append(CRLF); info.append(BAG_SIZE); From 24a625f187ecb662b242d613e3fe8d48dd9a9e92 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 19 Dec 2025 15:03:14 -0500 Subject: [PATCH 21/25] cleanup deprecated code, auto-gen comments --- .../iq/dataverse/util/bagit/BagGenerator.java | 50 +++++++------------ 1 file changed, 18 insertions(+), 32 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index b4a80d4d9a9..adca7dd40c3 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -235,8 +235,8 @@ public boolean generateBag(OutputStream outputStream) throws Exception { dirs = ScatterZipOutputStream.fileBased(tmp); // The oremapObject is javax.json.JsonObject and we need // com.google.gson.JsonObject for the 
aggregation object - aggregation = (JsonObject) new JsonParser() - .parse(oremapObject.getJsonObject(JsonLDTerm.ore("describes").getLabel()).toString()); + aggregation = (JsonObject) JsonParser + .parseString(oremapObject.getJsonObject(JsonLDTerm.ore("describes").getLabel()).toString()); String pidUrlString = aggregation.get("@id").getAsString(); String pidString = PidUtil.parseAsGlobalID(pidUrlString).asString(); @@ -394,7 +394,6 @@ public boolean generateBag(OutputStream outputStream) throws Exception { public boolean generateBag(String bagName, boolean temp) { usetemp = temp; - FileOutputStream bagFileOS = null; try { File origBagFile = getBagFile(bagName); File bagFile = origBagFile; @@ -403,36 +402,36 @@ public boolean generateBag(String bagName, boolean temp) { logger.fine("Writing to: " + bagFile.getAbsolutePath()); } // Create an output stream backed by the file - bagFileOS = new FileOutputStream(bagFile); - if (generateBag(bagFileOS)) { - // The generateBag call sets this.bagName to the correct value - validateBagFile(bagFile); - if (usetemp) { - logger.fine("Moving tmp zip"); - origBagFile.delete(); - bagFile.renameTo(origBagFile); + try (FileOutputStream bagFileOS = new FileOutputStream(bagFile)) { + if (generateBag(bagFileOS)) { + // The generateBag call sets this.bagName to the correct value + validateBagFile(bagFile); + if (usetemp) { + logger.fine("Moving tmp zip"); + origBagFile.delete(); + bagFile.renameTo(origBagFile); + } + return true; + } else { + return false; } - return true; - } else { - return false; } } catch (Exception e) { logger.log(Level.SEVERE, "Bag Exception: ", e); e.printStackTrace(); logger.warning("Failure: Processing failure during Bagit file creation"); return false; - } finally { - IOUtils.closeQuietly(bagFileOS); } } + @SuppressWarnings("deprecation") public void validateBag(String bagId) { logger.info("Validating Bag"); ZipFile zf = null; InputStream is = null; try { File bagFile = getBagFile(bagId); - zf = new 
ZipFile(bagFile); + zf = ZipFile.builder().setFile(bagFile).get(); ZipArchiveEntry entry = zf.getEntry(getValidName(bagId) + "/manifest-sha1.txt"); if (entry != null) { logger.info("SHA1 hashes used"); @@ -602,9 +601,7 @@ private void processContainer(JsonObject item, String currentPath) throws IOExce try { if ((childHash == null) | ignorehashes) { // Generate missing hashInputStream inputStream = null; - InputStream inputStream = null; - try { - inputStream = getInputStreamSupplier(dataUrl).get(); + try (InputStream inputStream = getInputStreamSupplier(dataUrl).get()) { if (hashtype != null) { if (hashtype.equals(DataFile.ChecksumType.SHA1)) { @@ -621,8 +618,6 @@ private void processContainer(JsonObject item, String currentPath) throws IOExce } catch (IOException e) { logger.severe("Failed to read " + childPath); throw e; - } finally { - IOUtils.closeQuietly(inputStream); } if (childHash != null) { JsonObject childHashObject = new JsonObject(); @@ -732,9 +727,7 @@ private void createFileFromURL(final String relPath, final String uri) private void checkFiles(HashMap shaMap, File bagFile) { ExecutorService executor = Executors.newFixedThreadPool(numConnections); - ZipFile zf = null; - try { - zf = new ZipFile(bagFile); + try (ZipFile zf = ZipFile.builder().setFile(bagFile).get()) { BagValidationJob.setZipFile(zf); BagValidationJob.setBagGenerator(this); @@ -759,10 +752,7 @@ private void checkFiles(HashMap shaMap, File bagFile) { logger.log(Level.SEVERE, "Hash Calculations interrupted", e); } } catch (IOException e1) { - // TODO Auto-generated catch block e1.printStackTrace(); - } finally { - IOUtils.closeQuietly(zf); } logger.fine("Hash Validations Completed"); @@ -1153,10 +1143,8 @@ private HttpGet createNewGetRequest(URI url, String returnType) { urlString = urlString + ((urlString.indexOf('?') != -1) ? 
"&key=" : "?key=") + apiKey; request = new HttpGet(new URI(urlString)); } catch (MalformedURLException e) { - // TODO Auto-generated catch block e.printStackTrace(); } catch (URISyntaxException e) { - // TODO Auto-generated catch block e.printStackTrace(); } } else { @@ -1211,7 +1199,6 @@ public InputStream get() { } } catch (ClientProtocolException e) { tries += 5; - // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // Retry if this is a potentially temporary error such @@ -1228,7 +1215,6 @@ public InputStream get() { } } catch (URISyntaxException e) { - // TODO Auto-generated catch block e.printStackTrace(); } logger.severe("Could not read: " + uriString); From bf036f3f85066a6a148af9fff3119d8156e63d0b Mon Sep 17 00:00:00 2001 From: qqmyers Date: Mon, 22 Dec 2025 16:26:17 -0500 Subject: [PATCH 22/25] update comment --- .../java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index adca7dd40c3..3c82a9719d3 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -1035,7 +1035,7 @@ public static String lineWrap(final String str, int wrapLength, String newLineSt } /** - * Kludge - compound values (e.g. for descriptions) are sent as an array of + * Compound values (e.g. for descriptions) are sent as an array of * objects containing key/values whereas a single value is sent as one object. * For cases where multiple values are sent, create a concatenated string so * that information is not lost. 
From be65611fb9578c96ed4a1aa28e730a693b85f437 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Mon, 22 Dec 2025 16:26:54 -0500 Subject: [PATCH 23/25] add tests --- .../util/bagit/BagGeneratorInfoFileTest.java | 295 ++++++++++++++++++ 1 file changed, 295 insertions(+) create mode 100644 src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorInfoFileTest.java diff --git a/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorInfoFileTest.java b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorInfoFileTest.java new file mode 100644 index 00000000000..dbbf3241318 --- /dev/null +++ b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorInfoFileTest.java @@ -0,0 +1,295 @@ + +package edu.harvard.iq.dataverse.util.bagit; + +import edu.harvard.iq.dataverse.util.json.JsonLDTerm; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; + +import com.google.gson.JsonParser; + +import jakarta.json.Json; +import jakarta.json.JsonArrayBuilder; +import jakarta.json.JsonObject; +import jakarta.json.JsonObjectBuilder; + +import java.lang.reflect.Field; +import java.lang.reflect.Method; + +import static org.junit.jupiter.api.Assertions.*; +import static org.mockito.Mockito.*; + +public class BagGeneratorInfoFileTest { + + private BagGenerator bagGenerator; + private JsonObjectBuilder testAggregationBuilder; + + @Mock + private OREMap mockOreMap; + + @BeforeEach + public void setUp() throws Exception { + MockitoAnnotations.openMocks(this); + + // Create base test aggregation builder with required fields + testAggregationBuilder = Json.createObjectBuilder(); + testAggregationBuilder.add("@id", "doi:10.5072/FK2/TEST123"); + testAggregationBuilder.add(JsonLDTerm.schemaOrg("name").getLabel(), "Test Dataset"); + testAggregationBuilder.add(JsonLDTerm.schemaOrg("includedInDataCatalog").getLabel(), "Test Catalog"); + } + + /** + * Helper method to finalize the aggregation 
and create the BagGenerator + */ + private void initializeBagGenerator() throws Exception { + JsonObject testAggregation = testAggregationBuilder.build(); + + JsonObjectBuilder oremapJsonBuilder = Json.createObjectBuilder(); + oremapJsonBuilder.add(JsonLDTerm.ore("describes").getLabel(), testAggregation); + JsonObject oremapObject = oremapJsonBuilder.build(); + // Mock the OREMap.getOREMap() method to return the built JSON + when(mockOreMap.getOREMap()).thenReturn(oremapObject); + + // Initialize BagGenerator with test data + bagGenerator = new BagGenerator(mockOreMap, ""); + setPrivateField(bagGenerator, "aggregation", (com.google.gson.JsonObject) JsonParser + .parseString(oremapObject.getJsonObject(JsonLDTerm.ore("describes").getLabel()).toString())); + setPrivateField(bagGenerator, "totalDataSize", 1024000L); + setPrivateField(bagGenerator, "dataCount", 10L); + } + + @Test + public void testGenerateInfoFileWithSingleContact() throws Exception { + // Arrange + JsonLDTerm contactTerm = JsonLDTerm.schemaOrg("creator"); + JsonLDTerm contactNameTerm = JsonLDTerm.schemaOrg("name"); + JsonLDTerm contactEmailTerm = JsonLDTerm.schemaOrg("email"); + + when(mockOreMap.getContactTerm()).thenReturn(contactTerm); + when(mockOreMap.getContactNameTerm()).thenReturn(contactNameTerm); + when(mockOreMap.getContactEmailTerm()).thenReturn(contactEmailTerm); + + JsonObjectBuilder contactBuilder = Json.createObjectBuilder(); + contactBuilder.add(contactNameTerm.getLabel(), "John Doe"); + contactBuilder.add(contactEmailTerm.getLabel(), "john.doe@example.com"); + testAggregationBuilder.add(contactTerm.getLabel(), contactBuilder); + + initializeBagGenerator(); + + // Act + String infoFile = invokeGenerateInfoFile(); + + // Assert + assertNotNull(infoFile); + assertTrue(infoFile.contains("Contact-Name: John Doe")); + assertTrue(infoFile.contains("Contact-Email: john.doe@example.com")); + } + + @Test + public void testGenerateInfoFileWithMultipleContacts() throws Exception { + // Arrange + 
JsonLDTerm contactTerm = JsonLDTerm.schemaOrg("creator"); + JsonLDTerm contactNameTerm = JsonLDTerm.schemaOrg("name"); + JsonLDTerm contactEmailTerm = JsonLDTerm.schemaOrg("email"); + + when(mockOreMap.getContactTerm()).thenReturn(contactTerm); + when(mockOreMap.getContactNameTerm()).thenReturn(contactNameTerm); + when(mockOreMap.getContactEmailTerm()).thenReturn(contactEmailTerm); + + JsonArrayBuilder contactsBuilder = Json.createArrayBuilder(); + + JsonObjectBuilder contact1 = Json.createObjectBuilder(); + contact1.add(contactNameTerm.getLabel(), "John Doe"); + contact1.add(contactEmailTerm.getLabel(), "john.doe@example.com"); + + JsonObjectBuilder contact2 = Json.createObjectBuilder(); + contact2.add(contactNameTerm.getLabel(), "Jane Smith"); + contact2.add(contactEmailTerm.getLabel(), "jane.smith@example.com"); + + JsonObjectBuilder contact3 = Json.createObjectBuilder(); + contact3.add(contactNameTerm.getLabel(), "Bob Johnson"); + contact3.add(contactEmailTerm.getLabel(), "bob.johnson@example.com"); + + contactsBuilder.add(contact1); + contactsBuilder.add(contact2); + contactsBuilder.add(contact3); + + testAggregationBuilder.add(contactTerm.getLabel(), contactsBuilder); + + initializeBagGenerator(); + + // Act + String infoFile = invokeGenerateInfoFile(); + + // Assert + assertNotNull(infoFile); + assertTrue(infoFile.contains("Contact-Name: John Doe")); + assertTrue(infoFile.contains("Contact-Email: john.doe@example.com")); + assertTrue(infoFile.contains("Contact-Name: Jane Smith")); + assertTrue(infoFile.contains("Contact-Email: jane.smith@example.com")); + assertTrue(infoFile.contains("Contact-Name: Bob Johnson")); + assertTrue(infoFile.contains("Contact-Email: bob.johnson@example.com")); + } + + @Test + public void testGenerateInfoFileWithSingleDescription() throws Exception { + // Arrange + JsonLDTerm descriptionTerm = JsonLDTerm.schemaOrg("description"); + JsonLDTerm descriptionTextTerm = JsonLDTerm.schemaOrg("value"); + + 
when(mockOreMap.getDescriptionTerm()).thenReturn(descriptionTerm); + when(mockOreMap.getDescriptionTextTerm()).thenReturn(descriptionTextTerm); + + JsonObjectBuilder descriptionBuilder = Json.createObjectBuilder(); + descriptionBuilder.add(descriptionTextTerm.getLabel(), "This is a test dataset description."); + testAggregationBuilder.add(descriptionTerm.getLabel(), descriptionBuilder); + + initializeBagGenerator(); + + // Act + String infoFile = invokeGenerateInfoFile(); + + // Assert + assertNotNull(infoFile); + assertTrue(infoFile.contains("External-Description: This is a test dataset description.")); + } + + @Test + public void testGenerateInfoFileWithMultipleDescriptions() throws Exception { + // Arrange + JsonLDTerm descriptionTerm = JsonLDTerm.schemaOrg("description"); + JsonLDTerm descriptionTextTerm = JsonLDTerm.schemaOrg("value"); + + when(mockOreMap.getDescriptionTerm()).thenReturn(descriptionTerm); + when(mockOreMap.getDescriptionTextTerm()).thenReturn(descriptionTextTerm); + + JsonArrayBuilder descriptionsBuilder = Json.createArrayBuilder(); + + JsonObjectBuilder desc1 = Json.createObjectBuilder(); + desc1.add(descriptionTextTerm.getLabel(), "First description of the dataset."); + + JsonObjectBuilder desc2 = Json.createObjectBuilder(); + desc2.add(descriptionTextTerm.getLabel(), "Second description with additional details."); + + JsonObjectBuilder desc3 = Json.createObjectBuilder(); + desc3.add(descriptionTextTerm.getLabel(), "Third description for completeness."); + + descriptionsBuilder.add(desc1); + descriptionsBuilder.add(desc2); + descriptionsBuilder.add(desc3); + + testAggregationBuilder.add(descriptionTerm.getLabel(), descriptionsBuilder); + + initializeBagGenerator(); + + // Act + String infoFile = invokeGenerateInfoFile(); + // Assert + assertNotNull(infoFile); + // Multiple descriptions should be concatenated with commas as per getSingleValue method + assertTrue(infoFile.contains("External-Description: First description of the dataset.,Second 
description with\r\n additional details.,Third description for completeness.")); + } + + @Test + public void testGenerateInfoFileWithRequiredFields() throws Exception { + // Arrange - minimal setup with required fields already in setUp() + JsonLDTerm contactTerm = JsonLDTerm.schemaOrg("creator"); + JsonLDTerm contactNameTerm = JsonLDTerm.schemaOrg("name"); + JsonLDTerm descriptionTerm = JsonLDTerm.schemaOrg("description"); + JsonLDTerm descriptionTextTerm = JsonLDTerm.schemaOrg("value"); + + when(mockOreMap.getContactTerm()).thenReturn(contactTerm); + when(mockOreMap.getContactNameTerm()).thenReturn(contactNameTerm); + when(mockOreMap.getContactEmailTerm()).thenReturn(null); + when(mockOreMap.getDescriptionTerm()).thenReturn(descriptionTerm); + when(mockOreMap.getDescriptionTextTerm()).thenReturn(descriptionTextTerm); + + JsonObjectBuilder contactBuilder = Json.createObjectBuilder(); + contactBuilder.add(contactNameTerm.getLabel(), "Test Contact"); + testAggregationBuilder.add(contactTerm.getLabel(), contactBuilder); + + JsonObjectBuilder descriptionBuilder = Json.createObjectBuilder(); + descriptionBuilder.add(descriptionTextTerm.getLabel(), "Test description"); + testAggregationBuilder.add(descriptionTerm.getLabel(), descriptionBuilder); + + initializeBagGenerator(); + + // Act + String infoFile = invokeGenerateInfoFile(); + + // Assert + assertNotNull(infoFile); + assertTrue(infoFile.contains("Contact-Name: Test Contact")); + assertTrue(infoFile.contains("External-Description: Test description")); + assertTrue(infoFile.contains("Source-Organization:")); + assertTrue(infoFile.contains("Organization-Address:")); + assertTrue(infoFile.contains("Organization-Email:")); + assertTrue(infoFile.contains("Bagging-Date:")); + assertTrue(infoFile.contains("External-Identifier: doi:10.5072/FK2/TEST123")); + assertTrue(infoFile.contains("Bag-Size:")); + assertTrue(infoFile.contains("Payload-Oxum: 1024000.10")); + assertTrue(infoFile.contains("Internal-Sender-Identifier: Test 
Catalog:Test Dataset")); + } + + @Test + public void testGenerateInfoFileWithDifferentBagSizes() throws Exception { + // Arrange + JsonLDTerm contactTerm = JsonLDTerm.schemaOrg("creator"); + when(mockOreMap.getContactTerm()).thenReturn(contactTerm); + when(mockOreMap.getContactNameTerm()).thenReturn(null); + when(mockOreMap.getContactEmailTerm()).thenReturn(null); + when(mockOreMap.getDescriptionTerm()).thenReturn(null); + + initializeBagGenerator(); + + // Test with bytes + setPrivateField(bagGenerator, "totalDataSize", 512L); + setPrivateField(bagGenerator, "dataCount", 5L); + String infoFile1 = invokeGenerateInfoFile(); + assertTrue(infoFile1.contains("Bag-Size: 512 bytes")); + assertTrue(infoFile1.contains("Payload-Oxum: 512.5")); + + // Test with KB + setPrivateField(bagGenerator, "totalDataSize", 2048L); + setPrivateField(bagGenerator, "dataCount", 3L); + String infoFile2 = invokeGenerateInfoFile(); + assertTrue(infoFile2.contains("Bag-Size: 2.05 KB")); + assertTrue(infoFile2.contains("Payload-Oxum: 2048.3")); + + // Test with MB + setPrivateField(bagGenerator, "totalDataSize", 5242880L); + setPrivateField(bagGenerator, "dataCount", 100L); + String infoFile3 = invokeGenerateInfoFile(); + assertTrue(infoFile3.contains("Bag-Size: 5.24 MB")); + assertTrue(infoFile3.contains("Payload-Oxum: 5242880.100")); + + // Test with GB + setPrivateField(bagGenerator, "totalDataSize", 2147483648L); + setPrivateField(bagGenerator, "dataCount", 1000L); + + String infoFile4 = invokeGenerateInfoFile(); + assertTrue(infoFile4.contains("Bag-Size: 2.15 GB")); + assertTrue(infoFile4.contains("Payload-Oxum: 2147483648.1000")); + } + + // Helper methods + + /** + * Invokes the private generateInfoFile method using reflection + */ + private String invokeGenerateInfoFile() throws Exception { + Method method = BagGenerator.class.getDeclaredMethod("generateInfoFile"); + method.setAccessible(true); + return (String) method.invoke(bagGenerator); + } + + /** + * Sets a private field value 
using reflection + */ + private void setPrivateField(Object target, String fieldName, Object value) throws Exception { + Field field = BagGenerator.class.getDeclaredField(fieldName); + field.setAccessible(true); + field.set(target, value); + } +} \ No newline at end of file From 24d098a0f70dff33c6ca48049ed0e668e8809792 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Mon, 22 Dec 2025 17:00:19 -0500 Subject: [PATCH 24/25] QDR updates to apache 5, better fault tolerance for file retrieval --- .../iq/dataverse/util/bagit/BagGenerator.java | 172 +++++++++++------- 1 file changed, 111 insertions(+), 61 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 3c82a9719d3..5c5b88a521b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -4,12 +4,15 @@ import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileOutputStream; +import java.io.FilterInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; +import java.io.InterruptedIOException; import java.io.OutputStream; import java.io.PrintWriter; import java.net.MalformedURLException; +import java.net.SocketTimeoutException; import java.net.URI; import java.net.URISyntaxException; import java.nio.charset.StandardCharsets; @@ -46,23 +49,24 @@ import org.apache.commons.compress.archivers.zip.ZipFile; import org.apache.commons.compress.parallel.InputStreamSupplier; import org.apache.commons.compress.utils.IOUtils; -import org.apache.http.client.ClientProtocolException; -import org.apache.http.client.config.CookieSpecs; -import org.apache.http.client.config.RequestConfig; -import org.apache.http.client.methods.CloseableHttpResponse; -import org.apache.http.client.methods.HttpGet; -import org.apache.http.config.Registry; -import 
org.apache.http.config.RegistryBuilder; -import org.apache.http.conn.socket.ConnectionSocketFactory; -import org.apache.http.conn.socket.PlainConnectionSocketFactory; -import org.apache.http.conn.ssl.NoopHostnameVerifier; -import org.apache.http.conn.ssl.SSLConnectionSocketFactory; -import org.apache.http.conn.ssl.TrustSelfSignedStrategy; -import org.apache.http.ssl.SSLContextBuilder; -import org.apache.http.util.EntityUtils; -import org.apache.http.impl.client.CloseableHttpClient; -import org.apache.http.impl.client.HttpClients; -import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; +import org.apache.hc.client5.http.ClientProtocolException; +import org.apache.hc.client5.http.classic.methods.HttpGet; +import org.apache.hc.client5.http.config.RequestConfig; +import org.apache.hc.client5.http.impl.classic.CloseableHttpClient; +import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse; +import org.apache.hc.client5.http.impl.classic.HttpClients; +import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManager; +import org.apache.hc.client5.http.protocol.HttpClientContext; +import org.apache.hc.client5.http.socket.ConnectionSocketFactory; +import org.apache.hc.client5.http.socket.PlainConnectionSocketFactory; +import org.apache.hc.client5.http.ssl.NoopHostnameVerifier; +import org.apache.hc.client5.http.ssl.SSLConnectionSocketFactory; +import org.apache.hc.client5.http.ssl.TrustSelfSignedStrategy; +import org.apache.hc.core5.http.HttpEntity; +import org.apache.hc.core5.http.config.Registry; +import org.apache.hc.core5.http.config.RegistryBuilder; +import org.apache.hc.core5.ssl.SSLContextBuilder; +import org.apache.hc.core5.util.Timeout; import org.json.JSONArray; import com.google.gson.JsonArray; import com.google.gson.JsonElement; @@ -103,10 +107,11 @@ public class BagGenerator { private HashMap pidMap = new LinkedHashMap(); private HashMap checksumMap = new LinkedHashMap(); - private int timeout = 60; - private RequestConfig 
config = RequestConfig.custom().setConnectTimeout(timeout * 1000) - .setConnectionRequestTimeout(timeout * 1000).setSocketTimeout(timeout * 1000) - .setCookieSpec(CookieSpecs.STANDARD).build(); + private int timeout = 300; + private RequestConfig config = RequestConfig.custom() + .setConnectionRequestTimeout(Timeout.ofSeconds(timeout)) + .setResponseTimeout(Timeout.ofSeconds(timeout)) + .build(); protected CloseableHttpClient client; private PoolingHttpClientConnectionManager cm = null; @@ -131,7 +136,7 @@ public class BagGenerator { private boolean usetemp = false; - private int numConnections = 8; + private static int numConnections = 2; public static final String BAG_GENERATOR_THREADS = BagGeneratorThreads.toString(); private OREMap oremap; @@ -152,6 +157,11 @@ public class BagGenerator { private static final String INTERNAL_SENDER_IDENTIFIER = "Internal-Sender-Identifier: "; private static final String DATAVERSE_BAG_VERSION = "Dataverse-Bag-Version: "; + // Implement exponential backoff with jitter + static final long baseWaitTimeMs = 1000; // Start with 1 second + static final long maxWaitTimeMs = 30000; // Cap at 30 seconds + + /** * This BagGenerator creates a BagIt version 1.0 * (https://tools.ietf.org/html/draft-kunze-bagit-16) compliant bag that is also @@ -189,8 +199,10 @@ public BagGenerator(OREMap oreMap, String dataciteXml) throws JsonSyntaxExceptio e.printStackTrace(); } - SSLConnectionSocketFactory sslConnectionFactory = new SSLConnectionSocketFactory(builder.build(), - NoopHostnameVerifier.INSTANCE); + SSLConnectionSocketFactory sslConnectionFactory = new SSLConnectionSocketFactory( + builder.build(), + NoopHostnameVerifier.INSTANCE + ); Registry registry = RegistryBuilder.create() .register("http", PlainConnectionSocketFactory.getSocketFactory()) @@ -200,11 +212,14 @@ public BagGenerator(OREMap oreMap, String dataciteXml) throws JsonSyntaxExceptio cm.setDefaultMaxPerRoute(numConnections); cm.setMaxTotal(numConnections > 20 ? 
numConnections : 20); - client = HttpClients.custom().setConnectionManager(cm).setDefaultRequestConfig(config).build(); + client = HttpClients.custom() + .setConnectionManager(cm) + .setDefaultRequestConfig(config) + .build(); scatterZipCreator = new ParallelScatterZipCreator(Executors.newFixedThreadPool(numConnections)); } catch (NoSuchAlgorithmException | KeyManagementException e) { - logger.warning("Aint gonna work"); + logger.warning("Failed to initialize HTTP client"); e.printStackTrace(); } } @@ -424,7 +439,6 @@ public boolean generateBag(String bagName, boolean temp) { } } - @SuppressWarnings("deprecation") public void validateBag(String bagId) { logger.info("Validating Bag"); ZipFile zf = null; @@ -1156,6 +1170,10 @@ private HttpGet createNewGetRequest(URI url, String returnType) { return request; } + /** Get a stream supplier for the given URI. + * + * Caller must close the stream when done. + */ InputStreamSupplier getInputStreamSupplier(final String uriString) { return new InputStreamSupplier() { @@ -1168,56 +1186,88 @@ public InputStream get() { logger.fine("Get # " + tries + " for " + uriString); HttpGet getFile = createNewGetRequest(uri, null); - logger.finest("Retrieving " + tries + ": " + uriString); - CloseableHttpResponse response = null; + try { - response = client.execute(getFile); - // Note - if we ever need to pass an HttpClientContext, we need a new one per - // thread. 
- int statusCode = response.getStatusLine().getStatusCode(); + // Execute the request directly and keep the response open + final CloseableHttpResponse response = (CloseableHttpResponse) client.executeOpen(null, getFile, HttpClientContext.create()); + int statusCode = response.getCode(); + if (statusCode == 200) { logger.finest("Retrieved: " + uri); - return response.getEntity().getContent(); - } - logger.warning("Attempt: " + tries + " - Unexpected Status when retrieving " + uriString - + " : " + statusCode); - if (statusCode < 500) { - logger.fine("Will not retry for 40x errors"); - tries += 5; + // Return a wrapped stream that will close the response when the stream is closed + final HttpEntity entity = response.getEntity(); + if (entity != null) { + // Create a wrapper stream that closes the response when the stream is closed + return new FilterInputStream(entity.getContent()) { + @Override + public void close() throws IOException { + try { + super.close(); + } finally { + response.close(); + } + } + }; + } else { + response.close(); + logger.warning("No content in response for: " + uriString); + return null; + } } else { + // Close the response for non-200 responses + response.close(); + + logger.warning("Attempt: " + tries + " - Unexpected Status when retrieving " + uriString + + " : " + statusCode); tries++; - } - // Error handling - if (response != null) { try { - EntityUtils.consumeQuietly(response.getEntity()); - response.close(); - } catch (IOException io) { - logger.warning( - "Exception closing response after status: " + statusCode + " on " + uri); + // Calculate exponential backoff: 2^tries * baseWaitTimeMs (1 sec) + long waitTime = (long) (Math.pow(2, tries) * baseWaitTimeMs); + + // Add jitter: random value between 0-30% of the wait time + long jitter = (long) (waitTime * 0.3 * Math.random()); + waitTime = waitTime + jitter; + + // Cap the wait time at maxWaitTimeMs (30 seconds) + waitTime = Math.min(waitTime, maxWaitTimeMs); + + 
logger.fine("Sleeping for " + waitTime + "ms before retry attempt " + tries); + Thread.sleep(waitTime); + } catch (InterruptedException ie) { + logger.log(Level.SEVERE, "InterruptedException during retry delay for file: " + uriString, ie); + Thread.currentThread().interrupt(); // Restore interrupt status + tries += 5; // Skip remaining attempts } } } catch (ClientProtocolException e) { tries += 5; - e.printStackTrace(); + logger.log(Level.SEVERE, "ClientProtocolException when retrieving file: " + uriString + " (attempt " + tries + ")", e); + } catch (SocketTimeoutException e) { + // Specific handling for timeout exceptions + tries++; + logger.log(Level.SEVERE, "SocketTimeoutException when retrieving file: " + uriString + " (attempt " + tries + " of 5) - Request exceeded timeout", e); + if (tries == 5) { + logger.log(Level.SEVERE, "FINAL FAILURE: File could not be retrieved after all retries due to timeouts: " + uriString, e); + } + } catch (InterruptedIOException e) { + // Catches interruptions during I/O operations + tries += 5; + logger.log(Level.SEVERE, "InterruptedIOException when retrieving file: " + uriString + " - Operation was interrupted", e); + Thread.currentThread().interrupt(); // Restore interrupt status } catch (IOException e) { - // Retry if this is a potentially temporary error such - // as a timeout + // Retry if this is a potentially temporary error such as a timeout tries++; - logger.log(Level.WARNING, "Attempt# " + tries + " : Unable to retrieve file: " + uriString, - e); + logger.log(Level.WARNING, "IOException when retrieving file: " + uriString + " (attempt " + tries + " of 5)", e); if (tries == 5) { - logger.severe("Final attempt failed for " + uriString); + logger.log(Level.SEVERE, "FINAL FAILURE: File could not be retrieved after all retries: " + uriString, e); } - e.printStackTrace(); } } - } catch (URISyntaxException e) { - e.printStackTrace(); + logger.log(Level.SEVERE, "URISyntaxException for file: " + uriString + " - Invalid URI 
format", e); } - logger.severe("Could not read: " + uriString); + logger.severe("FAILED TO RETRIEVE FILE after all retries: " + uriString); return null; } }; @@ -1268,9 +1318,9 @@ public void setAuthenticationKey(String tokenString) { apiKey = tokenString; } - public void setNumConnections(int numConnections) { - this.numConnections = numConnections; - logger.fine("BagGenerator will use " + numConnections + " threads"); + public static void setNumConnections(int numConnections) { + BagGenerator.numConnections = numConnections; + logger.fine("All BagGenerators will use " + numConnections + " threads"); } } \ No newline at end of file From b4a3799ca82aa48e299e8d5a4351da62b4cad29c Mon Sep 17 00:00:00 2001 From: qqmyers Date: Mon, 22 Dec 2025 17:06:56 -0500 Subject: [PATCH 25/25] release note update --- doc/release-notes/12063-ORE-and-Bag-updates.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/release-notes/12063-ORE-and-Bag-updates.md b/doc/release-notes/12063-ORE-and-Bag-updates.md index e276232f33a..b2926f40c96 100644 --- a/doc/release-notes/12063-ORE-and-Bag-updates.md +++ b/doc/release-notes/12063-ORE-and-Bag-updates.md @@ -10,4 +10,5 @@ Archival Bag - a bug causing the bag-info.txt to not have information on contacts when the dataset version has more than one contact has been fixed - values used in the bag-info.txt file that may be multi-line (with embedded CR or LF characters) are now properly indented/formatted per the BagIt specification (i.e. Internal-Sender-Identifier, External-Description, Source-Organization, Organization-Address). 
- the name of the dataset is no longer used as a subdirectory under the data directory (dataset names can be long enough to cause failures when unzipping) -- a new key, "Dataverse-Bag-Version" has been added to bag-info.txt with a value "1.0", allowing tracking of changes to Dataverse's arhival bag generation \ No newline at end of file +- a new key, "Dataverse-Bag-Version" has been added to bag-info.txt with a value "1.0", allowing tracking of changes to Dataverse's archival bag generation +- improvements to file retrieval w.r.t. retries on errors or throttling \ No newline at end of file