From 1b2a5f34c927ba06157b7679ae5e2a6b8b8b045a Mon Sep 17 00:00:00 2001 From: Oliver Bertuch Date: Sat, 25 Apr 2026 01:15:34 +0200 Subject: [PATCH 01/11] feat(java,flyway): add Maven configuration for Flyway database migration for Dataverse Upgrades --- java/update-by-flyway/pom.xml | 305 ++++++++++++++++++++++++++++++++++ 1 file changed, 305 insertions(+) create mode 100644 java/update-by-flyway/pom.xml diff --git a/java/update-by-flyway/pom.xml b/java/update-by-flyway/pom.xml new file mode 100644 index 0000000..379ce98 --- /dev/null +++ b/java/update-by-flyway/pom.xml @@ -0,0 +1,305 @@ + + + 4.0.0 + + + io.gdcc + parent + 0.13.1 + + + migrate-db + 1.0.0-SNAPSHOT + pom + migrate-db + + + + true + true + true + true + + https://github.com/IQSS/dataverse.git + v6.10.1 + ${project.build.directory}/dataverse + src/main/resources/db/migration + + 12.4.0 + 0.48.1 + 42.7.10 + + true + postgres,pg_dump + false + ${docker.skip} + ${docker.skip} + ${docker.skip} + true + true + + 16 + localhost + 15432 + dataverse + ${postgresql.db} + supersecret + ${project.basedir}/db_dump.sql + sql + ${project.build.directory} + + 30 + + + + false + false + + + + + + org.apache.maven.plugins + maven-scm-plugin + 2.2.1 + + scm:git:${repo.url} + ${repo.directory} + tag + ${repo.tag} + true + + + + download + prepare-package + + checkout + + + + + + io.fabric8 + docker-maven-plugin + ${dmp.version} + + true + + + + postgres + postgres:${postgresql.server.version} + + true + + + + custom + migration + migrationdb + + + localhost:${postgresql.port}:5432 + + + ${postgresql.username} + ${postgresql.password} + ${postgresql.db} + + + + ${postgresql.dump.file}:/docker-entrypoint-initdb.d/dump.${postgresql.dump.file.ext} + + + + + + + + + 5432 + + + + + + + + pg_dump + postgres:${postgresql.server.version} + + true + + + + custom + migration + + + ${postgresql.username} + ${postgresql.password} + + + + ${postgresql.dump.target}:/dumptarget + + + + + + ${postgresql.password} + + + + + sh + -c + pg_dump -h migrationdb -p 5432 -U ${postgresql.username} -v -f /dumptarget/migrated_db_dump.sql ${postgresql.db} + + + + + + + + + + + + cleanup + clean + + stop + + + + start + prepare-package + + start + + + ${docker.skipStart} + + + postgres + + false + + + + pg_dump + + true + + + + + + + dump + verify + + start + + + ${docker.skipDump} + + + postgres + + true + + + + pg_dump + + false + + + + + + + stop + install + + stop + + + ${docker.skipStop} + + + + + + org.flywaydb + flyway-maven-plugin + ${flyway.version} + + + org.flywaydb + flyway-core + ${flyway.version} + + + org.flywaydb + flyway-database-postgresql + ${flyway.version} + + + org.postgresql + postgresql + ${postgresql.version} + + + + jdbc:postgresql://${postgresql.host}:${postgresql.port}/${postgresql.db} + ${postgresql.username} + ${postgresql.password} + __ + + + + migrate + package + + migrate + + + true + + filesystem:${repo.directory}/${repo.subpath} + filesystem:extra-migrations + + + ${migrate.keywordTermUri} + ${migrate.cleanupSavedSearches} + + + + + cleanup + package + + repair + + + + filesystem:${repo.directory}/${repo.subpath} + + + + + + + + From d33ebc87230ac109067e53745a29afd31b849da8 Mon Sep 17 00:00:00 2001 From: Oliver Bertuch Date: Sat, 25 Apr 2026 01:16:05 +0200 Subject: [PATCH 02/11] feat(java,flyway): add extra migration scripts for Dataverse upgrades --- .../V4.20.0.6__remove-ejb-timer.sql | 2 + .../V5.1.1.0__prepare-table-exttooltype.sql | 10 ++ .../V5.10.0.0__8600-orphaned-templates.sql | 115 ++++++++++++++++++ 
..._7398-cleanup-saved-searches-and-links.sql | 93 ++++++++++++++ ....0.8__7687-file-access-flag-update-bug.sql | 30 +++++ .../V5.4.1.0__prepare-table-auxfile.sql | 15 +++ .../V5.9.0.0__prepare-table-license.sql | 15 +++ ...0.0__prepare-tables-embargo-storageuse.sql | 17 +++ .../V6.1.0.0__prepare-table-extvocab.sql | 12 ++ .../V6.2.0.2__migrate_keywordTermUri.sql | 44 +++++++ .../V6.3.0.0__prepare-table-datasettype.sql | 29 +++++ ...V6.5.0.0__prepare-table-mdc-proc-state.sql | 12 ++ ...0__prepare-table-dataversefeatureditem.sql | 16 +++ ...te__2-8191-check-restricted-file-terms.sql | 69 +++++++++++ 14 files changed, 479 insertions(+) create mode 100644 java/update-by-flyway/extra-migrations/V4.20.0.6__remove-ejb-timer.sql create mode 100644 java/update-by-flyway/extra-migrations/V5.1.1.0__prepare-table-exttooltype.sql create mode 100644 java/update-by-flyway/extra-migrations/V5.10.0.0__8600-orphaned-templates.sql create mode 100644 java/update-by-flyway/extra-migrations/V5.3.0.7__7398-cleanup-saved-searches-and-links.sql create mode 100644 java/update-by-flyway/extra-migrations/V5.3.0.8__7687-file-access-flag-update-bug.sql create mode 100644 java/update-by-flyway/extra-migrations/V5.4.1.0__prepare-table-auxfile.sql create mode 100644 java/update-by-flyway/extra-migrations/V5.9.0.0__prepare-table-license.sql create mode 100644 java/update-by-flyway/extra-migrations/V6.0.0.0__prepare-tables-embargo-storageuse.sql create mode 100644 java/update-by-flyway/extra-migrations/V6.1.0.0__prepare-table-extvocab.sql create mode 100644 java/update-by-flyway/extra-migrations/V6.2.0.2__migrate_keywordTermUri.sql create mode 100644 java/update-by-flyway/extra-migrations/V6.3.0.0__prepare-table-datasettype.sql create mode 100644 java/update-by-flyway/extra-migrations/V6.5.0.0__prepare-table-mdc-proc-state.sql create mode 100644 java/update-by-flyway/extra-migrations/V6.6.0.0__prepare-table-dataversefeatureditem.sql create mode 100644 java/update-by-flyway/extra-migrations/afterMigrate__2-8191-check-restricted-file-terms.sql diff --git a/java/update-by-flyway/extra-migrations/V4.20.0.6__remove-ejb-timer.sql b/java/update-by-flyway/extra-migrations/V4.20.0.6__remove-ejb-timer.sql new file mode 100644 index 0000000..de29aa0 --- /dev/null +++ b/java/update-by-flyway/extra-migrations/V4.20.0.6__remove-ejb-timer.sql @@ -0,0 +1,2 @@ +-- +DROP TABLE IF EXISTS EJB__TIMER__TBL; diff --git a/java/update-by-flyway/extra-migrations/V5.1.1.0__prepare-table-exttooltype.sql b/java/update-by-flyway/extra-migrations/V5.1.1.0__prepare-table-exttooltype.sql new file mode 100644 index 0000000..a74c8aa --- /dev/null +++ b/java/update-by-flyway/extra-migrations/V5.1.1.0__prepare-table-exttooltype.sql @@ -0,0 +1,10 @@ +-- This is a workaround for the missing DDL statements in migration V5.1.1.2 + +CREATE TABLE IF NOT EXISTS externaltooltype +( + id SERIAL PRIMARY KEY, + type VARCHAR(255) NOT NULL, + externalTool_id BIGINT NOT NULL CONSTRAINT fk_externaltooltype_externaltool_id REFERENCES externaltool (id) +); + +CREATE INDEX IF NOT EXISTS index_externaltooltype_externaltool_id ON externaltooltype (externaltool_id); diff --git a/java/update-by-flyway/extra-migrations/V5.10.0.0__8600-orphaned-templates.sql b/java/update-by-flyway/extra-migrations/V5.10.0.0__8600-orphaned-templates.sql new file mode 100644 index 0000000..d10e0e3 --- /dev/null +++ b/java/update-by-flyway/extra-migrations/V5.10.0.0__8600-orphaned-templates.sql @@ -0,0 +1,115 @@ +-- Migration script to delete orphan templates (templates with no associated dataverse), 
see also issue #8600 + +DO +$$ + DECLARE + orphan_templates_count INTEGER; + affected_collections_count INTEGER; + row_count INTEGER; + BEGIN + -- Get the count of orphan templates + SELECT COUNT(t.id) INTO orphan_templates_count + FROM template t + WHERE dataverse_id IS NULL; + + -- Count dataverse collections that use orphan templates as default + SELECT COUNT(*) INTO affected_collections_count + FROM dataverse d + WHERE d.defaulttemplate_id IN ( + SELECT t.id FROM template t WHERE dataverse_id IS NULL + ); + + -- Only execute queries if the affected count is greater than 0 + IF orphan_templates_count > 0 THEN + RAISE NOTICE '--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---'; + RAISE NOTICE 'Found % orphan templates (templates with no associated dataverse)', orphan_templates_count; + RAISE NOTICE 'Found % dataverses using orphan templates as their default template', affected_collections_count; + + -- Please note: The below is quite a bunch of things we need to execute. When using the Admin API call + -- to delete the template, all the cascading is handled by JPA for us. We don't have that... + + -- First, update all dataverses that use orphan templates as default template + UPDATE dataverse + SET defaulttemplate_id = NULL + WHERE defaulttemplate_id IN ( + SELECT t.id FROM template t WHERE dataverse_id IS NULL + ); + + GET DIAGNOSTICS row_count = ROW_COUNT; + RAISE NOTICE 'Updated % collections to remove orphan templates set as default', row_count; + + -- Create a temporary table to keep track of datasetfields to delete + CREATE TEMPORARY TABLE temp_datasetfields_to_delete AS + SELECT id FROM datasetfield + WHERE template_id IN ( + SELECT t.id FROM template t WHERE dataverse_id IS NULL + ); + + -- Create a temporary table to keep track of compound values to delete + CREATE TEMPORARY TABLE temp_compoundvalues_to_delete AS + SELECT cv.id + FROM datasetfieldcompoundvalue cv + WHERE cv.parentdatasetfield_id IN ( + SELECT id FROM temp_datasetfields_to_delete + ); + + -- Delete mappings between datasetfield and controlledvocabularyvalues + DELETE FROM datasetfield_controlledvocabularyvalue + WHERE datasetfield_id IN ( + SELECT id FROM temp_datasetfields_to_delete + ); + + GET DIAGNOSTICS row_count = ROW_COUNT; + RAISE NOTICE 'Deleted % vocabulary mappings associated with orphan templates', row_count; + + -- Delete datasetfieldvalue records that reference the datasetfields we're going to delete + DELETE FROM datasetfieldvalue + WHERE datasetfield_id IN ( + SELECT id FROM temp_datasetfields_to_delete + ); + + GET DIAGNOSTICS row_count = ROW_COUNT; + RAISE NOTICE 'Deleted % datasetfieldvalues associated with orphan templates', row_count; + + -- Break the circular reference by setting parentdatasetfieldcompoundvalue_id to NULL + UPDATE datasetfield + SET parentdatasetfieldcompoundvalue_id = NULL + WHERE parentdatasetfieldcompoundvalue_id IN ( + SELECT id FROM temp_compoundvalues_to_delete + ); + + GET DIAGNOSTICS row_count = ROW_COUNT; + RAISE NOTICE 'Updated % datasetfields to remove references to compound values', row_count; + + -- Now we can safely delete the compound values + DELETE FROM datasetfieldcompoundvalue + WHERE id IN ( + SELECT id FROM temp_compoundvalues_to_delete + ); + + GET DIAGNOSTICS row_count = ROW_COUNT; + RAISE NOTICE 'Deleted % datasetfieldcompoundvalues associated with orphan templates', row_count; + + -- Delete datasetfields that reference orphan templates + DELETE FROM datasetfield + WHERE id IN ( + SELECT id FROM 
temp_datasetfields_to_delete + ); + + GET DIAGNOSTICS row_count = ROW_COUNT; + RAISE NOTICE 'Deleted % datasetfields referencing orphan templates', row_count; + + -- Clean up temporary tables + DROP TABLE temp_datasetfields_to_delete; + DROP TABLE temp_compoundvalues_to_delete; + + -- Then finally delete all orphan templates + DELETE FROM template + WHERE dataverse_id IS NULL; + + GET DIAGNOSTICS row_count = ROW_COUNT; + RAISE NOTICE 'Deleted % orphan templates', row_count; + RAISE NOTICE '--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---'; + END IF; + END +$$; \ No newline at end of file diff --git a/java/update-by-flyway/extra-migrations/V5.3.0.7__7398-cleanup-saved-searches-and-links.sql b/java/update-by-flyway/extra-migrations/V5.3.0.7__7398-cleanup-saved-searches-and-links.sql new file mode 100644 index 0000000..ac3eab9 --- /dev/null +++ b/java/update-by-flyway/extra-migrations/V5.3.0.7__7398-cleanup-saved-searches-and-links.sql @@ -0,0 +1,93 @@ +-- +DO +$$ + DECLARE + should_delete BOOLEAN := CASE WHEN UPPER('${V5_4_cleanup_searches_and_links}') = 'TRUE' THEN TRUE ELSE FALSE END; + affected_searches_count INTEGER; + affected_linked_datasets_count INTEGER; + affected_linked_collections_count INTEGER; + row_count INTEGER; + BEGIN + -- Get the count of rows that match the criteria + SELECT COUNT(*) INTO affected_searches_count + from savedsearch ss, savedsearchfilterquery ssfq, dataverselinkingdataverse dld + where ss.id = ssfq.savedsearch_id + and ss.definitionpoint_id = dld.linkingdataverse_id + and dld.dataverse_id = rtrim(reverse(split_part(reverse(ssfq.filterquery),'/',1)),'"')::integer + and ss.query='*' + and ssfq.filterquery like 'subtreePaths%'; + + select COUNT(*) INTO affected_linked_datasets_count + from datasetlinkingdataverse dld, dvobject dvo, dataverselinkingdataverse dvld + where dld.dataset_id = dvo.id + and dld.linkingdataverse_id = dvld.linkingdataverse_id + and dvo.owner_id = dvld.dataverse_id; + + select COUNT(*) INTO affected_linked_collections_count + from dataverselinkingdataverse dld, dvobject dvo, dataverselinkingdataverse dvld + where dld.dataverse_id = dvo.id + and dld.linkingdataverse_id = dvld.linkingdataverse_id + and dvo.owner_id = dvld.dataverse_id; + + -- Only show hint if count is greater than 0 + IF affected_searches_count > 0 OR affected_linked_datasets_count > 0 OR affected_linked_collections_count > 0 THEN + RAISE NOTICE '--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---'; + RAISE NOTICE 'Found % saved searches affected by issue #7398', affected_searches_count; + RAISE NOTICE 'Found % dataset links affected by issue #7398', affected_linked_datasets_count; + RAISE NOTICE 'Found % collections links affected by issue #7398', affected_linked_collections_count; + + IF should_delete THEN + RAISE NOTICE 'Cleaning up Saved Searches and Linked Datasets as requested by -Dmigrate.cleanupSavedSearches'; + + -- delete the saved searches identified using the ss_for_deletion query + create temporary table delete_ss on commit drop as ( + Select ss.id + from savedsearch ss, savedsearchfilterquery ssfq, dataverselinkingdataverse dld + where ss.id = ssfq.savedsearch_id + and ss.definitionpoint_id = dld.linkingdataverse_id + and dld.dataverse_id = rtrim(reverse(split_part(reverse(ssfq.filterquery),'/',1)),'"')::integer + and ss.query='*' + and ssfq.filterquery like 'subtreePaths%' + ); + + GET DIAGNOSTICS row_count = ROW_COUNT; + + delete from savedsearchfilterquery where savedsearch_id in (select id from 
delete_ss); + delete from savedsearch where id in (select id from delete_ss); + + RAISE NOTICE 'Deleted % saved searches', row_count; + + COMMIT; + + -- delete linked objects identified using the query in dld_for_deletion + delete from datasetlinkingdataverse where id in ( + select dld.id + from datasetlinkingdataverse dld, dvobject dvo, dataverselinkingdataverse dvld + where dld.dataset_id = dvo.id + and dld.linkingdataverse_id = dvld.linkingdataverse_id + and dvo.owner_id = dvld.dataverse_id + ); + + GET DIAGNOSTICS row_count = ROW_COUNT; + RAISE NOTICE 'Deleted % linked datasets', row_count; + + delete from dataverselinkingdataverse where id in ( + select dld.id + from dataverselinkingdataverse dld, dvobject dvo, dataverselinkingdataverse dvld + where dld.dataverse_id = dvo.id + and dld.linkingdataverse_id = dvld.linkingdataverse_id + and dvo.owner_id = dvld.dataverse_id + ); + + GET DIAGNOSTICS row_count = ROW_COUNT; + RAISE NOTICE 'Deleted % linked collections', row_count; + + COMMIT; + + ELSE + RAISE NOTICE 'Auto-migrate these using `mvn -Dmigrate.cleanupSavedSearches ...` (re-execute migrations)'; + END IF; + RAISE NOTICE '--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---'; + END IF; + END +$$; diff --git a/java/update-by-flyway/extra-migrations/V5.3.0.8__7687-file-access-flag-update-bug.sql b/java/update-by-flyway/extra-migrations/V5.3.0.8__7687-file-access-flag-update-bug.sql new file mode 100644 index 0000000..60f87ac --- /dev/null +++ b/java/update-by-flyway/extra-migrations/V5.3.0.8__7687-file-access-flag-update-bug.sql @@ -0,0 +1,30 @@ +-- this query will identify datasets where a superuser has run the Curate command and the update included a change to +-- the fileaccessrequest flag, resulting in the file access request updates not being reflected in the published version +DO +$$ + DECLARE + -- should_migrate BOOLEAN := CASE WHEN UPPER('...') = 'TRUE' THEN TRUE ELSE FALSE END; + affected_count INTEGER; + BEGIN + -- Get the count of rows that match the criteria + SELECT COUNT(*) INTO affected_count + from datasetversion dv, termsofuseandaccess ta, dataset da + where dv.dataset_id=da.id + and dv.termsofuseandaccess_id=ta.id + and ta.fileaccessrequest != da.fileaccessrequest + and dv.versionstate='RELEASED' + and dv.releasetime in (select max(releasetime) + from datasetversion + where dataset_id=da.id); + + -- Only show hint if count is greater than 0 + IF affected_count > 0 THEN + RAISE NOTICE '--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---'; + RAISE NOTICE 'Found % datasets affected by issue #7687', affected_count; + RAISE NOTICE 'For now, please fix these manually. See Dataverse v5.4 release notes about #7687.'; + RAISE NOTICE '--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---'; + END IF; + + -- TODO: an opt-in migration to fix them all would be nice! 
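-- A rough, commented-out sketch of what such an opt-in fix could look like,
-- built from the detection query above. Untested assumption: the published
-- version's terms should be synced to the dataset's current flag; verify
-- against the Dataverse v5.4 release notes before enabling anything like this.
--
-- UPDATE termsofuseandaccess ta
-- SET fileaccessrequest = da.fileaccessrequest
-- FROM datasetversion dv, dataset da
-- WHERE dv.termsofuseandaccess_id = ta.id
--   AND dv.dataset_id = da.id
--   AND ta.fileaccessrequest != da.fileaccessrequest
--   AND dv.versionstate = 'RELEASED'
--   AND dv.releasetime IN (SELECT MAX(releasetime) FROM datasetversion WHERE dataset_id = da.id);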
+ END; +$$; \ No newline at end of file diff --git a/java/update-by-flyway/extra-migrations/V5.4.1.0__prepare-table-auxfile.sql b/java/update-by-flyway/extra-migrations/V5.4.1.0__prepare-table-auxfile.sql new file mode 100644 index 0000000..071c80c --- /dev/null +++ b/java/update-by-flyway/extra-migrations/V5.4.1.0__prepare-table-auxfile.sql @@ -0,0 +1,15 @@ +-- This is a workaround for the missing auxiliaryfile table in migration V5.4.1.1 + +create table if not exists auxiliaryfile +( + id serial primary key, + checksum varchar(255), + contenttype varchar(255), + filesize bigint, + formattag varchar(255), + formatversion varchar(255), + ispublic boolean, + origin varchar(255), + type varchar(255), + datafile_id bigint not null constraint fk_auxiliaryfile_datafile_id references dvobject +); diff --git a/java/update-by-flyway/extra-migrations/V5.9.0.0__prepare-table-license.sql b/java/update-by-flyway/extra-migrations/V5.9.0.0__prepare-table-license.sql new file mode 100644 index 0000000..df66ccf --- /dev/null +++ b/java/update-by-flyway/extra-migrations/V5.9.0.0__prepare-table-license.sql @@ -0,0 +1,15 @@ +-- This is a workaround for the missing license table in migration V5.9.0.1 + +create table if not exists license +( + id serial primary key, + active boolean not null, + iconurl text, + isdefault boolean not null, + name text constraint unq_license_0 unique, + shortdescription text, + sortorder bigint default 0 not null, + uri text constraint unq_license_1 unique +); + +create index if not exists license_sortorder_id on license (sortorder, id); diff --git a/java/update-by-flyway/extra-migrations/V6.0.0.0__prepare-tables-embargo-storageuse.sql b/java/update-by-flyway/extra-migrations/V6.0.0.0__prepare-tables-embargo-storageuse.sql new file mode 100644 index 0000000..9d57a82 --- /dev/null +++ b/java/update-by-flyway/extra-migrations/V6.0.0.0__prepare-tables-embargo-storageuse.sql @@ -0,0 +1,17 @@ +-- This is a workaround for the missing embargo table in migration V6.0.0.2 +create table if not exists embargo +( + id serial primary key, + dateavailable date not null, + reason text +); + +-- This is a workaround for the missing storageuse table in migration V6.0.0.5 +create table if not exists storageuse +( + id serial primary key, + sizeinbytes bigint, + dvobjectcontainer_id bigint not null constraint fk_storageuse_dvobjectcontainer_id references dvobject +); +create index if not exists index_storageuse_dvobjectcontainer_id on storageuse (dvobjectcontainer_id); + diff --git a/java/update-by-flyway/extra-migrations/V6.1.0.0__prepare-table-extvocab.sql b/java/update-by-flyway/extra-migrations/V6.1.0.0__prepare-table-extvocab.sql new file mode 100644 index 0000000..7614603 --- /dev/null +++ b/java/update-by-flyway/extra-migrations/V6.1.0.0__prepare-table-extvocab.sql @@ -0,0 +1,12 @@ +-- This is a workaround for the missing externalvocabularyvalue table in migration V6.1.0.3 +create table if not exists externalvocabularyvalue +( + id serial primary key, + lastupdatedate timestamp, + uri text constraint externalvocabularvalue_uri_key unique, + value text +); + +create index if not exists index_externalvocabularyvalue_uri on externalvocabularyvalue (uri); + + diff --git a/java/update-by-flyway/extra-migrations/V6.2.0.2__migrate_keywordTermUri.sql b/java/update-by-flyway/extra-migrations/V6.2.0.2__migrate_keywordTermUri.sql new file mode 100644 index 0000000..eaefa85 --- /dev/null +++ b/java/update-by-flyway/extra-migrations/V6.2.0.2__migrate_keywordTermUri.sql @@ -0,0 +1,44 @@ +-- See also +DO 
+$$ + DECLARE + should_migrate BOOLEAN := CASE WHEN UPPER('${V6_3_migrate_keywordTermUri}') = 'TRUE' THEN TRUE ELSE FALSE END; + keyword_count INTEGER; + BEGIN + -- Get the count of rows that match the criteria + SELECT COUNT(*) INTO keyword_count + FROM datasetfieldvalue dfv INNER JOIN datasetfield df ON df.id = dfv.datasetfield_id + WHERE df.datasetfieldtype_id = (SELECT id FROM datasetfieldtype WHERE name = 'keywordValue') + AND value ILIKE 'http%'; + + -- Only show hint if count is greater than 0 + IF keyword_count > 0 THEN + RAISE NOTICE '--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---'; + RAISE NOTICE 'Found % keywordValue metadata fields starting with "http"', keyword_count; + RAISE NOTICE 'See release notes of Dataverse 6.3 to learn more about migrating these to the keywordTermURI field.'; + + /* + To execute this migration, the keywordTermURI metadata field needs to be loaded. + So far, we never added metadata fields using a migration. Maybe it's fine to keep it a manual task for now. + + IF should_migrate THEN + RAISE NOTICE 'Migrating keywordValue fields with http... to keywordTermUri as requested by -Dmigrate.keywordTermUri'; + UPDATE datasetfield df + SET datasetfieldtype_id = (SELECT id FROM datasetfieldtype WHERE name = 'keywordTermURI') + FROM datasetfieldvalue dfv + WHERE dfv.datasetfield_id = df.id + AND df.datasetfieldtype_id = (SELECT id FROM datasetfieldtype WHERE name = 'keywordValue') + AND dfv.value ILIKE 'http%'; + + GET DIAGNOSTICS keyword_count = ROW_COUNT; + RAISE NOTICE 'Updated % rows', keyword_count; + + ELSE + RAISE NOTICE 'Auto-migrate these into keywordTermUri using `mvn -Dmigrate.keywordTermUri ...` (re-execute migrations)'; + END IF; + */ + + RAISE NOTICE '--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---'; + END IF; + END +$$; \ No newline at end of file diff --git a/java/update-by-flyway/extra-migrations/V6.3.0.0__prepare-table-datasettype.sql b/java/update-by-flyway/extra-migrations/V6.3.0.0__prepare-table-datasettype.sql new file mode 100644 index 0000000..e326c1e --- /dev/null +++ b/java/update-by-flyway/extra-migrations/V6.3.0.0__prepare-table-datasettype.sql @@ -0,0 +1,29 @@ +-- This is a workaround for the missing dataset type tables in migration V6.3.0.3 + +create table if not exists datasettype +( + id serial primary key, + name varchar(255) not null constraint unq_datasettype_0 unique +); + +create table if not exists datasettype_licenses +( + datasettype_id bigint not null + constraint fk_datasettype_license_datasettype_id + references datasettype, + licenses_id bigint not null + constraint fk_datasettype_license_licenses_id + references license, + primary key (datasettype_id, licenses_id) +); + +create table if not exists datasettype_metadatablocks +( + datasettype_id bigint not null + constraint fk_datasettype_metadatablock_datasettype_id + references datasettype, + metadatablocks_id bigint not null + constraint fk_datasettype_metadatablock_metadatablocks_id + references public.metadatablock, + primary key (datasettype_id, metadatablocks_id) +); \ No newline at end of file diff --git a/java/update-by-flyway/extra-migrations/V6.5.0.0__prepare-table-mdc-proc-state.sql b/java/update-by-flyway/extra-migrations/V6.5.0.0__prepare-table-mdc-proc-state.sql new file mode 100644 index 0000000..0c2f137 --- /dev/null +++ b/java/update-by-flyway/extra-migrations/V6.5.0.0__prepare-table-mdc-proc-state.sql @@ -0,0 +1,12 @@ +-- This is a workaround for the missing MakeDataCount tables in 
migration V6.5.0.10 + +CREATE TABLE IF NOT EXISTS MakeDataCountProcessState +( + id SERIAL PRIMARY KEY, + yearMonth VARCHAR(255) NOT NULL, + state INTEGER NOT NULL, + stateChangeTimestamp TIMESTAMP WITHOUT TIME ZONE, + server VARCHAR(255) +); + +CREATE INDEX IF NOT EXISTS index_makedatacountprocessstate_yearmonth ON MakeDataCountProcessState (yearMonth); diff --git a/java/update-by-flyway/extra-migrations/V6.6.0.0__prepare-table-dataversefeatureditem.sql b/java/update-by-flyway/extra-migrations/V6.6.0.0__prepare-table-dataversefeatureditem.sql new file mode 100644 index 0000000..15ca0d6 --- /dev/null +++ b/java/update-by-flyway/extra-migrations/V6.6.0.0__prepare-table-dataversefeatureditem.sql @@ -0,0 +1,16 @@ +-- This is a workaround for the missing tables in migration V6.6.0.2 + +create table if not exists dataversefeatureditem +( + id serial primary key, + content text, + displayorder integer not null, + imagefilename varchar(255), + type text, + dataverse_id bigint not null constraint fk_dataversefeatureditem_dataverse_id references dvobject, + dvobject_id bigint constraint fk_dataversefeatureditem_dvobject_id references dvobject +); + +create index if not exists index_dataversefeatureditem_displayorder on dataversefeatureditem (displayorder); +-- It's unclear why EclipseLink generated this particular index and with this name... Just going along with it. +create index if not exists index_harvestingclient_harvesttype on dataversefeatureditem (id); diff --git a/java/update-by-flyway/extra-migrations/afterMigrate__2-8191-check-restricted-file-terms.sql b/java/update-by-flyway/extra-migrations/afterMigrate__2-8191-check-restricted-file-terms.sql new file mode 100644 index 0000000..62e39ac --- /dev/null +++ b/java/update-by-flyway/extra-migrations/afterMigrate__2-8191-check-restricted-file-terms.sql @@ -0,0 +1,69 @@ +-- this query will identify datasets where at least one file does not have either terms of access or +-- request access enabled, and will include owner information for those datasets + +DO +$$ + DECLARE + affected_dataset RECORD; + row_count INTEGER := 0; + BEGIN + -- Create a temporary table to store the results + CREATE TEMPORARY TABLE IF NOT EXISTS affected_dataset_results ( + email TEXT, + name TEXT, + dataset_url TEXT + ); + + -- Insert the query results into the temporary table + INSERT INTO affected_dataset_results + select au.email, + concat(au.firstname, ' ', au.lastname) as name, + concat('dx.doi.org/' , dvo.authority , '/' , dvo.identifier) as dataset_url + from roleassignment ra, dataverserole dvr, + authenticateduser au, dvobject dvo + where + au.useridentifier = rtrim(substring(ra.assigneeidentifier, 2, 100)) + and dvo.id = ra.definitionpoint_id + and + ra.role_id = dvr.id and + dvr.alias in ( + 'fullContributor', + 'dsContributor', + 'contributor', + 'admin', + 'curator' + ) and + ra.definitionpoint_id in ( + select dvo.id from datasetversion v + join termsofuseandaccess ua on ua.id = v.termsofuseandaccess_id + join filemetadata fm on v.id = fm.datasetversion_id + join datafile f on f.id = fm.datafile_id + join dvobject dvo on v.dataset_id = dvo.id + where ua.fileaccessrequest = false and ua.termsofaccess isnull + and f.restricted = true + ); + + -- Get the number of affected rows + GET DIAGNOSTICS row_count = ROW_COUNT; + + -- Print notice if there are affected datasets + IF row_count > 0 THEN + RAISE NOTICE '--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---'; + RAISE NOTICE 'Found % dataset(s) with files lacking proper access 
settings.', row_count;
RAISE NOTICE 'For details see Dataverse 5.11 release notes and issue 8191.';

-- Loop through affected datasets and print details
FOR affected_dataset IN SELECT * FROM affected_dataset_results LOOP
RAISE NOTICE 'Dataset %, Owner % (%)',
affected_dataset.dataset_url,
affected_dataset.name,
affected_dataset.email;
END LOOP;

RAISE NOTICE '--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---';
END IF;

-- Clean up temporary table
DROP TABLE IF EXISTS affected_dataset_results;
END;
$$;
From b5af5bcd7e3a01a0619848cf6ef9d07b6d8c1d29 Mon Sep 17 00:00:00 2001
From: Oliver Bertuch
Date: Sat, 25 Apr 2026 01:16:49 +0200
Subject: [PATCH 03/11] docs(java,flyway): add Flyway migration README for experimental Dataverse upgrades

---
 README.md | 5 +
 java/update-by-flyway/.gitignore | 3 +
 java/update-by-flyway/README.md | 236 +++++++++++++++++++++++++++++++
 3 files changed, 244 insertions(+)
 create mode 100644 java/update-by-flyway/.gitignore
 create mode 100644 java/update-by-flyway/README.md

diff --git a/README.md b/README.md
index afad2ca..fad9858 100644
--- a/README.md
+++ b/README.md
@@ -19,6 +19,7 @@ The repository is organized by programming language and tool:
- `shell/`: 🐚 Shell recipes
- `dvcli/`: 🛠️ DVCLI recipes
- `js/`: 🌐 JavaScript recipes
+- `java/`: ☕ Java recipes

Each language directory contains specific recipes organized by functionality or use case.
@@ -53,6 +54,10 @@ In the following sections, you can find a list of available recipes for each lan

- [Metadata Block Management](js/metadatablocks)

+### Java ☕
+
+- [Dataverse Update By Flyway Migrations](java/update-by-flyway)

## 🤝 Contributing

We welcome contributions! To add a new recipe:
diff --git a/java/update-by-flyway/.gitignore b/java/update-by-flyway/.gitignore
new file mode 100644
index 0000000..4352e10
--- /dev/null
+++ b/java/update-by-flyway/.gitignore
@@ -0,0 +1,3 @@
+.flattened-pom.xml
+target/
+/*.sql
\ No newline at end of file
diff --git a/java/update-by-flyway/README.md b/java/update-by-flyway/README.md
new file mode 100644
index 0000000..e95ee21
--- /dev/null
+++ b/java/update-by-flyway/README.md
@@ -0,0 +1,236 @@
# Dataverse Update By Flyway Migrations

## Description 📝

This recipe helps Dataverse administrators and operators upgrade a Dataverse installation by applying Flyway migrations directly against a local PostgreSQL database restored from a SQL dump of the installation.

It is intended for situations where you want to test or perform an upgrade without manually stepping through every intermediate Dataverse release one by one.
The required extra migrations are important because some database structures normally created during Dataverse setup are not present when working only from migration scripts.
These extra scripts help bridge that gap so later migrations can succeed.

The recipe works by:

1. checking out a specific tagged version of the Dataverse source repository (Maven SCM Plugin),
2. starting a local PostgreSQL database in Docker (Maven Docker Plugin),
3. restoring a provided database dump into that database,
4. running the Dataverse Flyway migration scripts while including the required extra migration scripts included with this recipe (Maven Flyway Plugin),
5. exporting the migrated database again as a new SQL dump.

The main output artifact is:

- `target/migrated_db_dump.sql`

Note: this may also help you upgrade to newer PostgreSQL versions by restoring the migrated dump.
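If you still need to create the input dump, a plain SQL dump produced by `pg_dump` is what the recipe expects.
A minimal sketch (host, port, user, and database name are assumptions; substitute your installation's values):

```bash
# Create a plain-format SQL dump of the installation database.
# By default, the recipe looks for db_dump.sql next to the pom.xml.
pg_dump -h localhost -p 5432 -U dataverse -f db_dump.sql dataverse
```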
## Prerequisites ✔️

Before using this recipe, make sure you have:

- **Java** installed
- **Maven** installed
- **Docker** installed and running
- A **PostgreSQL dump file** created with `pg_dump` (from your installation)
- Enough disk space for:
  - the checked-out Dataverse repository
  - a temporary PostgreSQL container
  - the restored database
  - the final migrated dump

### Important limitations

- This recipe is *experimental*
- This recipe is *not* an officially supported Dataverse upgrade path (yet...)
- It is intended for *local/offline testing and migration experiments*
- The minimum supported Dataverse version is *v4.12*
- You should work from a *database snapshot or dump*, not a live production database!

Because all migration work happens locally in Docker, it is generally safe to experiment with a production snapshot as long as you understand that this recipe is not an official upgrade mechanism.

### Recommendations 🗒️

- Always start from a reliable `pg_dump` backup
- Prefer testing with a copy of production data, not the live database itself
- Review migration notices carefully, especially when enabling cleanup flags
- Keep in mind that this recipe is a practical migration aid, not an official Dataverse-supported upgrade method
- Validate the migrated database before using it further in any environment

## Installation instructions 🔧

1. Clone this recipe repository and change your working directory to `java/update-by-flyway`.
2. Make sure Docker is running.
3. Place your PostgreSQL dump where the recipe can read it or override the dump file location with a Maven property (see below).

## Usage examples 💻

### Basic usage

Run the full migration workflow:
```bash
mvn install
```

This will:

- check out the configured Dataverse tag
- start PostgreSQL in Docker
- restore your dump
- apply Flyway migrations
- apply the required extra migrations shipped with this recipe
- export the migrated database as `target/migrated_db_dump.sql`

### Cleanup

Stop containers and clean generated files:
```bash
mvn clean
```

### Use a different Dataverse tag

To migrate using another Dataverse release tag:
```bash
mvn install -Drepo.tag=v6.10.1
```

### Use a different input dump file

```bash
mvn install -Dpostgresql.dump.file=/path/to/db_dump.sql
```

### Change PostgreSQL connection settings

```bash
mvn install \
  -Dpostgresql.host=localhost \
  -Dpostgresql.port=15432 \
  -Dpostgresql.db=dataverse \
  -Dpostgresql.username=dataverse \
  -Dpostgresql.password=supersecret
```

Note: by setting `-Ddocker.skip` and pointing the Postgres connection settings at a live database, you can run the migrations
against a non-local database, too.


### Enable cleanup of affected saved searches and links

This recipe includes a required extra migration for handling data related to Dataverse issue #7398.
By default, the script detects affected rows and prints notices.
To actually perform the cleanup automatically, enable:

```bash
mvn install -Dmigrate.cleanupSavedSearches=true
```

This can remove:

- affected saved searches
- affected linked datasets
- affected linked collections

Use this only if you understand the data impact and want the migration to perform the cleanup instead of only reporting it.

### Enable keyword term URI migration handling

This recipe also includes a required extra migration related to `keywordValue` values that look like URLs and may need to become `keywordTermURI` values.
+ +Run with: +```bash +mvn install -Dmigrate.keywordTermUri=true +``` + +At the moment, this migration mainly serves as a detection and guidance step. +It checks for affected metadata values and emits notices explaining the situation. +This is useful when reviewing upgrade issues around Dataverse 6.3 and related metadata handling. + +## Important Maven properties ⚙️ + +These are the most useful properties to override when running the recipe. + +### Dataverse source selection + +- `repo.url` + Git URL of the Dataverse repository to check out (default: `https://github.com/IQSS/dataverse.git`) + +- `repo.tag` + Dataverse Git tag to use for migration scripts (default: `v6.10.1`) + +- `repo.directory` + Local checkout directory for the Dataverse repository (default: `${project.build.directory}/dataverse`) + +- `repo.subpath` + Path inside the checked-out repository that contains the Flyway migrations (default: `src/main/resources/db/migration`) + +### Input and output dump handling + +- `postgresql.dump.file` + Path to the input PostgreSQL dump file (default `db_dump.sql`) + +- `postgresql.dump.file.ext` + Extension of the input dump file (default: `sql`) + +- `postgresql.dump.target` + Directory where the migrated dump is written (default: `target`). + The resulting migrated dump is written as: `${postgresql.dump.target}/migrated_db_dump.sql` + +### PostgreSQL settings + +- `postgresql.server.version` + PostgreSQL Docker image version to use (default: `16`) + +- `postgresql.host` + Host used by Flyway to connect (default: `localhost`) + +- `postgresql.port` + Local port mapped to the Docker PostgreSQL container (default: `15432`) + +- `postgresql.db` + Database name (default: `dataverse`) + +- `postgresql.username` + Database user (default: `dataverse`) + +- `postgresql.password` + Database password (default: `supersecret`) + +- `postgresql.waitForSec` + Time to wait for PostgreSQL startup and dump operations (default: `30`) + +### Migration behavior flags + +- `migrate.cleanupSavedSearches` + Enables automatic cleanup for data affected by issue #7398 (default: `false`) + +- `migrate.keywordTermUri` + Enables handling related to keyword term URI migration checks (default: `false`) + +### Docker execution control + +These are mostly useful for debugging or partial reruns: + +- `docker.skip` +- `docker.skipStart` +- `docker.skipDump` +- `docker.skipStop` + +## Dependencies 📦 + +This recipe depends on you to provide: + +- **Maven** +- **Docker** + +It will automatically pull in these dependencies: +- **PostgreSQL Docker image** +- **Flyway Maven Plugin** +- **Flyway PostgreSQL support** +- **PostgreSQL JDBC driver** +- **Maven SCM Plugin** +- **Docker Maven Plugin** + +It also depends on access to the Dataverse Git repository, so the main migration scripts for the configured tag can be checked out locally. + +## Support 💬 + +For issues and questions, please open an issue in this repository or discuss on the Dataverse Zulip community channels if appropriate. 
From 470376271da83656f7e09bc13330fcb14c568a38 Mon Sep 17 00:00:00 2001 From: Oliver Bertuch Date: Sat, 25 Apr 2026 23:59:31 +0200 Subject: [PATCH 04/11] docs(java,flyway): expand README with audit guidelines and safeguards for Flyway migrations --- java/update-by-flyway/README.md | 100 ++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) diff --git a/java/update-by-flyway/README.md b/java/update-by-flyway/README.md index e95ee21..2228dc7 100644 --- a/java/update-by-flyway/README.md +++ b/java/update-by-flyway/README.md @@ -46,6 +46,106 @@ Before using this recipe, make sure you have: Because all migration work happens locally in Docker, it is generally safe to experiment with a production snapshot as long as you understand that this recipe is not an official upgrade mechanism. +**Q: What about potential data migrations for metadata blocks, fields and CVs?** + +A: This tool produces a migrated database dump but does **not** load any TSV files. +The admin will perform the final TSV reload (with the target version's TSVs) as part of deploying the new Dataverse version, following the standard upgrade procedure. +The question below addresses whether skipping the *intermediate* TSV reloads (those that would have happened between the source and target versions in a release-by-release upgrade) is safe. + +We need to distinguish between "data definition migration" and "user data migration" scenarios. + +#### Data Definition Migration + +These are Flyway migrations that modify `metadatablock`, `datasetfieldtype`, `controlledvocabularyvalue`, or related definitional tables. + +- If a migration's `WHERE` clause finds the targeted rows in the source DB, it applies as intended. +- If the targeted rows aren't there (because the source DB pre-dates their introduction, or an admin already removed them), the migration silently affects zero rows. + Flyway considers this success. The end state is correct either way, because the target TSV either reintroduces what's needed or omits what's been removed. +- If a migration is written to fail loudly on missing state, we'll notice and can fix it. +- The only failure mode is a migration written too unspecifically (e.g., delete by hardcoded ID hitting an unintended row). + This is a pre-existing risk for any upgrade path, not specific to this tool. + +Update/rename migrations specifically can only target state introduced by a *previous* TSV reload, since Dataverse's upgrade process has always asked admins to reload TSVs *after* deploying, never before. +So on a too-old source DB they silently no-op, and on a sufficiently up-to-date source DB they apply normally - never silently wrong. + +#### User Data Migration + +If a Flyway migration updates user data (`datasetfield`, `datasetfieldvalue`) based on assumptions about which fields or CV values exist or have a certain state, we could be in trouble: +those assumptions may have been valid only after an intermediate TSV reload - which this tool skips. + +**Audit method**: search migrations for any reference to the metadata-block-related tables and their dependents (the pattern is intentionally broad and will produce false positives requiring manual review): + +```shell +grep -riEl '\b(metadatablock|datasetfieldtype|controlledvocabularyvalue|controlledvocabalternate|datasetfield|datasetfieldvalue|dataversefieldtypeinputlevel|dataversefacet|datasetfielddefaultvalue)\b' src/main/resources/db/migration/ +``` + +This grep covers only SQL migrations. 
If Java-based Flyway migrations are added in the future, they require separate auditing. + +**Audit results as of Dataverse 6.10.1**: + +- `V5.3.0.3__7551-expanded-compound-datasetfield-validation.sql` — modifies `datasetfieldtype.required` and `dataversefieldtypeinputlevel` based on parent/child relationships; does not touch user data. +- `V5.8.0.2__8018-invalid-characters.sql` — uniform character sanitization on `datasetfieldvalue`; no TSV-state assumptions. +- `V5.10.1.1__8533-semantic-updates.sql` — adds unique constraint on `datasetfieldtype.name`; schema-only. +- `V6.1.0.4__5645-geospatial-fieldname-fix.sql` — renames two `datasetfieldtype` rows by name; idempotent, no user-data impact. +- `V6.5.0.6.sql` / `V6.5.0.12.sql` — adds column and index on `dataversefieldtypeinputlevel`; schema-only. + +None of these update user-entered data based on assumptions about TSV-loaded state. +Schema changes, uniform sanitization, and idempotent renames only. ✅ + +#### Requirement for future migrations + +Any future migration that updates user data based on metadata-block state **must** explicitly verify its expected starting state and fail loudly if the state is absent or unexpected. +Two patterns to be aware of: + +**Existence checks** — when a migration assumes a particular field or CV value exists: + +```sql +DO $$ +BEGIN + IF NOT EXISTS (SELECT 1 FROM datasetfieldtype WHERE name = 'expectedField') THEN + RAISE EXCEPTION 'Migration prerequisite missing: datasetfieldtype "expectedField" not found. Run the previous release''s TSV load first.'; + END IF; + -- ... rest of migration ... +END $$; +``` + +**Attribute-state checks** — when a migration assumes a field/CV row has a particular attribute value (e.g., `required=true`, a specific `fieldType`, a specific `displayOrder`, membership in a particular `metadatablock`). +This is the more dangerous case: the row exists, so an existence check passes, but the migration's logic depends on an attribute that may only have been set by a previous TSV reload. +Verify the attribute explicitly: + +```sql +DO $$ +DECLARE + expected_required boolean; + expected_fieldtype text; +BEGIN + SELECT required, fieldtype INTO expected_required, expected_fieldtype + FROM datasetfieldtype WHERE name = 'expectedField'; + + IF NOT FOUND THEN + RAISE EXCEPTION 'Migration prerequisite missing: datasetfieldtype "expectedField" not found.'; + END IF; + + IF expected_required IS DISTINCT FROM true OR expected_fieldtype IS DISTINCT FROM 'TEXT' THEN + RAISE EXCEPTION 'Migration prerequisite mismatch: datasetfieldtype "expectedField" has required=%, fieldtype=%, expected required=true, fieldtype=TEXT. Run the previous release''s TSV load first.', + expected_required, expected_fieldtype; + END IF; + + -- ... migration that depends on these attributes, e.g. updating + -- datasetfieldvalue rows based on the field being required ... +END $$; +``` + +The same pattern applies to `controlledvocabularyvalue` (e.g., verifying `strvalue`, `identifier`, or `datasetfieldtype_id` before using a CV value to update user data) and to `metadatablock` (verifying `name` or block membership). + +Both check patterns protect all upgrade paths — including this tool, release-by-release upgrades, and installations where admins have forgotten a TSV reload or have diverged locally from upstream definitions. 
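For illustration, the same guard applied to a controlled vocabulary value could look like the following sketch (the field and CV value names are hypothetical):

```sql
DO $$
BEGIN
    -- Verify the CV value exists and is attached to the expected field
    -- before using it to update user data.
    IF NOT EXISTS (
        SELECT 1
        FROM controlledvocabularyvalue cvv
        JOIN datasetfieldtype dft ON dft.id = cvv.datasetfieldtype_id
        WHERE cvv.strvalue = 'expectedValue' AND dft.name = 'expectedField'
    ) THEN
        RAISE EXCEPTION 'Migration prerequisite missing: CV value "expectedValue" for "expectedField" not found. Run the previous release''s TSV load first.';
    END IF;
    -- ... migration that relies on this CV value ...
END $$;
```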
+ +#### Locally customized upstream metadata blocks + +For customized upstream metadata blocks (e.g., a modified `citation.tsv`), the risks are the same as with release-by-release upgrades: +the next TSV reload overwrites local customizations. +Admins with local customizations should diff their TSVs against upstream before running this tool and carefully reapply changes after the upgrade. + ### Recommendations 🗒️ - Always start from a reliable `pg_dump` backup From d44c29d0f7861c73af236fdef61a8af25deeb630 Mon Sep 17 00:00:00 2001 From: Oliver Bertuch Date: Tue, 28 Apr 2026 09:59:44 +0200 Subject: [PATCH 05/11] build,docs(java,flyway): add support for local migration scripts Enable running custom migrations not included in this repo very easily. --- java/update-by-flyway/README.md | 44 +++++++++++++++++++++++++++------ java/update-by-flyway/pom.xml | 7 +++++- 2 files changed, 43 insertions(+), 8 deletions(-) diff --git a/java/update-by-flyway/README.md b/java/update-by-flyway/README.md index 2228dc7..f076ae7 100644 --- a/java/update-by-flyway/README.md +++ b/java/update-by-flyway/README.md @@ -14,7 +14,8 @@ The recipe works by: 2. starting a local PostgreSQL database in Docker (Maven Docker Plugin), 3. restoring a provided database dump into that database, 4. running the Dataverse Flyway migration scripts while including the required extra migration scripts included with this recipe (Maven Flyway Plugin), -5. exporting the migrated database again as a new SQL dump. +5. running Flyway again but without the extra migrations and telling it to clean these up (which makes the dump compatible with Dataverse code again), +6. exporting the final migrated database as a new SQL dump, ready to be imported into the installation. The main output artifact is: @@ -22,6 +23,9 @@ The main output artifact is: Note: this may also help you upgrade to newer PostgreSQL versions by restoring the migrated dump. + + + ## Prerequisites ✔️ Before using this recipe, make sure you have: @@ -46,15 +50,15 @@ Before using this recipe, make sure you have: Because all migration work happens locally in Docker, it is generally safe to experiment with a production snapshot as long as you understand that this recipe is not an official upgrade mechanism. -**Q: What about potential data migrations for metadata blocks, fields and CVs?** +#### What about potential data migrations for metadata blocks, fields and CVs? -A: This tool produces a migrated database dump but does **not** load any TSV files. +This tool produces a migrated database dump but does **not** load any TSV files. The admin will perform the final TSV reload (with the target version's TSVs) as part of deploying the new Dataverse version, following the standard upgrade procedure. The question below addresses whether skipping the *intermediate* TSV reloads (those that would have happened between the source and target versions in a release-by-release upgrade) is safe. We need to distinguish between "data definition migration" and "user data migration" scenarios. -#### Data Definition Migration +##### Data Definition Migration These are Flyway migrations that modify `metadatablock`, `datasetfieldtype`, `controlledvocabularyvalue`, or related definitional tables. @@ -68,7 +72,7 @@ These are Flyway migrations that modify `metadatablock`, `datasetfieldtype`, `co Update/rename migrations specifically can only target state introduced by a *previous* TSV reload, since Dataverse's upgrade process has always asked admins to reload TSVs *after* deploying, never before. 
So on a too-old source DB they silently no-op, and on a sufficiently up-to-date source DB they apply normally - never silently wrong. -#### User Data Migration +##### User Data Migration If a Flyway migration updates user data (`datasetfield`, `datasetfieldvalue`) based on assumptions about which fields or CV values exist or have a certain state, we could be in trouble: those assumptions may have been valid only after an intermediate TSV reload - which this tool skips. @@ -92,7 +96,7 @@ This grep covers only SQL migrations. If Java-based Flyway migrations are added None of these update user-entered data based on assumptions about TSV-loaded state. Schema changes, uniform sanitization, and idempotent renames only. ✅ -#### Requirement for future migrations +##### Requirement for future migrations Any future migration that updates user data based on metadata-block state **must** explicitly verify its expected starting state and fail loudly if the state is absent or unexpected. Two patterns to be aware of: @@ -140,7 +144,7 @@ The same pattern applies to `controlledvocabularyvalue` (e.g., verifying `strval Both check patterns protect all upgrade paths — including this tool, release-by-release upgrades, and installations where admins have forgotten a TSV reload or have diverged locally from upstream definitions. -#### Locally customized upstream metadata blocks +##### Locally customized upstream metadata blocks For customized upstream metadata blocks (e.g., a modified `citation.tsv`), the risks are the same as with release-by-release upgrades: the next TSV reload overwrites local customizations. @@ -154,12 +158,18 @@ Admins with local customizations should diff their TSVs against upstream before - Keep in mind that this recipe is a practical migration aid, not an official Dataverse-supported upgrade method - Validate the migrated database before using it further in any environment + + + ## Installation instructions 🔧 1. Clone this recipe repository and change your working directory to `java/update-by-flyway`. 2. Make sure Docker is running. 3. Place your PostgreSQL dump where the recipe can read it or override the dump file location with a Maven property (see below). + + + ## Usage examples 💻 ### Basic usage @@ -244,6 +254,17 @@ At the moment, this migration mainly serves as a detection and guidance step. It checks for affected metadata values and emits notices explaining the situation. This is useful when reviewing upgrade issues around Dataverse 6.3 and related metadata handling. +### Run additional, local migrations + +In case you are migrating from a fork back to upstream code, you might want to add additional data migrations. +You may put these in a folder and point to it by Maven property: + +```bash +mvn install -Dmigrate.local=path/to/your/local/migrations +``` + + + ## Important Maven properties ⚙️ These are the most useful properties to override when running the recipe. @@ -305,6 +326,9 @@ These are the most useful properties to override when running the recipe. - `migrate.keywordTermUri` Enables handling related to keyword term URI migration checks (default: `false`) +- `migrate.local` + Point to directory with additional, local migration scripts. Skipped if it does not exist. 
(default: `${project.basedir}/local`)
+
### Docker execution control

These are mostly useful for debugging or partial reruns:
@@ -314,6 +338,9 @@ These are the most useful properties to override when running the recipe.
- `docker.skipDump`
- `docker.skipStop`

+
+
+
## Dependencies 📦

This recipe depends on you to provide:
@@ -331,6 +358,9 @@ It will automatically pull in these dependencies:

It also depends on access to the Dataverse Git repository, so the main migration scripts for the configured tag can be checked out locally.

+
+
+
## Support 💬

For issues and questions, please open an issue in this repository or discuss on the Dataverse Zulip community channels if appropriate.
diff --git a/java/update-by-flyway/pom.xml b/java/update-by-flyway/pom.xml
index 379ce98..a7e5799 100644
--- a/java/update-by-flyway/pom.xml
+++ b/java/update-by-flyway/pom.xml
@@ -51,6 +51,9 @@
 30
+ ${project.basedir}/extra-migrations
+ ${project.basedir}/local
+
 false
@@ -69,6 +72,7 @@
 tag
 ${repo.tag}
 true
+ true
@@ -278,7 +282,8 @@
 true
 filesystem:${repo.directory}/${repo.subpath}
- filesystem:extra-migrations
+ filesystem:${migrate.extras}
+ filesystem:${migrate.local}
 ${migrate.keywordTermUri}

From 6b226193f1b94f6b678184dcabd9ca65e5a578eb Mon Sep 17 00:00:00 2001
From: Oliver Bertuch
Date: Tue, 28 Apr 2026 16:39:22 +0200
Subject: [PATCH 06/11] build,docs(java,flyway): update PostgreSQL default wait time to 600s

Most production databases are huge, so you'll need a larger timeout.
Doc hints are always useful.
---
 java/update-by-flyway/README.md | 6 +++++-
 java/update-by-flyway/pom.xml | 4 ++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/java/update-by-flyway/README.md b/java/update-by-flyway/README.md
index f076ae7..a8d8df2 100644
--- a/java/update-by-flyway/README.md
+++ b/java/update-by-flyway/README.md
@@ -187,6 +187,10 @@ This will:
- apply Flyway migrations
- apply the required extra migrations shipped with this recipe
- export the migrated database as `target/migrated_db_dump.sql`
+- stop the container and remove it
+
+Note: for most production setups, the database dump is quite huge.
+Make sure to adjust `postgresql.waitForSec` to allow ample time for restoring the DB from the dump.

### Cleanup

diff --git a/java/update-by-flyway/pom.xml b/java/update-by-flyway/pom.xml
index a7e5799..14d521f 100644
--- a/java/update-by-flyway/pom.xml
+++ b/java/update-by-flyway/pom.xml
@@ -48,8 +48,8 @@
 ${project.basedir}/db_dump.sql
 sql
 ${project.build.directory}
-
- 30
+
+ 600
 ${project.basedir}/extra-migrations
 ${project.basedir}/local

From dd76147dcf0d1a3255c5626e2d0cf655b53ad55d Mon Sep 17 00:00:00 2001
From: Oliver Bertuch
Date: Tue, 28 Apr 2026 16:40:45 +0200
Subject: [PATCH 07/11] build(java,flyway): add PostgreSQL custom command to increase WAL size and checkpoint timeout

Due to the huge database dump we want to restore, the defaults are way too small.
Setting this speeds things up some.
---
 java/update-by-flyway/pom.xml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/java/update-by-flyway/pom.xml b/java/update-by-flyway/pom.xml
index 14d521f..fc67a94 100644
--- a/java/update-by-flyway/pom.xml
+++ b/java/update-by-flyway/pom.xml
@@ -45,6 +45,7 @@
 dataverse
 ${postgresql.db}
 supersecret
+ postgres -c max_wal_size=8GB -c checkpoint_timeout=30min -c checkpoint_completion_target=0.9
 ${project.basedir}/db_dump.sql
 sql
 ${project.build.directory}
@@ -99,6 +100,9 @@
 true
+
+ ${postgresql.command}
+
 custom
 migration

From 0181fcca07008a988c1abcd8ee15541c551425ea Mon Sep 17 00:00:00 2001
From: Oliver Bertuch
Date: Tue, 28 Apr 2026 16:40:55 +0200
Subject: [PATCH 08/11] docs(java,flyway): update README with non-blocking database exploration instructions for migrations

---
 java/update-by-flyway/README.md | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/java/update-by-flyway/README.md b/java/update-by-flyway/README.md
index a8d8df2..6fef238 100644
--- a/java/update-by-flyway/README.md
+++ b/java/update-by-flyway/README.md
@@ -199,6 +199,22 @@ Stop containers and clean generated files:
 mvn clean
 ```

+### Exploring
+
+If you want to examine the database contents before or after the migration without the container being stopped, you have two choices.
+
+Import the dump only, then keep the database waiting (non-blocking):
+```bash
+mvn prepare-package
+```
+
+Migrate, then keep the database waiting (non-blocking):
+```bash
+mvn package
+```
+
+The container will listen on `${postgresql.host}:${postgresql.port}`, defaulting to `localhost:15432`.

 ### Use a different Dataverse tag

 To migrate using another Dataverse release tag:
From ed8b2a1f0cbc6766ea7ca60e354efb4619ecd942 Mon Sep 17 00:00:00 2001
From: Oliver Bertuch
Date: Tue, 28 Apr 2026 22:51:33 +0200
Subject: [PATCH 09/11] fix(java,flyway): wrap EJB timer table name in quotes to drop the table

---
 .../V4.20.0.6__remove-ejb-timer.sql | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/java/update-by-flyway/extra-migrations/V4.20.0.6__remove-ejb-timer.sql b/java/update-by-flyway/extra-migrations/V4.20.0.6__remove-ejb-timer.sql
index de29aa0..3cb83b3 100644
--- a/java/update-by-flyway/extra-migrations/V4.20.0.6__remove-ejb-timer.sql
+++ b/java/update-by-flyway/extra-migrations/V4.20.0.6__remove-ejb-timer.sql
@@ -1,2 +1,2 @@
 --
-DROP TABLE IF EXISTS EJB__TIMER__TBL;
+DROP TABLE IF EXISTS "EJB__TIMER__TBL";

From 653b31be338d86c35f94fb0f7dbb8c428c83cdb7 Mon Sep 17 00:00:00 2001
From: Oliver Bertuch
Date: Thu, 14 May 2026 10:38:03 +0200
Subject: [PATCH 10/11] chore: avoid number collisions with main repo afterMigrate scripts

---
 ...ql => afterMigrate__1000-8191-check-restricted-file-terms.sql} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename java/update-by-flyway/extra-migrations/{afterMigrate__2-8191-check-restricted-file-terms.sql => afterMigrate__1000-8191-check-restricted-file-terms.sql} (100%)

diff --git a/java/update-by-flyway/extra-migrations/afterMigrate__2-8191-check-restricted-file-terms.sql b/java/update-by-flyway/extra-migrations/afterMigrate__1000-8191-check-restricted-file-terms.sql
similarity index 100%
rename from java/update-by-flyway/extra-migrations/afterMigrate__2-8191-check-restricted-file-terms.sql
rename to java/update-by-flyway/extra-migrations/afterMigrate__1000-8191-check-restricted-file-terms.sql

From 192768509195bccc0d73c1a56cbbb7c99c4c1fd7 Mon Sep 17 00:00:00 2001
From: Oliver Bertuch
Date: Thu, 14 May 2026 10:38:28 +0200
Subject: [PATCH 11/11] chore: add local dir and gitignores
---
java/update-by-flyway/.gitignore | 6 +++++- java/update-by-flyway/local/.gitkeep | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 java/update-by-flyway/local/.gitkeep diff --git a/java/update-by-flyway/.gitignore b/java/update-by-flyway/.gitignore index 4352e10..9a7b13c 100644 --- a/java/update-by-flyway/.gitignore +++ b/java/update-by-flyway/.gitignore @@ -1,3 +1,7 @@ .flattened-pom.xml target/ -/*.sql \ No newline at end of file + +# Exclude dumps that may be around here +/*.sql +# Don't keep track of local extra migrations +/local/*.sql diff --git a/java/update-by-flyway/local/.gitkeep b/java/update-by-flyway/local/.gitkeep new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/java/update-by-flyway/local/.gitkeep @@ -0,0 +1 @@ +