package controllers

import actions.ApiSecurityAction
import jakarta.inject.{Inject, Singleton}
import models.api.haplogroups.*
import play.api.Logger
import play.api.libs.json.Json
import play.api.mvc.{Action, BaseController, ControllerComponents, Result}
import services.HaplogroupTreeMergeService

import scala.concurrent.ExecutionContext
import scala.util.control.NonFatal

/**
 * API controller for haplogroup tree merge operations.
 * Secured with X-API-Key authentication via [[ApiSecurityAction]].
 *
 * Endpoints:
 *   - POST /api/v1/manage/haplogroups/merge          - Full tree merge
 *   - POST /api/v1/manage/haplogroups/merge/subtree  - Subtree merge under an anchor
 *   - POST /api/v1/manage/haplogroups/merge/preview  - Preview merge without changes
 */
@Singleton
class HaplogroupTreeMergeController @Inject()(
  val controllerComponents: ControllerComponents,
  secureApi: ApiSecurityAction,
  mergeService: HaplogroupTreeMergeService
)(implicit ec: ExecutionContext) extends BaseController {

  private val logger = Logger(this.getClass)

  /**
   * Shared failure handling for the two mutating merge endpoints.
   * Validation failures (IllegalArgumentException, e.g. unknown anchor) map to
   * 400; any other non-fatal error maps to 500. Fatal errors (OOM, etc.)
   * propagate — we deliberately do not catch Throwable.
   */
  private def recoverMergeErrors(opName: String): PartialFunction[Throwable, Result] = {
    case e: IllegalArgumentException =>
      logger.warn(s"$opName validation failed: ${e.getMessage}")
      BadRequest(Json.obj(
        "success" -> false,
        "message" -> e.getMessage,
        "errors" -> List(e.getMessage)
      ))
    case NonFatal(e) =>
      logger.error(s"$opName failed: ${e.getMessage}", e)
      InternalServerError(Json.obj(
        "success" -> false,
        "message" -> "Merge operation failed",
        "errors" -> List(e.getMessage)
      ))
  }

  /** 200 when the service reports success, 400 otherwise; body is the full response either way. */
  private def renderMergeResponse(response: TreeMergeResponse): Result =
    if (response.success) Ok(Json.toJson(response)) else BadRequest(Json.toJson(response))

  /**
   * Merge a full haplogroup tree into the existing tree for the given type.
   * NOTE(review): the merge service creates/updates nodes but never deletes
   * them, so this is additive ("merge"), not a replacement — confirm intent.
   *
   * Request body: TreeMergeRequest
   *   - haplogroupType: "Y" or "MT"
   *   - sourceTree: nested PhyloNodeInput tree structure
   *   - sourceName: attribution source (e.g., "ytree.net", "ISOGG")
   *   - priorityConfig: optional source priority ordering
   *   - conflictStrategy: optional conflict resolution strategy
   *   - dryRun: if true, simulates merge without applying changes
   */
  def mergeFullTree(): Action[TreeMergeRequest] =
    secureApi.jsonAction[TreeMergeRequest].async { request =>
      logger.info(s"API: Full tree merge for ${request.body.haplogroupType} from ${request.body.sourceName}" +
        (if (request.body.dryRun) " (dry run)" else ""))

      mergeService.mergeFullTree(request.body)
        .map(renderMergeResponse)
        // Unified with mergeSubtree: validation errors now yield 400, not 500.
        .recover(recoverMergeErrors("Tree merge"))
    }

  /**
   * Merge a subtree under a specific anchor haplogroup.
   *
   * Request body: SubtreeMergeRequest
   *   - haplogroupType: "Y" or "MT"
   *   - anchorHaplogroupName: name of the haplogroup to merge under
   *   - sourceTree: nested PhyloNodeInput tree structure
   *   - sourceName: attribution source
   *   - priorityConfig: optional source priority ordering
   *   - conflictStrategy: optional conflict resolution strategy
   *   - dryRun: if true, simulates merge without applying changes
   */
  def mergeSubtree(): Action[SubtreeMergeRequest] =
    secureApi.jsonAction[SubtreeMergeRequest].async { request =>
      logger.info(s"API: Subtree merge under ${request.body.anchorHaplogroupName} " +
        s"for ${request.body.haplogroupType} from ${request.body.sourceName}" +
        (if (request.body.dryRun) " (dry run)" else ""))

      mergeService.mergeSubtree(request.body)
        .map(renderMergeResponse)
        .recover(recoverMergeErrors("Subtree merge"))
    }

  /**
   * Preview a merge operation without applying changes.
   * The error shape here ("error"/"details") intentionally differs from the
   * mutating endpoints and is preserved for backward compatibility.
   *
   * Request body: MergePreviewRequest
   *   - haplogroupType: "Y" or "MT"
   *   - anchorHaplogroupName: optional anchor for subtree preview
   *   - sourceTree: nested PhyloNodeInput tree structure
   *   - sourceName: attribution source
   *   - priorityConfig: optional source priority ordering
   */
  def previewMerge(): Action[MergePreviewRequest] =
    secureApi.jsonAction[MergePreviewRequest].async { request =>
      logger.info(s"API: Preview merge for ${request.body.haplogroupType} from ${request.body.sourceName}" +
        request.body.anchorHaplogroupName.map(a => s" under $a").getOrElse(""))

      mergeService.previewMerge(request.body).map { response =>
        Ok(Json.toJson(response))
      }.recover { case NonFatal(e) =>
        logger.error(s"Merge preview failed: ${e.getMessage}", e)
        InternalServerError(Json.obj(
          "error" -> "Preview operation failed",
          "details" -> e.getMessage
        ))
      }
    }
}
Ok(views.html.fragments.snpDetailSidebar(haplogroupName, snps, provenance)) } } diff --git a/app/models/HaplogroupType.scala b/app/models/HaplogroupType.scala index 4949cb6..e35d415 100644 --- a/app/models/HaplogroupType.scala +++ b/app/models/HaplogroupType.scala @@ -1,5 +1,6 @@ package models +import play.api.libs.json.{Format, Reads, Writes} import play.api.mvc.QueryStringBindable /** @@ -33,6 +34,15 @@ object HaplogroupType { case _ => None } + // JSON serialization + implicit val reads: Reads[HaplogroupType] = Reads.StringReads.map { str => + fromString(str).getOrElse(throw new IllegalArgumentException(s"Invalid HaplogroupType: $str")) + } + + implicit val writes: Writes[HaplogroupType] = Writes.StringWrites.contramap(_.toString) + + implicit val format: Format[HaplogroupType] = Format(reads, writes) + implicit val queryStringBindable: QueryStringBindable[HaplogroupType] = new QueryStringBindable[HaplogroupType] { def bind(key: String, params: Map[String, Seq[String]]): Option[Either[String, HaplogroupType]] = { diff --git a/app/models/api/haplogroups/TreeMergeModels.scala b/app/models/api/haplogroups/TreeMergeModels.scala new file mode 100644 index 0000000..1c09bd3 --- /dev/null +++ b/app/models/api/haplogroups/TreeMergeModels.scala @@ -0,0 +1,253 @@ +package models.api.haplogroups + +import models.HaplogroupType +import play.api.libs.json.{Format, Json, OFormat, Reads, Writes} + +/** + * API DTOs for Haplogroup Tree Merge operations. + * + * Supports merging external haplogroup trees from sources like ISOGG, ytree.net, + * and other researchers into the DecodingUs baseline tree. + */ + +// ============================================================================ +// Input Tree Structure +// ============================================================================ + +/** + * A variant with its primary name and optional aliases. + * Aliases represent alternative names for the same SNP from different labs/sources. 
/**
 * A variant with its primary name and optional aliases — alternative names for
 * the same SNP from different labs/sources.
 * Example: M207 (primary) with aliases Page37, UTY2.
 */
case class VariantInput(
  name: String,
  aliases: List[String] = List.empty
)

object VariantInput {
  implicit val format: OFormat[VariantInput] = Json.format[VariantInput]
}

/**
 * A node in the input phylogenetic tree for merging.
 * Matching against the existing tree is done by variants, not names, to
 * tolerate different naming conventions across sources.
 */
case class PhyloNodeInput(
  name: String,
  variants: List[VariantInput] = List.empty,
  formedYbp: Option[Int] = None,
  formedYbpLower: Option[Int] = None,
  formedYbpUpper: Option[Int] = None,
  tmrcaYbp: Option[Int] = None,
  tmrcaYbpLower: Option[Int] = None,
  tmrcaYbpUpper: Option[Int] = None,
  children: List[PhyloNodeInput] = List.empty
)

object PhyloNodeInput {
  // `lazy` is required: PhyloNodeInput is recursive (children refer back to the
  // type), and the macro-generated format resolves the very implicit being
  // defined. Play JSON's documentation mandates a lazy val for recursive types;
  // a strict val risks a null/forward-reference at initialization time.
  implicit lazy val format: OFormat[PhyloNodeInput] = Json.format[PhyloNodeInput]
}

/**
 * Source priority ordering for merges: lower index = higher priority.
 * Sources absent from the list fall back to `defaultPriority`.
 */
case class SourcePriorityConfig(
  sourcePriorities: List[String],
  defaultPriority: Int = 100
)

object SourcePriorityConfig {
  implicit val format: OFormat[SourcePriorityConfig] = Json.format[SourcePriorityConfig]
}
/**
 * Strategy for resolving conflicting field values during a merge:
 *  - HigherPriorityWins: the source with the better (lower-index) priority wins
 *  - KeepExisting: existing values are never overwritten
 *  - AlwaysUpdate: incoming values always overwrite existing ones
 */
sealed trait ConflictStrategy

object ConflictStrategy {
  case object HigherPriorityWins extends ConflictStrategy
  case object KeepExisting extends ConflictStrategy
  case object AlwaysUpdate extends ConflictStrategy

  // Local import keeps this change self-contained within the companion.
  import play.api.libs.json.{JsError, JsString, JsSuccess}

  // Return JsError for unknown values instead of throwing: a throwing Reads
  // bypasses Play's validation and surfaces as a 500 rather than a 400.
  implicit val reads: Reads[ConflictStrategy] = Reads {
    case JsString("higher_priority_wins") => JsSuccess(HigherPriorityWins)
    case JsString("keep_existing")        => JsSuccess(KeepExisting)
    case JsString("always_update")        => JsSuccess(AlwaysUpdate)
    case JsString(other)                  => JsError(s"Unknown conflict strategy: $other")
    case other                            => JsError(s"Expected JSON string for ConflictStrategy, got $other")
  }

  implicit val writes: Writes[ConflictStrategy] = Writes.StringWrites.contramap {
    case HigherPriorityWins => "higher_priority_wins"
    case KeepExisting       => "keep_existing"
    case AlwaysUpdate       => "always_update"
  }

  implicit val format: Format[ConflictStrategy] = Format(reads, writes)
}

// ============================================================================
// Request DTOs
// ============================================================================

/**
 * Request for a full tree merge for one haplogroup type (Y or MT).
 */
case class TreeMergeRequest(
  haplogroupType: HaplogroupType,
  sourceTree: PhyloNodeInput,
  sourceName: String,
  priorityConfig: Option[SourcePriorityConfig] = None,
  conflictStrategy: Option[ConflictStrategy] = None,
  dryRun: Boolean = false
)

object TreeMergeRequest {
  implicit val format: OFormat[TreeMergeRequest] = Json.format[TreeMergeRequest]
}

/**
 * Request for a subtree merge beneath a specific existing anchor haplogroup.
 */
case class SubtreeMergeRequest(
  haplogroupType: HaplogroupType,
  anchorHaplogroupName: String,
  sourceTree: PhyloNodeInput,
  sourceName: String,
  priorityConfig: Option[SourcePriorityConfig] = None,
  conflictStrategy: Option[ConflictStrategy] = None,
  dryRun: Boolean = false
)

object SubtreeMergeRequest {
  implicit val format: OFormat[SubtreeMergeRequest] = Json.format[SubtreeMergeRequest]
}
/**
 * Request to preview a merge without applying changes. When
 * anchorHaplogroupName is present the preview is scoped to that subtree;
 * otherwise the whole tree is previewed.
 */
case class MergePreviewRequest(
  haplogroupType: HaplogroupType,
  anchorHaplogroupName: Option[String] = None,
  sourceTree: PhyloNodeInput,
  sourceName: String,
  priorityConfig: Option[SourcePriorityConfig] = None
)

object MergePreviewRequest {
  implicit val format: OFormat[MergePreviewRequest] = Json.format[MergePreviewRequest]
}

// ============================================================================
// Response DTOs
// ============================================================================

/** Counters describing what a merge did (or, for previews/dry runs, would do). */
case class MergeStatistics(
  nodesProcessed: Int,
  nodesCreated: Int,
  nodesUpdated: Int,
  nodesUnchanged: Int,
  variantsAdded: Int,
  variantsUpdated: Int,
  relationshipsCreated: Int,
  relationshipsUpdated: Int,
  splitOperations: Int = 0
)

object MergeStatistics {
  implicit val format: OFormat[MergeStatistics] = Json.format[MergeStatistics]

  /** All-zero statistics (splitOperations defaults to 0). */
  val empty: MergeStatistics = MergeStatistics(0, 0, 0, 0, 0, 0, 0, 0)

  /** Field-wise sum of two statistics records. */
  def combine(a: MergeStatistics, b: MergeStatistics): MergeStatistics =
    MergeStatistics(
      nodesProcessed       = a.nodesProcessed + b.nodesProcessed,
      nodesCreated         = a.nodesCreated + b.nodesCreated,
      nodesUpdated         = a.nodesUpdated + b.nodesUpdated,
      nodesUnchanged       = a.nodesUnchanged + b.nodesUnchanged,
      variantsAdded        = a.variantsAdded + b.variantsAdded,
      variantsUpdated      = a.variantsUpdated + b.variantsUpdated,
      relationshipsCreated = a.relationshipsCreated + b.relationshipsCreated,
      relationshipsUpdated = a.relationshipsUpdated + b.relationshipsUpdated,
      splitOperations      = a.splitOperations + b.splitOperations
    )
}
/**
 * One conflict encountered during a merge: two sources disagreed on `field`
 * of the named haplogroup; `resolution` records which value was kept.
 */
case class MergeConflict(
  haplogroupName: String,
  field: String,
  existingValue: String,
  newValue: String,
  resolution: String,
  existingSource: String,
  newSource: String
)

object MergeConflict {
  implicit val format: OFormat[MergeConflict] = Json.format[MergeConflict]
}

/**
 * One branch-split performed during a merge: a new intermediate node was
 * inserted under `parentName`, taking over some variants and children.
 */
case class SplitOperation(
  parentName: String,
  newIntermediateName: String,
  variantsRedistributed: List[String],
  childrenReassigned: List[String],
  source: String
)

object SplitOperation {
  implicit val format: OFormat[SplitOperation] = Json.format[SplitOperation]
}

/** Outcome of a merge operation, including statistics, conflicts, and splits. */
case class TreeMergeResponse(
  success: Boolean,
  message: String,
  statistics: MergeStatistics,
  conflicts: List[MergeConflict] = List.empty,
  splits: List[SplitOperation] = List.empty,
  errors: List[String] = List.empty
)

object TreeMergeResponse {
  implicit val format: OFormat[TreeMergeResponse] = Json.format[TreeMergeResponse]

  /** Convenience constructor for a failed merge with empty statistics. */
  def failure(message: String, errors: List[String] = List.empty): TreeMergeResponse =
    TreeMergeResponse(
      success = false,
      message = message,
      statistics = MergeStatistics.empty,
      errors = errors
    )
}
+ */ +case class MergePreviewResponse( + statistics: MergeStatistics, + conflicts: List[MergeConflict], + splits: List[SplitOperation], + newNodes: List[String], + updatedNodes: List[String], + unchangedNodes: List[String] +) + +object MergePreviewResponse { + implicit val format: OFormat[MergePreviewResponse] = Json.format[MergePreviewResponse] +} diff --git a/app/models/dal/MyPostgresProfile.scala b/app/models/dal/MyPostgresProfile.scala index fea3530..36b675c 100644 --- a/app/models/dal/MyPostgresProfile.scala +++ b/app/models/dal/MyPostgresProfile.scala @@ -276,7 +276,8 @@ trait MyPostgresProfile extends ExPostgresProfile case None => JsNull }, { jsValue => - if (jsValue == JsNull || (jsValue.isInstanceOf[JsObject] && jsValue.as[JsObject].value.isEmpty)) None + // Handle database NULL (Java null), JSON null, or empty object + if (jsValue == null || jsValue == JsNull || (jsValue.isInstanceOf[JsObject] && jsValue.as[JsObject].value.isEmpty)) None else Some(jsValue.as[IdentityVerification]) } ) @@ -290,7 +291,8 @@ trait MyPostgresProfile extends ExPostgresProfile case None => JsNull }, { jsValue => - if (jsValue == JsNull || (jsValue.isInstanceOf[JsObject] && jsValue.as[JsObject].value.isEmpty)) None + // Handle database NULL (Java null), JSON null, or empty object + if (jsValue == null || jsValue == JsNull || (jsValue.isInstanceOf[JsObject] && jsValue.as[JsObject].value.isEmpty)) None else Some(jsValue.as[ManualOverride]) } ) @@ -304,12 +306,21 @@ trait MyPostgresProfile extends ExPostgresProfile case None => JsNull }, { jsValue => - if (jsValue == JsNull) None + // Handle database NULL (Java null) or JSON null + if (jsValue == null || jsValue == JsNull) None else Some(jsValue.as[Seq[AuditEntry]]) } ) } + // --- Haplogroup Provenance JSONB Type Mapper --- + // Maps HaplogroupProvenance directly to JsValue. For nullable columns, use column[Option[HaplogroupProvenance]] + // and Slick will handle NULL automatically. 
+ import models.domain.haplogroups.HaplogroupProvenance + + implicit val haplogroupProvenanceJsonbTypeMapper: JdbcType[HaplogroupProvenance] with BaseTypedType[HaplogroupProvenance] = + MappedJdbcType.base[HaplogroupProvenance, JsValue](Json.toJson(_), _.as[HaplogroupProvenance]) + // Declare the name of an aggregate function: val ArrayAgg = new SqlAggregateFunction("array_agg") diff --git a/app/models/dal/domain/haplogroups/HaplogroupsTable.scala b/app/models/dal/domain/haplogroups/HaplogroupsTable.scala index 1a71964..ab8e018 100644 --- a/app/models/dal/domain/haplogroups/HaplogroupsTable.scala +++ b/app/models/dal/domain/haplogroups/HaplogroupsTable.scala @@ -2,7 +2,7 @@ package models.dal.domain.haplogroups import models.HaplogroupType import models.dal.MyPostgresProfile.api.* -import models.domain.haplogroups.Haplogroup +import models.domain.haplogroups.{Haplogroup, HaplogroupProvenance} import slick.ast.TypedType import slick.lifted.{MappedProjection, ProvenShape} @@ -71,8 +71,11 @@ class HaplogroupsTable(tag: Tag) extends Table[Haplogroup](tag, Some("tree"), "h def ageEstimateSource = column[Option[String]]("age_estimate_source") + // Multi-source provenance tracking (JSONB) + def provenance = column[Option[HaplogroupProvenance]]("provenance") + def * = ( haplogroupId.?, name, lineage, description, haplogroupType, revisionId, source, confidenceLevel, validFrom, validUntil, - formedYbp, formedYbpLower, formedYbpUpper, tmrcaYbp, tmrcaYbpLower, tmrcaYbpUpper, ageEstimateSource + formedYbp, formedYbpLower, formedYbpUpper, tmrcaYbp, tmrcaYbpLower, tmrcaYbpUpper, ageEstimateSource, provenance ).mapTo[Haplogroup] } diff --git a/app/models/domain/haplogroups/Haplogroup.scala b/app/models/domain/haplogroups/Haplogroup.scala index 7b0ce5a..d177ffe 100644 --- a/app/models/domain/haplogroups/Haplogroup.scala +++ b/app/models/domain/haplogroups/Haplogroup.scala @@ -80,7 +80,8 @@ case class Haplogroup( tmrcaYbp: Option[Int] = None, tmrcaYbpLower: Option[Int] = None, 
package models.domain.haplogroups

import play.api.libs.json.{Format, Json, OFormat, Reads, Writes}

import java.time.LocalDateTime
import scala.collection.immutable.SortedMap

/**
 * Tracks the provenance of a haplogroup node and its variants from multiple sources.
 *
 * Credit assignment follows a tiered model:
 *  - ISOGG credit is preserved on existing nodes (authoritative backbone)
 *  - Incoming sources get credit for new splits and terminal branches they contribute
 *
 * @param primaryCredit     source with primary discovery credit for this node
 * @param nodeProvenance    all sources that have contributed to this node's existence
 * @param variantProvenance per-variant source attribution (variant name -> set of sources)
 * @param lastMergedAt      timestamp of the most recent merge affecting this node
 * @param lastMergedFrom    source of the most recent merge operation
 */
case class HaplogroupProvenance(
  primaryCredit: String,
  nodeProvenance: Set[String] = Set.empty,
  variantProvenance: Map[String, Set[String]] = Map.empty,
  lastMergedAt: Option[LocalDateTime] = None,
  lastMergedFrom: Option[String] = None
) {

  /** Record that `source` has contributed to this node's existence. */
  def addNodeSource(source: String): HaplogroupProvenance =
    copy(nodeProvenance = nodeProvenance + source)

  /** Record that `source` attests the given variant on this node. */
  def addVariantSource(variantName: String, source: String): HaplogroupProvenance =
    copy(variantProvenance = variantProvenance.updatedWith(variantName) {
      case Some(sources) => Some(sources + source)
      case None          => Some(Set(source))
    })

  /**
   * Merge another provenance record into this one, unioning all sources.
   * This record's primaryCredit is preserved; lastMergedAt is the later of the
   * two; lastMergedFrom prefers `other` when it has one.
   */
  def merge(other: HaplogroupProvenance): HaplogroupProvenance = {
    val mergedVariants = (variantProvenance.keySet ++ other.variantProvenance.keySet).map { key =>
      key -> (variantProvenance.getOrElse(key, Set.empty) ++ other.variantProvenance.getOrElse(key, Set.empty))
    }.toMap

    HaplogroupProvenance(
      primaryCredit = this.primaryCredit, // preserve existing primary credit
      nodeProvenance = nodeProvenance ++ other.nodeProvenance,
      variantProvenance = mergedVariants,
      lastMergedAt = Seq(lastMergedAt, other.lastMergedAt).flatten.maxOption,
      lastMergedFrom = other.lastMergedFrom.orElse(lastMergedFrom)
    )
  }

  /** Stamp the record with the latest merge's source and timestamp. */
  def withMergeInfo(source: String, timestamp: LocalDateTime): HaplogroupProvenance =
    copy(lastMergedAt = Some(timestamp), lastMergedFrom = Some(source))
}

object HaplogroupProvenance {
  // Sets serialize as JSON arrays. Elements are written in sorted order so the
  // stored JSONB is deterministic for identical content (stable diffs, and
  // equal records compare equal at the JSON level). Reading accepts any order
  // and de-duplicates via toSet, so the fix is read-compatible.
  implicit val setStringFormat: Format[Set[String]] = Format(
    Reads.seq[String].map(_.toSet),
    Writes.seq[String].contramap(_.toSeq.sorted)
  )

  // Map keys are emitted in sorted order for the same determinism reason.
  implicit val mapStringSetFormat: Format[Map[String, Set[String]]] = Format(
    Reads.map[Set[String]],
    Writes.map[Set[String]].contramap(m => SortedMap.from(m))
  )

  implicit val format: OFormat[HaplogroupProvenance] = Json.format[HaplogroupProvenance]

  // Sentinel used before any source has claimed credit.
  val empty: HaplogroupProvenance = HaplogroupProvenance(primaryCredit = "")

  /** Initial provenance for a node newly created from `source`. */
  def forNewNode(source: String, variants: Seq[String] = Seq.empty): HaplogroupProvenance =
    HaplogroupProvenance(
      primaryCredit = source,
      nodeProvenance = Set(source),
      variantProvenance = variants.map(v => v -> Set(source)).toMap,
      lastMergedAt = Some(LocalDateTime.now()),
      lastMergedFrom = Some(source)
    )

  /** True when the existing credit is ISOGG (case-insensitive) and must be preserved. */
  def shouldPreserveCredit(existingCredit: String): Boolean =
    existingCredit.equalsIgnoreCase("ISOGG")
}
+ * + * @param id the haplogroup ID + * @param provenance the new provenance data + * @return true if updated successfully + */ + def updateProvenance(id: Int, provenance: HaplogroupProvenance): Future[Boolean] + + /** + * Get all haplogroups of a type with their associated variant names. + * Used for building variant-based lookup index for merge operations. + * + * @param haplogroupType the type of haplogroup (Y or MT) + * @return sequence of tuples: (haplogroup, list of variant names) + */ + def getAllWithVariantNames(haplogroupType: HaplogroupType): Future[Seq[(Haplogroup, Seq[String])]] } class HaplogroupCoreRepositoryImpl @Inject()( @@ -413,4 +433,40 @@ class HaplogroupCoreRepositoryImpl @Inject()( runQuery(query) } + + // === Tree Merge Methods Implementation === + + override def updateProvenance(id: Int, provenance: HaplogroupProvenance): Future[Boolean] = { + runQuery( + haplogroups + .filter(_.haplogroupId === id) + .map(_.provenance) + .update(Some(provenance)) + ).map(_ > 0) + } + + override def getAllWithVariantNames(haplogroupType: HaplogroupType): Future[Seq[(Haplogroup, Seq[String])]] = { + import models.dal.DatabaseSchema.domain.haplogroups.haplogroupVariants + import models.dal.DatabaseSchema.domain.genomics.variants + + // Query haplogroups with their associated variant names via join + val query = for { + hg <- activeHaplogroups.filter(_.haplogroupType === haplogroupType) + } yield hg + + runQuery(query.result).flatMap { hgList => + // For each haplogroup, fetch its variant names (using commonName from Variant table) + val futures = hgList.map { hg => + val variantQuery = for { + hv <- haplogroupVariants.filter(_.haplogroupId === hg.id.get) + v <- variants.filter(_.variantId === hv.variantId) + } yield v.commonName + + runQuery(variantQuery.result).map { variantNames => + (hg, variantNames.flatten) // Filter out None values + } + } + Future.sequence(futures) + } + } } diff --git a/app/repositories/HaplogroupVariantRepository.scala 
  /**
   * Link a variant to a haplogroup, ignoring duplicates.
   *
   * Uses raw SQL for PostgreSQL's ON CONFLICT ... DO NOTHING so that re-merging
   * the same tree is idempotent: inserting an already-linked
   * (haplogroup_id, variant_id) pair is a no-op instead of a unique-constraint
   * failure from the previous typed insert.
   *
   * Returns the number of rows inserted: 1 for a new link, 0 when the pair
   * already existed (DO NOTHING) — callers must not treat 0 as an error.
   *
   * NOTE(review): raw SQL bypasses Slick's table metadata, so the unqualified
   * name `haplogroup_variant` must resolve on the connection's search_path —
   * HaplogroupsTable lives in schema "tree"; confirm this table's schema.
   * NOTE(review): assumes a unique constraint on (haplogroup_id, variant_id) —
   * ON CONFLICT requires one to exist.
   */
  override def addVariantToHaplogroup(haplogroupId: Int, variantId: Int): Future[Int] = {
    val insertAction = sqlu"""
      INSERT INTO haplogroup_variant (haplogroup_id, variant_id)
      VALUES ($haplogroupId, $variantId)
      ON CONFLICT (haplogroup_id, variant_id) DO NOTHING
    """
    runQuery(insertAction)
  }
/**
 * Service for merging external haplogroup trees into the DecodingUs baseline tree.
 *
 * Key features:
 *  - Variant-based matching: nodes are matched by their defining variants, not
 *    names, to handle different naming conventions across sources
 *    (ytree.net, ISOGG, researchers)
 *  - Credit assignment: ISOGG credit preserved on existing nodes; incoming
 *    sources get credit for new splits and terminal branches they contribute
 *  - Multi-source provenance: full attribution tracking via a JSONB column
 *  - Branch split detection: identifies when incoming data reveals finer tree structure
 */
@Singleton
class HaplogroupTreeMergeService @Inject()(
  haplogroupRepository: HaplogroupCoreRepository,
  haplogroupVariantRepository: HaplogroupVariantRepository,
  variantRepository: VariantRepository,
  variantAliasRepository: VariantAliasRepository
)(implicit ec: ExecutionContext) extends Logging {

  // ==========================================================================
  // VariantInput helpers
  // ==========================================================================

  /** Every name a single variant is known by: primary name first, then aliases. */
  private def allVariantNames(variant: VariantInput): List[String] =
    variant.name :: variant.aliases

  /** Every name (primary + aliases) across a list of variants. */
  private def allVariantNames(variants: List[VariantInput]): List[String] =
    variants.flatMap(v => v.name :: v.aliases)

  /** Primary names only; aliases excluded. */
  private def primaryVariantNames(variants: List[VariantInput]): List[String] =
    variants.map(_.name)
+ */ + def mergeFullTree(request: TreeMergeRequest): Future[TreeMergeResponse] = { + if (request.dryRun) { + previewMerge(MergePreviewRequest( + haplogroupType = request.haplogroupType, + anchorHaplogroupName = None, + sourceTree = request.sourceTree, + sourceName = request.sourceName, + priorityConfig = request.priorityConfig + )).map(preview => TreeMergeResponse( + success = true, + message = "Dry run completed successfully", + statistics = preview.statistics, + conflicts = preview.conflicts, + splits = preview.splits + )) + } else { + performMerge( + haplogroupType = request.haplogroupType, + anchorId = None, + sourceTree = request.sourceTree, + sourceName = request.sourceName, + priorityConfig = request.priorityConfig.getOrElse(SourcePriorityConfig(List.empty)), + conflictStrategy = request.conflictStrategy.getOrElse(ConflictStrategy.HigherPriorityWins) + ) + } + } + + /** + * Merge a subtree under a specific anchor haplogroup. + */ + def mergeSubtree(request: SubtreeMergeRequest): Future[TreeMergeResponse] = { + if (request.dryRun) { + previewMerge(MergePreviewRequest( + haplogroupType = request.haplogroupType, + anchorHaplogroupName = Some(request.anchorHaplogroupName), + sourceTree = request.sourceTree, + sourceName = request.sourceName, + priorityConfig = request.priorityConfig + )).map(preview => TreeMergeResponse( + success = true, + message = "Dry run completed successfully", + statistics = preview.statistics, + conflicts = preview.conflicts, + splits = preview.splits + )) + } else { + for { + // Find the anchor haplogroup + anchorOpt <- haplogroupRepository.getHaplogroupByName(request.anchorHaplogroupName, request.haplogroupType) + anchor = anchorOpt.getOrElse( + throw new IllegalArgumentException(s"Anchor haplogroup '${request.anchorHaplogroupName}' not found") + ) + + result <- performMerge( + haplogroupType = request.haplogroupType, + anchorId = anchor.id, + sourceTree = request.sourceTree, + sourceName = request.sourceName, + priorityConfig = 
request.priorityConfig.getOrElse(SourcePriorityConfig(List.empty)), + conflictStrategy = request.conflictStrategy.getOrElse(ConflictStrategy.HigherPriorityWins) + ) + } yield result + } + } + + /** + * Preview merge without applying changes. + */ + def previewMerge(request: MergePreviewRequest): Future[MergePreviewResponse] = { + for { + // Build variant-based index of existing haplogroups + existingIndex <- buildVariantIndex(request.haplogroupType) + + // Simulate the merge to collect statistics + preview <- simulateMerge( + sourceTree = request.sourceTree, + sourceName = request.sourceName, + existingIndex = existingIndex, + priorityConfig = request.priorityConfig.getOrElse(SourcePriorityConfig(List.empty)) + ) + } yield preview + } + + // ============================================================================ + // Private Implementation + // ============================================================================ + + /** + * Build an index of existing haplogroups by their variant names. + * This enables variant-based matching across different naming conventions. + */ + private def buildVariantIndex(haplogroupType: HaplogroupType): Future[VariantIndex] = { + haplogroupRepository.getAllWithVariantNames(haplogroupType).map { haplogroupsWithVariants => + val variantToHaplogroup = haplogroupsWithVariants.flatMap { case (hg, variants) => + variants.map(v => v.toUpperCase -> hg) + }.groupMap(_._1)(_._2) + + val haplogroupByName = haplogroupsWithVariants.map { case (hg, _) => + hg.name.toUpperCase -> hg + }.toMap + + VariantIndex(variantToHaplogroup, haplogroupByName) + } + } + + /** + * Perform the actual merge operation. 
+ */ + private def performMerge( + haplogroupType: HaplogroupType, + anchorId: Option[Int], + sourceTree: PhyloNodeInput, + sourceName: String, + priorityConfig: SourcePriorityConfig, + conflictStrategy: ConflictStrategy + ): Future[TreeMergeResponse] = { + val now = LocalDateTime.now() + val context = MergeContext( + haplogroupType = haplogroupType, + sourceName = sourceName, + priorityConfig = priorityConfig, + conflictStrategy = conflictStrategy, + timestamp = now + ) + + for { + // Build variant-based index + existingIndex <- buildVariantIndex(haplogroupType) + + // Perform recursive merge + result <- mergeNode( + node = sourceTree, + parentId = anchorId, + context = context, + index = existingIndex, + accumulator = MergeAccumulator.empty + ) + } yield TreeMergeResponse( + success = result.errors.isEmpty, + message = if (result.errors.isEmpty) "Merge completed successfully" else "Merge completed with errors", + statistics = result.statistics, + conflicts = result.conflicts, + splits = result.splits, + errors = result.errors + ) + } + + /** + * Recursively merge a node and its children. + */ + private def mergeNode( + node: PhyloNodeInput, + parentId: Option[Int], + context: MergeContext, + index: VariantIndex, + accumulator: MergeAccumulator + ): Future[MergeAccumulator] = { + // Try to find existing haplogroup by variants first, then by name + val existingMatch = findExistingMatch(node, index) + + existingMatch match { + case Some(existing) => + // Node exists - check for updates or splits + mergeExistingNode(node, existing, parentId, context, index, accumulator) + + case None => + // New node - create it + createNewNode(node, parentId, context, index, accumulator) + } + } + + /** + * Find an existing haplogroup that matches the input node. + * Primary matching is by variants (including aliases); fallback is by name. 
+ */ + private def findExistingMatch(node: PhyloNodeInput, index: VariantIndex): Option[Haplogroup] = { + // First try variant-based matching - check primary names and all aliases + val allNames = allVariantNames(node.variants) + val variantMatches = allNames + .flatMap(v => index.variantToHaplogroup.getOrElse(v.toUpperCase, Seq.empty)) + .groupBy(identity) + .view.mapValues(_.size) + .toSeq + .sortBy(-_._2) // Sort by match count descending + + // Find haplogroup with most variant matches (>= 1) + variantMatches.headOption.filter(_._2 >= 1).map(_._1).orElse { + // Fallback: match by name + index.haplogroupByName.get(node.name.toUpperCase) + } + } + + /** + * Merge an input node with an existing haplogroup. + */ + private def mergeExistingNode( + node: PhyloNodeInput, + existing: Haplogroup, + parentId: Option[Int], + context: MergeContext, + index: VariantIndex, + accumulator: MergeAccumulator + ): Future[MergeAccumulator] = { + val conflicts = scala.collection.mutable.ListBuffer.empty[MergeConflict] + + // Check for field conflicts + val existingSource = existing.provenance.map(_.primaryCredit).getOrElse(existing.source) + + // Determine if we should update based on conflict strategy + val shouldUpdate = context.conflictStrategy match { + case ConflictStrategy.AlwaysUpdate => true + case ConflictStrategy.KeepExisting => false + case ConflictStrategy.HigherPriorityWins => + getPriority(context.sourceName, context.priorityConfig) < + getPriority(existingSource, context.priorityConfig) + } + + // Check for age estimate conflicts + if (node.formedYbp.isDefined && existing.formedYbp.isDefined && + node.formedYbp != existing.formedYbp) { + conflicts += MergeConflict( + haplogroupName = existing.name, + field = "formedYbp", + existingValue = existing.formedYbp.get.toString, + newValue = node.formedYbp.get.toString, + resolution = if (shouldUpdate) "updated" else "kept_existing", + existingSource = existingSource, + newSource = context.sourceName + ) + } + + for { + // 
Update provenance to track this merge + _ <- updateProvenance(existing, node.variants, context) + + // Update age estimates if applicable + _ <- if (shouldUpdate && hasAgeEstimates(node)) { + updateAgeEstimates(existing.id.get, node, context.sourceName) + } else { + Future.successful(()) + } + + // Update statistics + updatedStats = if (shouldUpdate && conflicts.nonEmpty) { + accumulator.statistics.copy( + nodesProcessed = accumulator.statistics.nodesProcessed + 1, + nodesUpdated = accumulator.statistics.nodesUpdated + 1 + ) + } else { + accumulator.statistics.copy( + nodesProcessed = accumulator.statistics.nodesProcessed + 1, + nodesUnchanged = accumulator.statistics.nodesUnchanged + 1 + ) + } + + // Recursively process children + childrenResult <- processChildren( + children = node.children, + parentId = existing.id, + context = context, + index = index, + accumulator = accumulator.copy( + statistics = updatedStats, + conflicts = accumulator.conflicts ++ conflicts.toList + ) + ) + } yield childrenResult + } + + /** + * Create a new haplogroup node. 
+ */ + private def createNewNode( + node: PhyloNodeInput, + parentId: Option[Int], + context: MergeContext, + index: VariantIndex, + accumulator: MergeAccumulator + ): Future[MergeAccumulator] = { + // Determine credit - incoming source gets credit for new nodes + val primaryCredit = context.sourceName + val variantNames = primaryVariantNames(node.variants) + val provenance = HaplogroupProvenance.forNewNode(context.sourceName, variantNames) + + val newHaplogroup = Haplogroup( + id = None, + name = node.name, + lineage = None, + description = None, + haplogroupType = context.haplogroupType, + revisionId = 1, + source = context.sourceName, + confidenceLevel = "medium", + validFrom = context.timestamp, + validUntil = None, + formedYbp = node.formedYbp, + formedYbpLower = node.formedYbpLower, + formedYbpUpper = node.formedYbpUpper, + tmrcaYbp = node.tmrcaYbp, + tmrcaYbpLower = node.tmrcaYbpLower, + tmrcaYbpUpper = node.tmrcaYbpUpper, + ageEstimateSource = Some(context.sourceName), + provenance = Some(provenance) + ) + + for { + // Create the haplogroup with parent relationship + newId <- haplogroupRepository.createWithParent(newHaplogroup, parentId, context.sourceName) + + // Associate variants with the new haplogroup + variantCount <- associateVariants(newId, node.variants) + + // Update statistics + updatedStats = accumulator.statistics.copy( + nodesProcessed = accumulator.statistics.nodesProcessed + 1, + nodesCreated = accumulator.statistics.nodesCreated + 1, + variantsAdded = accumulator.statistics.variantsAdded + variantCount, + relationshipsCreated = if (parentId.isDefined) + accumulator.statistics.relationshipsCreated + 1 + else + accumulator.statistics.relationshipsCreated + ) + + // Update index with new haplogroup - include all variant names (primary + aliases) for matching + allVarNames = allVariantNames(node.variants) + updatedIndex = index.copy( + haplogroupByName = index.haplogroupByName + (node.name.toUpperCase -> newHaplogroup.copy(id = Some(newId))), + 
variantToHaplogroup = allVarNames.foldLeft(index.variantToHaplogroup) { (idx, v) => + idx.updatedWith(v.toUpperCase) { + case Some(hgs) => Some(hgs :+ newHaplogroup.copy(id = Some(newId))) + case None => Some(Seq(newHaplogroup.copy(id = Some(newId)))) + } + } + ) + + // Recursively process children + childrenResult <- processChildren( + children = node.children, + parentId = Some(newId), + context = context, + index = updatedIndex, + accumulator = accumulator.copy(statistics = updatedStats) + ) + } yield childrenResult + } + + /** + * Process child nodes recursively. + */ + private def processChildren( + children: List[PhyloNodeInput], + parentId: Option[Int], + context: MergeContext, + index: VariantIndex, + accumulator: MergeAccumulator + ): Future[MergeAccumulator] = { + children.foldLeft(Future.successful(accumulator)) { (accFuture, child) => + accFuture.flatMap { acc => + mergeNode(child, parentId, context, index, acc) + } + } + } + + /** + * Update provenance for an existing haplogroup. 
+ */ + private def updateProvenance( + existing: Haplogroup, + newVariants: List[VariantInput], + context: MergeContext + ): Future[Boolean] = { + val existingProvenance = existing.provenance.getOrElse( + HaplogroupProvenance(primaryCredit = existing.source, nodeProvenance = Set(existing.source)) + ) + + // Preserve ISOGG credit + val primaryCredit = if (HaplogroupProvenance.shouldPreserveCredit(existingProvenance.primaryCredit)) { + existingProvenance.primaryCredit + } else { + existingProvenance.primaryCredit // Keep existing credit for non-ISOGG too + } + + // Add new source to node provenance + val updatedNodeProv = existingProvenance.nodeProvenance + context.sourceName + + // Add variant provenance for new variants (primary names only for provenance tracking) + val variantNames = primaryVariantNames(newVariants) + val updatedVariantProv = variantNames.foldLeft(existingProvenance.variantProvenance) { (prov, variant) => + prov.updatedWith(variant) { + case Some(sources) => Some(sources + context.sourceName) + case None => Some(Set(context.sourceName)) + } + } + + val updatedProvenance = HaplogroupProvenance( + primaryCredit = primaryCredit, + nodeProvenance = updatedNodeProv, + variantProvenance = updatedVariantProv, + lastMergedAt = Some(context.timestamp), + lastMergedFrom = Some(context.sourceName) + ) + + haplogroupRepository.updateProvenance(existing.id.get, updatedProvenance) + } + + /** + * Update age estimates for a haplogroup. 
+ */ + private def updateAgeEstimates( + haplogroupId: Int, + node: PhyloNodeInput, + sourceName: String + ): Future[Boolean] = { + haplogroupRepository.findById(haplogroupId).flatMap { + case Some(existing) => + val updated = existing.copy( + formedYbp = node.formedYbp.orElse(existing.formedYbp), + formedYbpLower = node.formedYbpLower.orElse(existing.formedYbpLower), + formedYbpUpper = node.formedYbpUpper.orElse(existing.formedYbpUpper), + tmrcaYbp = node.tmrcaYbp.orElse(existing.tmrcaYbp), + tmrcaYbpLower = node.tmrcaYbpLower.orElse(existing.tmrcaYbpLower), + tmrcaYbpUpper = node.tmrcaYbpUpper.orElse(existing.tmrcaYbpUpper), + ageEstimateSource = Some(sourceName) + ) + haplogroupRepository.update(updated) + case None => + Future.successful(false) + } + } + + /** + * Associate variants with a haplogroup, finding or creating variants as needed. + */ + private def associateVariants(haplogroupId: Int, variants: List[VariantInput]): Future[Int] = { + if (variants.isEmpty) { + Future.successful(0) + } else { + // For each variant, find existing variants by primary name and associate them, + // then create alias records for any aliases + Future.traverse(variants) { variantInput => + // First find/associate the primary variant + variantRepository.searchByName(variantInput.name).flatMap { foundVariants => + // Associate all found variants with this haplogroup + val associateFutures = foundVariants.map { variant => + variant.variantId match { + case Some(vid) => + for { + // Associate variant with haplogroup + count <- haplogroupVariantRepository.addVariantToHaplogroup(haplogroupId, vid) + // Create alias records for any aliases from the ISOGG data + _ <- Future.traverse(variantInput.aliases) { alias => + val variantAlias = VariantAlias( + variantId = vid, + aliasType = "common_name", + aliasValue = alias, + source = Some("ISOGG"), + isPrimary = false + ) + variantAliasRepository.addAlias(variantAlias).recover { case _ => false } + } + } yield count + case None => 
Future.successful(0) + } + } + Future.sequence(associateFutures).map(_.sum) + } + }.map(_.sum) + } + } + + /** + * Get priority for a source (lower = higher priority). + */ + private def getPriority(source: String, config: SourcePriorityConfig): Int = { + config.sourcePriorities.indexOf(source) match { + case -1 => config.defaultPriority + case idx => idx + } + } + + /** + * Check if node has any age estimates. + */ + private def hasAgeEstimates(node: PhyloNodeInput): Boolean = { + node.formedYbp.isDefined || node.tmrcaYbp.isDefined + } + + /** + * Simulate merge without applying changes (for preview). + */ + private def simulateMerge( + sourceTree: PhyloNodeInput, + sourceName: String, + existingIndex: VariantIndex, + priorityConfig: SourcePriorityConfig + ): Future[MergePreviewResponse] = { + // Recursively analyze the tree + val (stats, conflicts, splits, newNodes, updatedNodes, unchangedNodes) = + analyzeTree(sourceTree, existingIndex, sourceName, priorityConfig) + + Future.successful(MergePreviewResponse( + statistics = stats, + conflicts = conflicts, + splits = splits, + newNodes = newNodes, + updatedNodes = updatedNodes, + unchangedNodes = unchangedNodes + )) + } + + /** + * Analyze tree structure for preview without making changes. 
+ */ + private def analyzeTree( + node: PhyloNodeInput, + index: VariantIndex, + sourceName: String, + priorityConfig: SourcePriorityConfig + ): (MergeStatistics, List[MergeConflict], List[SplitOperation], List[String], List[String], List[String]) = { + + val existingMatch = findExistingMatch(node, index) + val conflicts = scala.collection.mutable.ListBuffer.empty[MergeConflict] + val splits = scala.collection.mutable.ListBuffer.empty[SplitOperation] + val newNodes = scala.collection.mutable.ListBuffer.empty[String] + val updatedNodes = scala.collection.mutable.ListBuffer.empty[String] + val unchangedNodes = scala.collection.mutable.ListBuffer.empty[String] + + var stats = existingMatch match { + case Some(existing) => + val existingSource = existing.provenance.map(_.primaryCredit).getOrElse(existing.source) + val shouldUpdate = getPriority(sourceName, priorityConfig) < getPriority(existingSource, priorityConfig) + + // Check for conflicts + if (node.formedYbp.isDefined && existing.formedYbp.isDefined && node.formedYbp != existing.formedYbp) { + conflicts += MergeConflict( + haplogroupName = existing.name, + field = "formedYbp", + existingValue = existing.formedYbp.get.toString, + newValue = node.formedYbp.get.toString, + resolution = if (shouldUpdate) "will_update" else "will_keep_existing", + existingSource = existingSource, + newSource = sourceName + ) + } + + if (shouldUpdate && conflicts.nonEmpty) { + updatedNodes += existing.name + MergeStatistics(1, 0, 1, 0, 0, 0, 0, 0, 0) + } else { + unchangedNodes += existing.name + MergeStatistics(1, 0, 0, 1, 0, 0, 0, 0, 0) + } + + case None => + newNodes += node.name + MergeStatistics(1, 1, 0, 0, node.variants.size, 0, 1, 0, 0) + } + + // Process children + node.children.foreach { child => + val (childStats, childConflicts, childSplits, childNew, childUpdated, childUnchanged) = + analyzeTree(child, index, sourceName, priorityConfig) + stats = MergeStatistics.combine(stats, childStats) + conflicts ++= childConflicts + 
splits ++= childSplits + newNodes ++= childNew + updatedNodes ++= childUpdated + unchangedNodes ++= childUnchanged + } + + (stats, conflicts.toList, splits.toList, newNodes.toList, updatedNodes.toList, unchangedNodes.toList) + } +} + +// ============================================================================ +// Internal Data Structures +// ============================================================================ + +/** + * Index of existing haplogroups for efficient lookup. + */ +private[services] case class VariantIndex( + variantToHaplogroup: Map[String, Seq[Haplogroup]], + haplogroupByName: Map[String, Haplogroup] +) + +/** + * Context for merge operations. + */ +private[services] case class MergeContext( + haplogroupType: HaplogroupType, + sourceName: String, + priorityConfig: SourcePriorityConfig, + conflictStrategy: ConflictStrategy, + timestamp: LocalDateTime +) + +/** + * Accumulator for merge statistics and results. + */ +private[services] case class MergeAccumulator( + statistics: MergeStatistics, + conflicts: List[MergeConflict], + splits: List[SplitOperation], + errors: List[String] +) + +private[services] object MergeAccumulator { + val empty: MergeAccumulator = MergeAccumulator( + statistics = MergeStatistics.empty, + conflicts = List.empty, + splits = List.empty, + errors = List.empty + ) +} diff --git a/app/services/HaplogroupTreeService.scala b/app/services/HaplogroupTreeService.scala index 5350296..fe49965 100644 --- a/app/services/HaplogroupTreeService.scala +++ b/app/services/HaplogroupTreeService.scala @@ -259,6 +259,22 @@ class HaplogroupTreeService @Inject()( } yield treeLists.flatten } + /** + * Finds and retrieves haplogroup details with all associated genomic variants. + * + * This method fetches the haplogroup (including provenance) and its linked variants. + * + * @param haplogroupName The name of the haplogroup for which details are to be retrieved. + * @param haplogroupType The type of haplogroup (e.g., Y-DNA or mtDNA). 
+ * @return A Future containing a tuple of (Option[Haplogroup], Seq[VariantDTO]). + */ + def findHaplogroupWithVariants(haplogroupName: String, haplogroupType: HaplogroupType): Future[(Option[Haplogroup], Seq[VariantDTO])] = { + for { + haplogroup <- coreRepository.getHaplogroupByName(haplogroupName, haplogroupType) + variants <- findVariantsForHaplogroup(haplogroupName, haplogroupType) + } yield (haplogroup, variants) + } + /** * Finds and retrieves all genomic variants associated with a specified haplogroup. * diff --git a/app/views/curator/haplogroups/detailPanel.scala.html b/app/views/curator/haplogroups/detailPanel.scala.html index 3c0d0e0..e04f678 100644 --- a/app/views/curator/haplogroups/detailPanel.scala.html +++ b/app/views/curator/haplogroups/detailPanel.scala.html @@ -48,6 +48,72 @@
@haplogroup.name
} + @haplogroup.provenance.map { prov => +
+
Provenance
+
+
Primary Credit
+
+ + @prov.primaryCredit + +
+ + @if(prov.nodeProvenance.nonEmpty) { +
Contributors
+
+ @for(source <- prov.nodeProvenance.toSeq.sorted) { + @source + } +
+ } + + @prov.lastMergedFrom.map { source => +
Last Merged
+
+ + from @source + @prov.lastMergedAt.map { ts => + on @ts.toLocalDate + } + +
+ } + + @if(prov.variantProvenance.nonEmpty) { +
Variant Sources
+
+ +
+
    + @for((variant, sources) <- prov.variantProvenance.toSeq.sortBy(_._1).take(20)) { +
  • + @variant + + @for(src <- sources.toSeq.sorted) { + @src + } + +
  • + } + @if(prov.variantProvenance.size > 20) { +
  • + +@(prov.variantProvenance.size - 20) more variants... +
  • + } +
+
+
+ } +
+ } +
Tree Position
diff --git a/app/views/curator/haplogroups/listFragment.scala.html b/app/views/curator/haplogroups/listFragment.scala.html index 980cb90..a5f18be 100644 --- a/app/views/curator/haplogroups/listFragment.scala.html +++ b/app/views/curator/haplogroups/listFragment.scala.html @@ -11,7 +11,7 @@ Name Type Source - Confidence + Provenance @@ -29,7 +29,19 @@ @hg.source - @hg.confidenceLevel + + @hg.provenance.map { prov => + + @prov.primaryCredit + @if(prov.nodeProvenance.size > 1) { + +@(prov.nodeProvenance.size - 1) + } + + }.getOrElse { + - + } + + @provenance.map { prov => +
+
+ Primary Credit + @formatSourceName(prov.primaryCredit) +
+ @if(prov.nodeProvenance.size > 1) { +
+ Also contributed by: + + @prov.nodeProvenance.filterNot(_ == prov.primaryCredit).map(formatSourceName).mkString(", ") + +
+ } + @prov.lastMergedAt.map { timestamp => +
+ Last updated: + @timestamp.toLocalDate.toString + @prov.lastMergedFrom.map { source => + from @formatSourceName(source) + } +
+ } +
+ } + @if(snps.isEmpty) {

@messages("sidebar.noVariants", haplogroupName)

} else { @@ -174,6 +201,65 @@
@messages("sidebar.title", haplogroupName)
color: #888; font-style: italic; } + + .provenance-card { + background: linear-gradient(135deg, #e8f4fd 0%, #f0f7ff 100%); + border: 1px solid #b8d4e8; + border-radius: 8px; + padding: 12px; + margin-bottom: 15px; + } + + .provenance-header { + display: flex; + justify-content: space-between; + align-items: center; + margin-bottom: 8px; + } + + .provenance-label { + font-size: 0.8em; + color: #666; + text-transform: uppercase; + letter-spacing: 0.5px; + } + + .provenance-source { + font-weight: 600; + color: #2c5282; + font-size: 1em; + } + + .provenance-contributors { + font-size: 0.85em; + color: #555; + margin-bottom: 6px; + } + + .provenance-label-small { + color: #888; + font-size: 0.85em; + } + + .provenance-sources { + color: #4a5568; + } + + .provenance-merge-info { + font-size: 0.8em; + color: #718096; + border-top: 1px solid #cbd5e0; + padding-top: 8px; + margin-top: 8px; + } + + .provenance-timestamp { + color: #4a5568; + } + + .provenance-from { + color: #718096; + } @formatAliasType(aliasType: String) = @{ @@ -186,3 +272,13 @@
@messages("sidebar.title", haplogroupName)
case other => other.replace("_", " ").capitalize } } + +@formatSourceName(source: String) = @{ + source.toLowerCase match { + case "isogg" | "backbone" => "ISOGG" + case "yfull" => "YFull" + case "ftdna" => "FTDNA" + case "ytree" => "ytree" + case other => other.replace("_", " ").split(" ").map(_.capitalize).mkString(" ") + } +} diff --git a/conf/application.conf b/conf/application.conf index b249d7e..19cdd0d 100644 --- a/conf/application.conf +++ b/conf/application.conf @@ -10,6 +10,9 @@ play.i18n { # No need to create cookies in a read-only application. Remove when appropriate play.http.session.disabled=true +# Increase max request body size for tree merge API (default is 100KB) +play.http.parser.maxMemoryBuffer = 10MB + # Disable the startup banner play.application.showBanner=false diff --git a/conf/evolutions/default/52.sql b/conf/evolutions/default/52.sql new file mode 100644 index 0000000..44e581f --- /dev/null +++ b/conf/evolutions/default/52.sql @@ -0,0 +1,15 @@ +# --- !Ups + +-- Add provenance JSONB column to haplogroup table for multi-source attribution tracking +ALTER TABLE tree.haplogroup ADD COLUMN provenance JSONB; + +-- Add GIN index for efficient querying by provenance fields +CREATE INDEX idx_haplogroup_provenance ON tree.haplogroup USING GIN (provenance); + +-- Add comment for documentation +COMMENT ON COLUMN tree.haplogroup.provenance IS 'JSONB tracking node and variant provenance from multiple sources. 
Structure: {primaryCredit, nodeProvenance[], variantProvenance{}, lastMergedAt, lastMergedFrom}'; + +# --- !Downs + +DROP INDEX IF EXISTS tree.idx_haplogroup_provenance; +ALTER TABLE tree.haplogroup DROP COLUMN IF EXISTS provenance; diff --git a/conf/routes b/conf/routes index cbec6de..24aa699 100644 --- a/conf/routes +++ b/conf/routes @@ -184,6 +184,13 @@ PUT /api/v1/manage/str-markers/:id DELETE /api/v1/manage/str-markers/:id controllers.GenomeRegionsApiManagementController.deleteStrMarker(id: Int) POST /api/v1/manage/str-markers/bulk controllers.GenomeRegionsApiManagementController.bulkCreateStrMarkers() +# ============================================= +# Haplogroup Tree Merge API (X-API-Key secured) +# ============================================= +POST /api/v1/manage/haplogroups/merge controllers.HaplogroupTreeMergeController.mergeFullTree() +POST /api/v1/manage/haplogroups/merge/subtree controllers.HaplogroupTreeMergeController.mergeSubtree() +POST /api/v1/manage/haplogroups/merge/preview controllers.HaplogroupTreeMergeController.previewMerge() + # Curator Workflow GET /admin/publication-candidates controllers.PublicationCandidateController.listCandidates(page: Int ?= 1, pageSize: Int ?= 20) POST /admin/publication-candidates/:id/accept controllers.PublicationCandidateController.accept(id: Int) diff --git a/documents/proposals/variant-schema-simplification.md b/documents/proposals/variant-schema-simplification.md index a24f07b..c397d11 100644 --- a/documents/proposals/variant-schema-simplification.md +++ b/documents/proposals/variant-schema-simplification.md @@ -10,15 +10,19 @@ ## Key Design Decisions -| Decision | Rationale | -|----------|-----------| -| **Name is the primary identifier** | Coordinates can have parallel mutations; strand orientation varies | -| **No reference-agnostic mutation field** | G>A in GRCh38 may be C>T in hs1 (reverse complement) | -| **JSONB for coordinates** | Each assembly needs its own position AND alleles | -| **JSONB for 
aliases** | Flexible, no joins, supports multiple sources | -| **`defining_haplogroup_id` FK** | Distinguishes parallel mutations without .1/.2 suffixes | -| **Haplogroup context = implicit suffix** | Display "L21 (R-L21)" vs "L21 (I-L21)" instead of "L21.1" vs "L21.2" | -| **1 row per named variant per lineage** | Parallel mutations at same position = separate rows, same name allowed | +| Decision | Rationale | +|----------|-------------------------------------------------------------------------------------------------------| +| **Name is the primary identifier** | Coordinates can have parallel mutations; strand orientation varies | +| **No reference-agnostic mutation field** | G>A in GRCh38 may be C>T in hs1 (reverse complement) | +| **JSONB for coordinates** | Each assembly needs its own position AND alleles; structure varies by type | +| **JSONB for aliases** | Flexible, no joins, supports multiple sources | +| **`defining_haplogroup_id` FK** | Distinguishes parallel mutations without .1/.2 suffixes | +| **Haplogroup context = implicit suffix** | Display "L21 (R-L21)" vs "L21 (I-L21)" instead of "L21.1" vs "L21.2" | +| **1 row per named variant per lineage** | Parallel mutations at same position = separate rows, same name allowed | +| **Unified `variant_v2` for SNPs, STRs, SVs** | All phylogenetic characters in one table; `mutation_type` differentiates | +| **ASR character state tables** | `haplogroup_character_state` + `branch_mutation` support all variant types | +| **STRs typically don't define haplogroups** | Most STRs have `defining_haplogroup_id = NULL`; NULL alleles (e.g., R-U106>L1) can be branch-defining | +| **SVs can define branches** | Deletions, inversions, etc. are phylogenetically informative markers | --- @@ -27,6 +31,7 @@ | Document | Relationship | |----------|-------------| | `../planning/haplogroup-discovery-system.md` | **Blocked by this proposal.** Discovery system requires new variant schema for parallel mutation handling. 
Added as Phase -1 prerequisite. | +| `branch-age-estimation.md` | **Depends on this proposal.** Branch age estimation uses ASR tables defined here. The `haplogroup_character_state` table replaces the originally-proposed `haplogroup_ancestral_str` table, providing unified modal/ancestral values for all variant types via ASR. | --- @@ -168,8 +173,10 @@ CREATE TABLE variant_v2 ( variant_id SERIAL PRIMARY KEY, -- Identity (stable across references) - canonical_name TEXT, -- Primary name (e.g., "M269"), NULL for unnamed/novel variants - mutation_type TEXT NOT NULL, -- "SNP", "INDEL", "MNP" + canonical_name TEXT, -- Primary name (e.g., "M269", "DYS456"), NULL for unnamed/novel variants + mutation_type TEXT NOT NULL, -- Point: "SNP", "INDEL", "MNP" + -- Repeat: "STR" + -- Structural: "DEL", "DUP", "INS", "INV", "CNV", "TRANS" naming_status TEXT NOT NULL DEFAULT 'UNNAMED', -- UNNAMED, PENDING_REVIEW, NAMED -- NOTE: No mutation field - alleles stored per-coordinate due to strand differences @@ -188,40 +195,35 @@ CREATE TABLE variant_v2 ( -- Coordinates (JSONB - all reference positions) -- Keys use short reference names without patch versions (e.g., "GRCh38" not "GRCh38.p14") - -- NOTE: genbank_contig.reference_genome should also use these short names + -- Structure varies by mutation_type (see "Coordinate JSONB Structure by mutation_type" below) coordinates JSONB DEFAULT '{}', - -- Example: + -- SNP/INDEL/MNP example: -- { - -- "GRCh38": { - -- "contig": "chrY", - -- "position": 2887824, - -- "ref": "G", - -- "alt": "A" - -- }, - -- "GRCh37": { - -- "contig": "chrY", - -- "position": 2793009, - -- "ref": "G", - -- "alt": "A" - -- }, - -- "hs1": { - -- "contig": "chrY", - -- "position": 2912345, - -- "ref": "C", -- Reverse complemented! 
- -- "alt": "T" -- G→A becomes C→T - -- }, - -- "pangenome_v1": { - -- "node": "chrY.segment.12345", - -- "offset": 42, - -- "ref": "G", - -- "alt": "A" - -- } + -- "GRCh38": {"contig": "chrY", "position": 2887824, "ref": "G", "alt": "A"}, + -- "hs1": {"contig": "chrY", "position": 2912345, "ref": "C", "alt": "T"} -- reverse complemented + -- } + -- + -- STR example: + -- { + -- "GRCh38": {"contig": "chrY", "start": 12997923, "end": 12998019, "repeat_motif": "GATA", "period": 4} + -- } + -- + -- DEL/DUP/INS example: + -- { + -- "GRCh38": {"contig": "chrY", "start": 58819361, "end": 58913456, "length": 94095} -- } - -- Phylogenetic context (for haplogroup-defining variants) + -- Phylogenetic context + -- For SNPs/SVs: the haplogroup this variant defines + -- For STRs: usually NULL (repeat counts don't define haplogroups), but NULL alleles CAN define branches defining_haplogroup_id INTEGER REFERENCES tree.haplogroup(haplogroup_id), - -- Metadata + -- Additional metadata (optional, for GFF/evidence enrichment) + evidence JSONB DEFAULT '{}', -- {"yseq": {"tested": 5, "derived": 1}} + primers JSONB DEFAULT '{}', -- {"yseq": {"forward": "A1069_F", "reverse": "A1069_R"}} + notes TEXT, -- "Downstream of S1121." 
+ + -- Timestamps created_at TIMESTAMPTZ DEFAULT NOW(), updated_at TIMESTAMPTZ DEFAULT NOW() ); @@ -782,6 +784,626 @@ YBrowse aggregates from organizations that meet these criteria: --- +## Extension: STRs and Ancestral State Reconstruction + +### Rethinking STRs as Phylogenetic Characters + +Initially, STRs might seem fundamentally different from SNPs: + +| Aspect | SNP | STR | +|--------|-----|-----| +| **Variation type** | Single nucleotide change | Repeat count variation | +| **Allele representation** | ref/alt bases (G→A) | Repeat count (e.g., 12, 13, 14) | +| **Mutation rate** | ~10⁻⁸ per generation | ~10⁻³ per generation (1000x higher) | +| **Mutation model** | Infinite sites (rarely back-mutates) | Stepwise (can increase/decrease) | + +However, with **Ancestral State Reconstruction (ASR)**, both become phylogenetic characters: + +| Concept | SNP | STR | +|---------|-----|-----| +| **Character** | The variant itself | The STR marker | +| **Character state** | Ancestral (G) or Derived (A) | Repeat count (12, 13, 14...) | +| **ASR output** | Inferred ancestral allele at each node | Inferred repeat count at each node | +| **Branch annotation** | "M269: G→A" | "DYS456: 15→16" | + +This suggests **STRs should be unified with variants**, not kept separate. + +### STR Coordinate Structure + +STRs use `mutation_type = 'STR'` in the canonical `variant_v2` table (see "Proposed Schema" above). The `coordinates` JSONB for STRs includes: + +```json +{ + "GRCh38": { + "contig": "chrY", + "start": 12997923, + "end": 12998019, + "repeat_motif": "GATA", + "period": 4, + "reference_repeats": 13 + } +} +``` + +**Key difference from SNPs**: Most STRs have `defining_haplogroup_id = NULL` because repeat count variation doesn't typically define haplogroups—states are reconstructed at all nodes via ASR. However, **NULL alleles** (e.g., DYS439 NULL under R-U106, also known as L1/S26) can be branch-defining and would have a `defining_haplogroup_id` set. 
+ +### Ancestral State Reconstruction Tables + +ASR produces inferred character states at internal tree nodes. This applies to **both SNPs and STRs**: + +```sql +-- Reconstructed character states at haplogroup nodes +CREATE TABLE haplogroup_character_state ( + id SERIAL PRIMARY KEY, + haplogroup_id INT NOT NULL REFERENCES haplogroup(haplogroup_id), + variant_id INT NOT NULL REFERENCES variant_v2(variant_id), + + -- The inferred state at this node + -- For SNPs: "ancestral" or "derived" (or the actual allele: "G", "A") + -- For STRs: the repeat count as string (e.g., "15") or "NULL" for null alleles + inferred_state TEXT NOT NULL, + + -- Confidence/probability from ASR algorithm + confidence DECIMAL(5,4), -- 0.0000 to 1.0000 + + -- For STRs: probability distribution over states (optional, for uncertain reconstructions) + state_probabilities JSONB, + -- Example: {"13": 0.05, "14": 0.25, "15": 0.65, "16": 0.05} + + -- ASR metadata + algorithm TEXT, -- "parsimony", "ml", "bayesian" + reconstructed_at TIMESTAMPTZ DEFAULT NOW(), + + UNIQUE(haplogroup_id, variant_id) +); + +CREATE INDEX idx_character_state_haplogroup ON haplogroup_character_state(haplogroup_id); +CREATE INDEX idx_character_state_variant ON haplogroup_character_state(variant_id); +``` + +### Branch Mutations (State Changes) + +For tree visualization and analysis, track where states change along branches: + +```sql +-- State changes along tree branches +CREATE TABLE branch_mutation ( + id SERIAL PRIMARY KEY, + variant_id INT NOT NULL REFERENCES variant_v2(variant_id), + + -- The branch where the mutation occurred (parent → child) + parent_haplogroup_id INT NOT NULL REFERENCES haplogroup(haplogroup_id), + child_haplogroup_id INT NOT NULL REFERENCES haplogroup(haplogroup_id), + + -- State transition + from_state TEXT NOT NULL, -- "G" or "15" + to_state TEXT NOT NULL, -- "A" or "16" + + -- For STRs: direction of change + -- +1 = expansion, -1 = contraction, NULL for SNPs + step_direction INT, + + -- Confidence 
from ASR + confidence DECIMAL(5,4), + + UNIQUE(variant_id, parent_haplogroup_id, child_haplogroup_id) +); + +CREATE INDEX idx_branch_mutation_child ON branch_mutation(child_haplogroup_id); +``` + +### Query Examples with Unified Model + +```sql +-- Get all character states at a haplogroup node (SNPs and STRs together) +SELECT + v.canonical_name, + v.mutation_type, + hcs.inferred_state, + hcs.confidence +FROM haplogroup_character_state hcs +JOIN variant_v2 v ON hcs.variant_id = v.variant_id +WHERE hcs.haplogroup_id = 12345 +ORDER BY v.mutation_type, v.canonical_name; + +-- Get STR mutations along a branch (useful for age estimation) +SELECT + v.canonical_name, + bm.from_state, + bm.to_state, + bm.step_direction +FROM branch_mutation bm +JOIN variant_v2 v ON bm.variant_id = v.variant_id +WHERE bm.child_haplogroup_id = 12345 + AND v.mutation_type = 'STR'; + +-- Reconstruct ancestral STR haplotype at a node +SELECT + v.canonical_name, + hcs.inferred_state as repeat_count, + hcs.confidence +FROM haplogroup_character_state hcs +JOIN variant_v2 v ON hcs.variant_id = v.variant_id +WHERE hcs.haplogroup_id = 12345 + AND v.mutation_type = 'STR' +ORDER BY v.canonical_name; +``` + +### Biosample Observations + +Observed values from actual samples (input to ASR): + +```sql +-- Observed character states from biosamples +CREATE TABLE biosample_variant_call ( + id SERIAL PRIMARY KEY, + biosample_id INT NOT NULL REFERENCES biosample(id), + variant_id INT NOT NULL REFERENCES variant_v2(variant_id), + + -- The observed state + -- For SNPs: "ref", "alt", "het", or actual alleles + -- For STRs: repeat count as string (e.g., "15") + observed_state TEXT NOT NULL, + + -- Call quality + quality_score INT, + read_depth INT, + confidence TEXT, -- "high", "medium", "low" + + -- Source + source TEXT, -- "ftdna", "yfull", "user_upload" + created_at TIMESTAMPTZ DEFAULT NOW(), + + UNIQUE(biosample_id, variant_id) +); +``` + +### STR Mutation Rate Reference Data + +STR-based ASR and age estimation 
+-- variant_v2 STR entries join to rates via canonical_name = marker_name.
+-- Note: the UNIQUE constraint on marker_name already creates the lookup index, so no separate CREATE INDEX is needed.
+-- Modal haplotype is just the reconstructed state at the haplogroup node.
+-- NULLIF maps the 'NULL' sentinel (STR null allele, see haplogroup_character_state)
+-- to SQL NULL before the int cast; a bare ::int would raise on that value.
+CREATE VIEW haplogroup_str_modal AS
+SELECT
+    hcs.haplogroup_id,
+    v.canonical_name as str_name,
+    NULLIF(hcs.inferred_state, 'NULL')::int as modal_value,
+    hcs.confidence
+FROM haplogroup_character_state hcs
+JOIN variant_v2 v ON hcs.variant_id = v.variant_id
+WHERE v.mutation_type = 'STR';
2016](https://www.science.org/doi/10.1126/science.aab3812) and subsequent work on Y chromosome phylogeny. + +This means SVs should be **unified with SNPs and STRs** in `variant_v2`, not kept separate. + +### SV Characteristics + +| SV Type | Description | Size Range | +|---------|-------------|------------| +| **Deletion (DEL)** | Sequence removed | 50bp - Mb | +| **Duplication (DUP)** | Sequence copied | 50bp - Mb | +| **Insertion (INS)** | Sequence added | 50bp - Mb | +| **Inversion (INV)** | Sequence reversed | 1kb - Mb | +| **Translocation (TRANS)** | Sequence moved to different location | Variable | +| **CNV** | Copy Number Variant (special case of DUP/DEL) | Variable | + +### SV Coordinate Challenges + +SVs have unique coordinate considerations (handled in JSONB): + +1. **Breakpoint precision**: Start/end may be imprecise (±bp) → `confidence_interval` +2. **Reference content**: May need sequence hash → `deleted_sequence_hash` +3. **Complex events**: Inversions have inner coordinates → `inner_start`, `inner_end` +4. **Size**: Length stored explicitly → `length` + +### Unified Schema: SVs in variant_v2 + +SVs become additional `mutation_type` values in the unified table: + +```sql +-- mutation_type now includes: +-- Point mutations: 'SNP', 'INDEL', 'MNP' +-- Repeat variations: 'STR' +-- Structural variants: 'DEL', 'DUP', 'INS', 'INV', 'CNV', 'TRANS' + +-- SV coordinate examples in variant_v2: + +-- Deletion example: +-- { +-- "GRCh38": { +-- "contig": "chrY", +-- "start": 58819361, +-- "end": 58913456, +-- "length": 94095, +-- "confidence_interval_start": [-50, 50], +-- "confidence_interval_end": [-100, 100], +-- "deleted_sequence_hash": "sha256:abc123..." 
+-- } +-- } + +-- Inversion example: +-- { +-- "GRCh38": { +-- "contig": "chrY", +-- "start": 58819361, +-- "end": 58913456, +-- "length": 94095, +-- "inner_start": 58819500, +-- "inner_end": 58913300 +-- } +-- } + +-- CNV example (with copy number): +-- { +-- "GRCh38": { +-- "contig": "chrY", +-- "start": 23800000, +-- "end": 24100000, +-- "length": 300000, +-- "reference_copies": 2, +-- "copy_number_range": [0, 4] +-- } +-- } +``` + +### SVs in ASR Context + +Like SNPs and STRs, SVs have character states at tree nodes: + +| Variant Type | Character State | Example | +|--------------|-----------------|---------| +| SNP | Ancestral or Derived allele | G or A | +| STR | Repeat count or NULL | 15, NULL | +| DEL/DUP/INS | Presence/Absence | "present" or "absent" | +| INV | Orientation | "forward" or "inverted" | +| CNV | Copy number | 0, 1, 2, 3... | + +The `haplogroup_character_state` and `branch_mutation` tables handle all of these: + +```sql +-- SV state at a node +INSERT INTO haplogroup_character_state (haplogroup_id, variant_id, inferred_state, confidence) +VALUES (12345, 999, 'present', 0.98); -- Deletion is present at this haplogroup + +-- SV mutation on a branch +INSERT INTO branch_mutation (variant_id, parent_haplogroup_id, child_haplogroup_id, from_state, to_state) +VALUES (999, 100, 12345, 'absent', 'present'); -- Deletion arose on this branch +``` + +### Known Branch-Defining Y-DNA SVs + +| Name | Type | Size | Defining Haplogroup | Reference | +|------|------|------|---------------------|-----------| +| **AZFa deletion** | DEL | ~800kb | Multiple independent | Medical/fertility | +| **AZFb deletion** | DEL | ~6.2Mb | Multiple independent | Medical/fertility | +| **AZFc deletion** | DEL | ~3.5Mb | Multiple independent | Medical/fertility | +| **gr/gr deletion** | DEL | ~1.6Mb | Various | Repping et al. 2006 | +| **IR2 inversion** | INV | ~300kb | Specific lineages | Poznik et al. 
2016 | +| **P1-P8 palindrome variants** | Various | Variable | Various | Skaletsky et al. 2003 | + +### SV Evidence Fields + +SVs often require additional evidence metadata: + +```sql +-- The existing 'evidence' JSONB field in variant_v2 handles this: +-- { +-- "call_method": "read_depth", -- or "split_read", "paired_end", "assembly" +-- "supporting_reads": 45, +-- "quality_score": 99, +-- "callers_agreeing": ["manta", "delly", "lumpy"], +-- "validated": true, +-- "validation_method": "PCR" +-- } +``` + +### Query Examples + +```sql +-- Find all SVs defining a haplogroup +SELECT v.canonical_name, v.mutation_type, v.coordinates +FROM variant_v2 v +WHERE v.defining_haplogroup_id = 12345 + AND v.mutation_type IN ('DEL', 'DUP', 'INS', 'INV', 'CNV', 'TRANS'); + +-- Get all branch-defining variants (SNPs + SVs) for a haplogroup +SELECT v.canonical_name, v.mutation_type, + bm.from_state, bm.to_state +FROM branch_mutation bm +JOIN variant_v2 v ON bm.variant_id = v.variant_id +WHERE bm.child_haplogroup_id = 12345; + +-- Find large deletions (>100kb) +SELECT v.canonical_name, + (v.coordinates->'GRCh38'->>'length')::int as length_bp +FROM variant_v2 v +WHERE v.mutation_type = 'DEL' + AND (v.coordinates->'GRCh38'->>'length')::int > 100000; +``` + +--- + +## Extension: Genome Region Annotations + +### Genome Annotation Tables + +The non-variant tables in evolution 50.sql share the multi-reference coordinate challenge: + +| Table | Purpose | Coordinate Nature | +|-------|---------|-------------------| +| `genome_region` | Structural annotations (centromere, PAR, etc.) | Start/end positions | +| `cytoband` | Cytogenetic bands | Start/end positions | + +**Note**: The `str_marker` table from 50.sql migrates into `variant_v2` (see "STR Migration" below), since STRs are phylogenetic characters used in ASR. 
+ +### Current Schema (50.sql) + +```sql +-- Structural regions +CREATE TABLE genome_region ( + id SERIAL PRIMARY KEY, + genbank_contig_id INT NOT NULL REFERENCES genbank_contig(genbank_contig_id), + region_type VARCHAR(30) NOT NULL, -- Centromere, Telomere_P, PAR1, etc. + name VARCHAR(50), -- For named regions (P1-P8 palindromes) + start_pos BIGINT NOT NULL, + end_pos BIGINT NOT NULL, + modifier DECIMAL(3,2), -- Quality modifier + UNIQUE(genbank_contig_id, region_type, name, start_pos) +); + +-- Cytobands +CREATE TABLE cytoband ( + id SERIAL PRIMARY KEY, + genbank_contig_id INT NOT NULL REFERENCES genbank_contig(genbank_contig_id), + name VARCHAR(20) NOT NULL, -- p11.32, q11.21, etc. + start_pos BIGINT NOT NULL, + end_pos BIGINT NOT NULL, + stain VARCHAR(10) NOT NULL, -- gneg, gpos25, acen, etc. + UNIQUE(genbank_contig_id, name) +); +``` + +### Recommendation: Keep Regions Separate from Variants + +Genome annotations differ from variants in key ways: + +| Aspect | Variants (SNP/STR/SV) | Genome Annotations | +|--------|----------------------|-------------------| +| **Identity** | Name + haplogroup context | Name + region type | +| **Variability** | Varies between individuals | Fixed per reference | +| **Updates** | Continuous discovery | Per-reference updates | +| **Query pattern** | "Where is M269?" | "What region contains position X?" | + +**Recommendation**: Keep `genome_region` and `cytoband` in a separate table (`genome_region_v2`), but apply the JSONB coordinate pattern for multi-reference support: + +```sql +CREATE TABLE genome_region_v2 ( + region_id SERIAL PRIMARY KEY, + region_type TEXT NOT NULL, -- Centromere, Telomere_P, PAR1, XTR, etc. 
+    UNIQUE(region_type, name) -- NOTE: assumes names are unique per type; cytoband names (p11.32 etc.) repeat across chromosomes, so add a contig discriminator if this ever covers more than one chromosome
} +} +``` + +### Schema Unification Summary + +| Feature | Table | Rationale | +|---------|-------|-----------| +| **SNP/INDEL/MNP** | `variant_v2` | Core point mutations | +| **STR** | `variant_v2` | Unified - phylogenetic characters for ASR | +| **SV (DEL/DUP/INS/INV/CNV)** | `variant_v2` | Unified - branch-defining markers | +| **STR mutation rates** | `str_mutation_rate` | Reference data - per-marker rates for ASR/age estimation | +| **Genome Region** | `genome_region_v2` | Separate - fixed per reference, not variants | +| **Cytoband** | `genome_region_v2` | Merged with regions via `properties` JSONB | + +**Key insight**: If it can define a branch or has states reconstructed by ASR, it belongs in `variant_v2`. + +### Coordinate JSONB Structure by mutation_type + +| mutation_type | Coordinate Fields | Extra Fields | +|---------------|-------------------|--------------| +| **SNP** | contig, position | ref, alt | +| **INDEL** | contig, position | ref, alt | +| **MNP** | contig, position | ref, alt | +| **STR** | contig, start, end | repeat_motif, period, reference_repeats | +| **DEL/DUP/INS** | contig, start, end, length | confidence_intervals, sequence_hash | +| **INV** | contig, start, end, length | inner_start, inner_end | +| **CNV** | contig, start, end, length | reference_copies, copy_number_range | + +| Separate Table | Coordinate Fields | Extra Fields | +|----------------|-------------------|--------------| +| **genome_region_v2** | contig, start, end | (in `properties`: modifier, stain) | + +### ASR Integration + +With ancestral state reconstruction, all variant types share these tables: + +| Table | Purpose | +|-------|---------| +| `haplogroup_character_state` | Inferred state at each tree node (replaces `haplogroup_ancestral_str` concept) | +| `branch_mutation` | State transitions along branches | +| `biosample_variant_call` | Observed values (input to ASR) | +| `str_mutation_rate` | Per-marker mutation rates for STR ASR/age estimation | + +| Variant 
Type | State Type | Example States | +|--------------|------------|----------------| +| SNP/INDEL/MNP | Allele | "G", "A", "ancestral", "derived" | +| STR | Repeat count or NULL | "13", "14", "15", "NULL" | +| DEL/DUP/INS | Presence | "present", "absent" | +| INV | Orientation | "forward", "inverted" | +| CNV | Copy number | "0", "1", "2", "3" | + +--- + +## Migration Considerations + +### STR Migration + +STRs migrate into `variant_v2` as `mutation_type = 'STR'`. See the migration query in the "STRs and Ancestral State Reconstruction" section above. + +### SV Migration + +SVs migrate into `variant_v2` with `mutation_type` set to the specific SV type ('DEL', 'DUP', 'INS', 'INV', 'CNV'). If there's no existing SV table, SVs will be ingested directly into the new schema. + +### Genome Region Migration + +```sql +INSERT INTO genome_region_v2 (region_type, name, coordinates, properties) +SELECT + gr.region_type, + gr.name, + jsonb_build_object( + gc.reference_genome, + jsonb_build_object( + 'contig', gc.common_name, + 'start', gr.start_pos, + 'end', gr.end_pos + ) + ) as coordinates, + CASE WHEN gr.modifier IS NOT NULL + THEN jsonb_build_object('modifier', gr.modifier) + ELSE '{}'::jsonb + END as properties +FROM genome_region gr +JOIN genbank_contig gc ON gr.genbank_contig_id = gc.genbank_contig_id; +``` + +--- + ## References - [GFA Format Specification](https://github.com/GFA-spec/GFA-spec) - Graph assembly format @@ -789,3 +1411,7 @@ YBrowse aggregates from organizations that meet these criteria: - [Human Pangenome Reference Consortium](https://humanpangenome.org/) - [PostgreSQL JSONB Documentation](https://www.postgresql.org/docs/current/datatype-json.html) - [ISOGG Y-DNA SNP Index](https://isogg.org/tree/) - Naming conventions +- [YHRD STR Database](https://yhrd.org/) - Y-STR reference +- [dbVar](https://www.ncbi.nlm.nih.gov/dbvar/) - NCBI Structural Variation database +- [Hallast et al. 
2021](https://www.science.org/doi/10.1126/science.abg8871) - Y chromosome structural variants and phylogeny +- [Poznik et al. 2016](https://www.science.org/doi/10.1126/science.aab3812) - Punctuated bursts in human male demography diff --git a/test/controllers/HaplogroupTreeMergeControllerSpec.scala b/test/controllers/HaplogroupTreeMergeControllerSpec.scala new file mode 100644 index 0000000..fe3c36c --- /dev/null +++ b/test/controllers/HaplogroupTreeMergeControllerSpec.scala @@ -0,0 +1,514 @@ +package controllers + +import actions.ApiSecurityAction +import models.HaplogroupType +import models.api.haplogroups.* +import org.mockito.ArgumentMatchers.any +import org.mockito.Mockito.{reset, verify, when} +import org.scalatest.BeforeAndAfterEach +import org.scalatest.concurrent.ScalaFutures +import org.scalatestplus.mockito.MockitoSugar +import org.scalatestplus.play.PlaySpec +import org.scalatestplus.play.guice.GuiceOneAppPerSuite +import play.api.Application +import play.api.inject.bind +import play.api.inject.guice.GuiceApplicationBuilder +import play.api.libs.json.Json +import play.api.mvc.Results +import play.api.test.Helpers.* +import play.api.test.{FakeRequest, Injecting} +import services.HaplogroupTreeMergeService + +import scala.concurrent.{ExecutionContext, Future} + +class HaplogroupTreeMergeControllerSpec extends PlaySpec + with GuiceOneAppPerSuite + with Injecting + with MockitoSugar + with ScalaFutures + with BeforeAndAfterEach { + + // Mock service + val mockMergeService: HaplogroupTreeMergeService = mock[HaplogroupTreeMergeService] + + override def fakeApplication(): Application = { + new GuiceApplicationBuilder() + .configure( + "play.evolutions.enabled" -> false, + "api.key.enabled" -> false // Disable API key for testing + ) + .overrides( + bind[HaplogroupTreeMergeService].toInstance(mockMergeService) + ) + .build() + } + + override def beforeEach(): Unit = { + reset(mockMergeService) + } + + // Test fixtures + def createSuccessResponse(nodesCreated: 
Int = 5): TreeMergeResponse = TreeMergeResponse( + success = true, + message = "Merge completed successfully", + statistics = MergeStatistics( + nodesProcessed = 10, + nodesCreated = nodesCreated, + nodesUpdated = 3, + nodesUnchanged = 2, + variantsAdded = 20, + variantsUpdated = 5, + relationshipsCreated = 4, + relationshipsUpdated = 1, + splitOperations = 0 + ) + ) + + def createPreviewResponse(): MergePreviewResponse = MergePreviewResponse( + statistics = MergeStatistics(10, 5, 3, 2, 20, 5, 4, 1, 0), + conflicts = List.empty, + splits = List.empty, + newNodes = List("NewNode1", "NewNode2"), + updatedNodes = List("UpdatedNode1"), + unchangedNodes = List("UnchangedNode1") + ) + + "HaplogroupTreeMergeController" should { + + // ========================================================================= + // mergeFullTree endpoint tests + // ========================================================================= + + "return 200 for successful full tree merge" in { + when(mockMergeService.mergeFullTree(any[TreeMergeRequest])) + .thenReturn(Future.successful(createSuccessResponse())) + + val requestBody = Json.obj( + "haplogroupType" -> "Y", + "sourceTree" -> Json.obj( + "name" -> "R1b", + "variants" -> Json.arr("M269") + ), + "sourceName" -> "ytree.net" + ) + + val request = FakeRequest(POST, "/api/v1/manage/haplogroups/merge") + .withHeaders("Content-Type" -> "application/json") + .withJsonBody(requestBody) + + val result = route(app, request).get + + status(result) mustBe OK + contentType(result) mustBe Some("application/json") + + val json = contentAsJson(result) + (json \ "success").as[Boolean] mustBe true + (json \ "statistics" \ "nodesCreated").as[Int] mustBe 5 + } + + "return 400 for failed merge" in { + val failureResponse = TreeMergeResponse.failure( + "Merge validation failed", + List("Invalid tree structure") + ) + when(mockMergeService.mergeFullTree(any[TreeMergeRequest])) + .thenReturn(Future.successful(failureResponse)) + + val requestBody = Json.obj( + 
"haplogroupType" -> "Y", + "sourceTree" -> Json.obj("name" -> "Invalid"), + "sourceName" -> "test" + ) + + val request = FakeRequest(POST, "/api/v1/manage/haplogroups/merge") + .withHeaders("Content-Type" -> "application/json") + .withJsonBody(requestBody) + + val result = route(app, request).get + + status(result) mustBe BAD_REQUEST + val json = contentAsJson(result) + (json \ "success").as[Boolean] mustBe false + } + + "reject invalid haplogroup type in JSON body" in { + val requestBody = Json.obj( + "haplogroupType" -> "INVALID_TYPE", + "sourceTree" -> Json.obj("name" -> "Test"), + "sourceName" -> "test" + ) + + val request = FakeRequest(POST, "/api/v1/manage/haplogroups/merge") + .withHeaders("Content-Type" -> "application/json") + .withJsonBody(requestBody) + + // The JSON parsing throws an exception for invalid HaplogroupType + // which propagates through Play's JSON body parser + an[IllegalArgumentException] must be thrownBy { + val result = route(app, request).get + status(result) + } + } + + "return 400 for missing required fields" in { + val requestBody = Json.obj( + "haplogroupType" -> "Y" + // Missing sourceTree and sourceName + ) + + val request = FakeRequest(POST, "/api/v1/manage/haplogroups/merge") + .withHeaders("Content-Type" -> "application/json") + .withJsonBody(requestBody) + + val result = route(app, request).get + + status(result) mustBe BAD_REQUEST + } + + "handle service exceptions gracefully" in { + when(mockMergeService.mergeFullTree(any[TreeMergeRequest])) + .thenReturn(Future.failed(new RuntimeException("Database connection failed"))) + + val requestBody = Json.obj( + "haplogroupType" -> "Y", + "sourceTree" -> Json.obj("name" -> "Test"), + "sourceName" -> "test" + ) + + val request = FakeRequest(POST, "/api/v1/manage/haplogroups/merge") + .withHeaders("Content-Type" -> "application/json") + .withJsonBody(requestBody) + + val result = route(app, request).get + + status(result) mustBe INTERNAL_SERVER_ERROR + val json = 
contentAsJson(result) + (json \ "success").as[Boolean] mustBe false + (json \ "errors").as[List[String]] must not be empty + } + + "pass through all request parameters to service" in { + when(mockMergeService.mergeFullTree(any[TreeMergeRequest])) + .thenReturn(Future.successful(createSuccessResponse())) + + val requestBody = Json.obj( + "haplogroupType" -> "Y", + "sourceTree" -> Json.obj( + "name" -> "R1b", + "variants" -> Json.arr("M269"), + "formedYbp" -> 4500 + ), + "sourceName" -> "ytree.net", + "priorityConfig" -> Json.obj( + "sourcePriorities" -> Json.arr("ytree.net", "ISOGG") + ), + "conflictStrategy" -> "higher_priority_wins", + "dryRun" -> true + ) + + val request = FakeRequest(POST, "/api/v1/manage/haplogroups/merge") + .withHeaders("Content-Type" -> "application/json") + .withJsonBody(requestBody) + + val result = route(app, request).get + + status(result) mustBe OK + verify(mockMergeService).mergeFullTree(any[TreeMergeRequest]) + } + + // ========================================================================= + // mergeSubtree endpoint tests + // ========================================================================= + + "return 200 for successful subtree merge" in { + when(mockMergeService.mergeSubtree(any[SubtreeMergeRequest])) + .thenReturn(Future.successful(createSuccessResponse())) + + val requestBody = Json.obj( + "haplogroupType" -> "Y", + "anchorHaplogroupName" -> "R1b", + "sourceTree" -> Json.obj( + "name" -> "R1b-L21", + "variants" -> Json.arr("L21") + ), + "sourceName" -> "ytree.net" + ) + + val request = FakeRequest(POST, "/api/v1/manage/haplogroups/merge/subtree") + .withHeaders("Content-Type" -> "application/json") + .withJsonBody(requestBody) + + val result = route(app, request).get + + status(result) mustBe OK + val json = contentAsJson(result) + (json \ "success").as[Boolean] mustBe true + } + + "return 400 when anchor haplogroup not found" in { + when(mockMergeService.mergeSubtree(any[SubtreeMergeRequest])) + 
.thenReturn(Future.failed(new IllegalArgumentException("Anchor haplogroup 'NONEXISTENT' not found"))) + + val requestBody = Json.obj( + "haplogroupType" -> "Y", + "anchorHaplogroupName" -> "NONEXISTENT", + "sourceTree" -> Json.obj("name" -> "Test"), + "sourceName" -> "test" + ) + + val request = FakeRequest(POST, "/api/v1/manage/haplogroups/merge/subtree") + .withHeaders("Content-Type" -> "application/json") + .withJsonBody(requestBody) + + val result = route(app, request).get + + status(result) mustBe BAD_REQUEST + val json = contentAsJson(result) + (json \ "message").as[String] must include("not found") + } + + "return 400 for missing anchorHaplogroupName" in { + val requestBody = Json.obj( + "haplogroupType" -> "Y", + // Missing anchorHaplogroupName + "sourceTree" -> Json.obj("name" -> "Test"), + "sourceName" -> "test" + ) + + val request = FakeRequest(POST, "/api/v1/manage/haplogroups/merge/subtree") + .withHeaders("Content-Type" -> "application/json") + .withJsonBody(requestBody) + + val result = route(app, request).get + + status(result) mustBe BAD_REQUEST + } + + // ========================================================================= + // previewMerge endpoint tests + // ========================================================================= + + "return 200 for preview request" in { + when(mockMergeService.previewMerge(any[MergePreviewRequest])) + .thenReturn(Future.successful(createPreviewResponse())) + + val requestBody = Json.obj( + "haplogroupType" -> "Y", + "sourceTree" -> Json.obj( + "name" -> "R1b", + "variants" -> Json.arr("M269") + ), + "sourceName" -> "ytree.net" + ) + + val request = FakeRequest(POST, "/api/v1/manage/haplogroups/merge/preview") + .withHeaders("Content-Type" -> "application/json") + .withJsonBody(requestBody) + + val result = route(app, request).get + + status(result) mustBe OK + val json = contentAsJson(result) + (json \ "newNodes").as[List[String]] must contain("NewNode1") + (json \ "statistics" \ "nodesProcessed").as[Int] 
mustBe 10 + } + + "return preview with conflicts" in { + val previewWithConflicts = MergePreviewResponse( + statistics = MergeStatistics(10, 5, 3, 2, 20, 5, 4, 1, 0), + conflicts = List( + MergeConflict( + haplogroupName = "R1b-L21", + field = "formedYbp", + existingValue = "4500", + newValue = "4800", + resolution = "will_update", + existingSource = "ISOGG", + newSource = "ytree.net" + ) + ), + splits = List.empty, + newNodes = List.empty, + updatedNodes = List("R1b-L21"), + unchangedNodes = List.empty + ) + + when(mockMergeService.previewMerge(any[MergePreviewRequest])) + .thenReturn(Future.successful(previewWithConflicts)) + + val requestBody = Json.obj( + "haplogroupType" -> "Y", + "sourceTree" -> Json.obj("name" -> "R1b-L21", "formedYbp" -> 4800), + "sourceName" -> "ytree.net" + ) + + val request = FakeRequest(POST, "/api/v1/manage/haplogroups/merge/preview") + .withHeaders("Content-Type" -> "application/json") + .withJsonBody(requestBody) + + val result = route(app, request).get + + status(result) mustBe OK + val json = contentAsJson(result) + (json \ "conflicts").as[List[MergeConflict]] must have size 1 + (json \ "conflicts" \ 0 \ "field").as[String] mustBe "formedYbp" + } + + "accept preview with optional anchor" in { + when(mockMergeService.previewMerge(any[MergePreviewRequest])) + .thenReturn(Future.successful(createPreviewResponse())) + + val requestBody = Json.obj( + "haplogroupType" -> "Y", + "anchorHaplogroupName" -> "R1b", + "sourceTree" -> Json.obj("name" -> "R1b-L21"), + "sourceName" -> "ytree.net" + ) + + val request = FakeRequest(POST, "/api/v1/manage/haplogroups/merge/preview") + .withHeaders("Content-Type" -> "application/json") + .withJsonBody(requestBody) + + val result = route(app, request).get + + status(result) mustBe OK + } + + "handle preview service exceptions" in { + when(mockMergeService.previewMerge(any[MergePreviewRequest])) + .thenReturn(Future.failed(new RuntimeException("Index build failed"))) + + val requestBody = Json.obj( + 
"haplogroupType" -> "Y", + "sourceTree" -> Json.obj("name" -> "Test"), + "sourceName" -> "test" + ) + + val request = FakeRequest(POST, "/api/v1/manage/haplogroups/merge/preview") + .withHeaders("Content-Type" -> "application/json") + .withJsonBody(requestBody) + + val result = route(app, request).get + + status(result) mustBe INTERNAL_SERVER_ERROR + } + + // ========================================================================= + // MT DNA tests + // ========================================================================= + + "handle MT DNA haplogroup type" in { + when(mockMergeService.mergeFullTree(any[TreeMergeRequest])) + .thenReturn(Future.successful(createSuccessResponse())) + + val requestBody = Json.obj( + "haplogroupType" -> "MT", + "sourceTree" -> Json.obj( + "name" -> "H1", + "variants" -> Json.arr("H1-defining") + ), + "sourceName" -> "mtDNA-tree" + ) + + val request = FakeRequest(POST, "/api/v1/manage/haplogroups/merge") + .withHeaders("Content-Type" -> "application/json") + .withJsonBody(requestBody) + + val result = route(app, request).get + + status(result) mustBe OK + } + + // ========================================================================= + // Complex tree structure tests + // ========================================================================= + + "handle deeply nested tree in request" in { + when(mockMergeService.mergeFullTree(any[TreeMergeRequest])) + .thenReturn(Future.successful(createSuccessResponse(nodesCreated = 10))) + + val requestBody = Json.obj( + "haplogroupType" -> "Y", + "sourceTree" -> Json.obj( + "name" -> "R1b", + "variants" -> Json.arr("M269"), + "children" -> Json.arr( + Json.obj( + "name" -> "R1b-L21", + "variants" -> Json.arr("L21"), + "children" -> Json.arr( + Json.obj( + "name" -> "R1b-DF13", + "variants" -> Json.arr("DF13"), + "children" -> Json.arr( + Json.obj( + "name" -> "R1b-Z39589", + "variants" -> Json.arr("Z39589") + ) + ) + ) + ) + ) + ) + ), + "sourceName" -> "ytree.net" + ) + + val request = 
FakeRequest(POST, "/api/v1/manage/haplogroups/merge") + .withHeaders("Content-Type" -> "application/json") + .withJsonBody(requestBody) + + val result = route(app, request).get + + status(result) mustBe OK + } + + // ========================================================================= + // Dry run tests + // ========================================================================= + + "handle dry run request" in { + when(mockMergeService.mergeFullTree(any[TreeMergeRequest])) + .thenReturn(Future.successful(createSuccessResponse())) + + val requestBody = Json.obj( + "haplogroupType" -> "Y", + "sourceTree" -> Json.obj("name" -> "Test"), + "sourceName" -> "test", + "dryRun" -> true + ) + + val request = FakeRequest(POST, "/api/v1/manage/haplogroups/merge") + .withHeaders("Content-Type" -> "application/json") + .withJsonBody(requestBody) + + val result = route(app, request).get + + status(result) mustBe OK + } + + // ========================================================================= + // Content-Type tests + // ========================================================================= + + "return 415 for non-JSON content type" in { + val request = FakeRequest(POST, "/api/v1/manage/haplogroups/merge") + .withHeaders("Content-Type" -> "text/plain") + .withBody("not json") + + val result = route(app, request).get + + status(result) mustBe UNSUPPORTED_MEDIA_TYPE + } + + "return 400 for malformed JSON" in { + val request = FakeRequest(POST, "/api/v1/manage/haplogroups/merge") + .withHeaders("Content-Type" -> "application/json") + .withBody("{invalid json") + + val result = route(app, request).get + + status(result) mustBe BAD_REQUEST + } + } +} diff --git a/test/models/api/haplogroups/TreeMergeModelsSpec.scala b/test/models/api/haplogroups/TreeMergeModelsSpec.scala new file mode 100644 index 0000000..4e77597 --- /dev/null +++ b/test/models/api/haplogroups/TreeMergeModelsSpec.scala @@ -0,0 +1,576 @@ +package models.api.haplogroups + +import models.HaplogroupType 
+import org.scalatest.funspec.AnyFunSpec +import org.scalatest.matchers.must.Matchers +import play.api.libs.json.{JsError, JsSuccess, Json} + +class TreeMergeModelsSpec extends AnyFunSpec with Matchers { + + describe("VariantInput") { + + describe("JSON serialization") { + + it("should deserialize a simple variant") { + val json = Json.parse("""{"name": "M207"}""") + json.validate[VariantInput] match { + case JsSuccess(v, _) => + v.name mustBe "M207" + v.aliases mustBe List.empty + case JsError(errors) => fail(s"Parse failed: $errors") + } + } + + it("should deserialize a variant with aliases") { + val json = Json.parse("""{"name": "M207", "aliases": ["Page37", "UTY2"]}""") + json.validate[VariantInput] match { + case JsSuccess(v, _) => + v.name mustBe "M207" + v.aliases mustBe List("Page37", "UTY2") + case JsError(errors) => fail(s"Parse failed: $errors") + } + } + + it("should serialize to JSON") { + val variant = VariantInput("M207", List("Page37", "UTY2")) + val json = Json.toJson(variant) + (json \ "name").as[String] mustBe "M207" + (json \ "aliases").as[List[String]] mustBe List("Page37", "UTY2") + } + } + } + + describe("PhyloNodeInput") { + + describe("JSON serialization") { + + it("should deserialize a simple node with variant objects") { + val json = Json.parse("""{ + "name": "R1b-L21", + "variants": [{"name": "L21"}, {"name": "S145"}] + }""") + + json.validate[PhyloNodeInput] match { + case JsSuccess(node, _) => + node.name mustBe "R1b-L21" + node.variants.map(_.name) mustBe List("L21", "S145") + node.children mustBe List.empty + case JsError(errors) => fail(s"Parse failed: $errors") + } + } + + it("should deserialize a node with variant aliases") { + val json = Json.parse("""{ + "name": "R", + "variants": [{"name": "M207", "aliases": ["Page37", "UTY2"]}] + }""") + + json.validate[PhyloNodeInput] match { + case JsSuccess(node, _) => + node.variants must have size 1 + node.variants.head.name mustBe "M207" + node.variants.head.aliases mustBe List("Page37", 
"UTY2") + case JsError(errors) => fail(s"Parse failed: $errors") + } + } + + it("should deserialize node with all age fields") { + val json = Json.parse("""{ + "name": "R1b-L21", + "variants": [{"name": "L21"}], + "formedYbp": 4500, + "formedYbpLower": 4200, + "formedYbpUpper": 4800, + "tmrcaYbp": 4000, + "tmrcaYbpLower": 3700, + "tmrcaYbpUpper": 4300 + }""") + + json.validate[PhyloNodeInput] match { + case JsSuccess(node, _) => + node.formedYbp mustBe Some(4500) + node.formedYbpLower mustBe Some(4200) + node.formedYbpUpper mustBe Some(4800) + node.tmrcaYbp mustBe Some(4000) + node.tmrcaYbpLower mustBe Some(3700) + node.tmrcaYbpUpper mustBe Some(4300) + case JsError(errors) => fail(s"Parse failed: $errors") + } + } + + it("should deserialize nested children") { + val json = Json.parse("""{ + "name": "R1b-L21", + "variants": [{"name": "L21"}], + "children": [ + { + "name": "R1b-DF13", + "variants": [{"name": "DF13"}], + "children": [ + { + "name": "R1b-Z39589", + "variants": [{"name": "Z39589"}] + } + ] + } + ] + }""") + + json.validate[PhyloNodeInput] match { + case JsSuccess(node, _) => + node.name mustBe "R1b-L21" + node.children must have size 1 + node.children.head.name mustBe "R1b-DF13" + node.children.head.children must have size 1 + node.children.head.children.head.name mustBe "R1b-Z39589" + case JsError(errors) => fail(s"Parse failed: $errors") + } + } + + it("should serialize to JSON") { + val node = PhyloNodeInput( + name = "R1b-L21", + variants = List(VariantInput("L21"), VariantInput("S145")), + formedYbp = Some(4500), + children = List( + PhyloNodeInput(name = "R1b-DF13", variants = List(VariantInput("DF13"))) + ) + ) + + val json = Json.toJson(node) + + (json \ "name").as[String] mustBe "R1b-L21" + (json \ "variants").as[List[VariantInput]].map(_.name) mustBe List("L21", "S145") + (json \ "formedYbp").as[Int] mustBe 4500 + (json \ "children").as[List[PhyloNodeInput]] must have size 1 + } + + it("should handle empty variants list") { + val json = 
Json.parse("""{"name": "Test"}""") + + json.validate[PhyloNodeInput] match { + case JsSuccess(node, _) => + node.variants mustBe List.empty + node.children mustBe List.empty + case JsError(errors) => fail(s"Parse failed: $errors") + } + } + } + } + + describe("SourcePriorityConfig") { + + it("should deserialize with priority list") { + val json = Json.parse("""{ + "sourcePriorities": ["ISOGG", "ytree.net", "DecodingUs"], + "defaultPriority": 50 + }""") + + json.validate[SourcePriorityConfig] match { + case JsSuccess(config, _) => + config.sourcePriorities mustBe List("ISOGG", "ytree.net", "DecodingUs") + config.defaultPriority mustBe 50 + case JsError(errors) => fail(s"Parse failed: $errors") + } + } + + it("should use default priority of 100") { + val json = Json.parse("""{ + "sourcePriorities": ["ISOGG"] + }""") + + json.validate[SourcePriorityConfig] match { + case JsSuccess(config, _) => + config.defaultPriority mustBe 100 + case JsError(errors) => fail(s"Parse failed: $errors") + } + } + } + + describe("ConflictStrategy") { + + it("should deserialize higher_priority_wins") { + val json = Json.parse("\"higher_priority_wins\"") + + json.validate[ConflictStrategy] match { + case JsSuccess(strategy, _) => + strategy mustBe ConflictStrategy.HigherPriorityWins + case JsError(errors) => fail(s"Parse failed: $errors") + } + } + + it("should deserialize keep_existing") { + val json = Json.parse("\"keep_existing\"") + + json.validate[ConflictStrategy] match { + case JsSuccess(strategy, _) => + strategy mustBe ConflictStrategy.KeepExisting + case JsError(errors) => fail(s"Parse failed: $errors") + } + } + + it("should deserialize always_update") { + val json = Json.parse("\"always_update\"") + + json.validate[ConflictStrategy] match { + case JsSuccess(strategy, _) => + strategy mustBe ConflictStrategy.AlwaysUpdate + case JsError(errors) => fail(s"Parse failed: $errors") + } + } + + it("should fail for unknown strategy") { + val json = Json.parse("\"invalid_strategy\"") + 
+ // The implementation throws an exception for invalid strategies + an[IllegalArgumentException] must be thrownBy { + json.as[ConflictStrategy] + } + } + + it("should serialize strategies correctly") { + Json.toJson[ConflictStrategy](ConflictStrategy.HigherPriorityWins).as[String] mustBe "higher_priority_wins" + Json.toJson[ConflictStrategy](ConflictStrategy.KeepExisting).as[String] mustBe "keep_existing" + Json.toJson[ConflictStrategy](ConflictStrategy.AlwaysUpdate).as[String] mustBe "always_update" + } + } + + describe("TreeMergeRequest") { + + it("should deserialize a full merge request") { + val json = Json.parse("""{ + "haplogroupType": "Y", + "sourceTree": { + "name": "R1b", + "variants": [{"name": "M269"}] + }, + "sourceName": "ytree.net", + "priorityConfig": { + "sourcePriorities": ["ytree.net", "ISOGG"] + }, + "conflictStrategy": "higher_priority_wins", + "dryRun": true + }""") + + json.validate[TreeMergeRequest] match { + case JsSuccess(request, _) => + request.haplogroupType mustBe HaplogroupType.Y + request.sourceTree.name mustBe "R1b" + request.sourceName mustBe "ytree.net" + request.priorityConfig mustBe defined + request.conflictStrategy mustBe Some(ConflictStrategy.HigherPriorityWins) + request.dryRun mustBe true + case JsError(errors) => fail(s"Parse failed: $errors") + } + } + + it("should deserialize minimal merge request") { + val json = Json.parse("""{ + "haplogroupType": "MT", + "sourceTree": {"name": "H"}, + "sourceName": "test" + }""") + + json.validate[TreeMergeRequest] match { + case JsSuccess(request, _) => + request.haplogroupType mustBe HaplogroupType.MT + request.priorityConfig mustBe None + request.conflictStrategy mustBe None + request.dryRun mustBe false + case JsError(errors) => fail(s"Parse failed: $errors") + } + } + + it("should fail for invalid haplogroup type") { + val json = Json.parse("""{ + "haplogroupType": "INVALID", + "sourceTree": {"name": "Test"}, + "sourceName": "test" + }""") + + // The implementation throws an 
exception for invalid haplogroup types + an[IllegalArgumentException] must be thrownBy { + json.as[TreeMergeRequest] + } + } + } + + describe("SubtreeMergeRequest") { + + it("should deserialize a subtree merge request") { + val json = Json.parse("""{ + "haplogroupType": "Y", + "anchorHaplogroupName": "R1b", + "sourceTree": { + "name": "R1b-L21", + "variants": [{"name": "L21"}] + }, + "sourceName": "ytree.net", + "dryRun": false + }""") + + json.validate[SubtreeMergeRequest] match { + case JsSuccess(request, _) => + request.haplogroupType mustBe HaplogroupType.Y + request.anchorHaplogroupName mustBe "R1b" + request.sourceTree.name mustBe "R1b-L21" + request.sourceName mustBe "ytree.net" + case JsError(errors) => fail(s"Parse failed: $errors") + } + } + } + + describe("MergePreviewRequest") { + + it("should deserialize with optional anchor") { + val json = Json.parse("""{ + "haplogroupType": "Y", + "anchorHaplogroupName": "R1b", + "sourceTree": {"name": "Test"}, + "sourceName": "test" + }""") + + json.validate[MergePreviewRequest] match { + case JsSuccess(request, _) => + request.anchorHaplogroupName mustBe Some("R1b") + case JsError(errors) => fail(s"Parse failed: $errors") + } + } + + it("should deserialize without anchor") { + val json = Json.parse("""{ + "haplogroupType": "Y", + "sourceTree": {"name": "Test"}, + "sourceName": "test" + }""") + + json.validate[MergePreviewRequest] match { + case JsSuccess(request, _) => + request.anchorHaplogroupName mustBe None + case JsError(errors) => fail(s"Parse failed: $errors") + } + } + } + + describe("MergeStatistics") { + + it("should serialize all fields") { + val stats = MergeStatistics( + nodesProcessed = 100, + nodesCreated = 50, + nodesUpdated = 30, + nodesUnchanged = 20, + variantsAdded = 200, + variantsUpdated = 50, + relationshipsCreated = 49, + relationshipsUpdated = 10, + splitOperations = 5 + ) + + val json = Json.toJson(stats) + + (json \ "nodesProcessed").as[Int] mustBe 100 + (json \ "nodesCreated").as[Int] 
mustBe 50 + (json \ "nodesUpdated").as[Int] mustBe 30 + (json \ "nodesUnchanged").as[Int] mustBe 20 + (json \ "variantsAdded").as[Int] mustBe 200 + (json \ "variantsUpdated").as[Int] mustBe 50 + (json \ "relationshipsCreated").as[Int] mustBe 49 + (json \ "relationshipsUpdated").as[Int] mustBe 10 + (json \ "splitOperations").as[Int] mustBe 5 + } + + it("should create empty statistics") { + val empty = MergeStatistics.empty + + empty.nodesProcessed mustBe 0 + empty.nodesCreated mustBe 0 + empty.nodesUpdated mustBe 0 + empty.nodesUnchanged mustBe 0 + } + + it("should combine statistics correctly") { + val stats1 = MergeStatistics(10, 5, 3, 2, 20, 5, 4, 1, 0) + val stats2 = MergeStatistics(20, 10, 6, 4, 40, 10, 9, 2, 1) + + val combined = MergeStatistics.combine(stats1, stats2) + + combined.nodesProcessed mustBe 30 + combined.nodesCreated mustBe 15 + combined.nodesUpdated mustBe 9 + combined.nodesUnchanged mustBe 6 + combined.variantsAdded mustBe 60 + combined.variantsUpdated mustBe 15 + combined.relationshipsCreated mustBe 13 + combined.relationshipsUpdated mustBe 3 + combined.splitOperations mustBe 1 + } + } + + describe("MergeConflict") { + + it("should serialize conflict details") { + val conflict = MergeConflict( + haplogroupName = "R1b-L21", + field = "formedYbp", + existingValue = "4500", + newValue = "4800", + resolution = "updated", + existingSource = "ISOGG", + newSource = "ytree.net" + ) + + val json = Json.toJson(conflict) + + (json \ "haplogroupName").as[String] mustBe "R1b-L21" + (json \ "field").as[String] mustBe "formedYbp" + (json \ "existingValue").as[String] mustBe "4500" + (json \ "newValue").as[String] mustBe "4800" + (json \ "resolution").as[String] mustBe "updated" + (json \ "existingSource").as[String] mustBe "ISOGG" + (json \ "newSource").as[String] mustBe "ytree.net" + } + + it("should round-trip serialize") { + val original = MergeConflict( + haplogroupName = "Test", + field = "description", + existingValue = "old", + newValue = "new", + 
resolution = "kept_existing", + existingSource = "A", + newSource = "B" + ) + + val restored = Json.toJson(original).as[MergeConflict] + + restored mustBe original + } + } + + describe("SplitOperation") { + + it("should serialize split details") { + val split = SplitOperation( + parentName = "R1b-L21", + newIntermediateName = "R1b-L21a", + variantsRedistributed = List("V1", "V2"), + childrenReassigned = List("R1b-Z39589", "R1b-Z39590"), + source = "ytree.net" + ) + + val json = Json.toJson(split) + + (json \ "parentName").as[String] mustBe "R1b-L21" + (json \ "newIntermediateName").as[String] mustBe "R1b-L21a" + (json \ "variantsRedistributed").as[List[String]] mustBe List("V1", "V2") + (json \ "childrenReassigned").as[List[String]] mustBe List("R1b-Z39589", "R1b-Z39590") + (json \ "source").as[String] mustBe "ytree.net" + } + } + + describe("TreeMergeResponse") { + + it("should serialize successful response") { + val response = TreeMergeResponse( + success = true, + message = "Merge completed successfully", + statistics = MergeStatistics(10, 5, 3, 2, 20, 5, 4, 1, 0), + conflicts = List.empty, + splits = List.empty, + errors = List.empty + ) + + val json = Json.toJson(response) + + (json \ "success").as[Boolean] mustBe true + (json \ "message").as[String] mustBe "Merge completed successfully" + (json \ "statistics" \ "nodesProcessed").as[Int] mustBe 10 + (json \ "conflicts").as[List[MergeConflict]] mustBe empty + } + + it("should create failure response") { + val response = TreeMergeResponse.failure( + "Merge failed due to validation error", + List("Error 1", "Error 2") + ) + + response.success mustBe false + response.message mustBe "Merge failed due to validation error" + response.errors mustBe List("Error 1", "Error 2") + response.statistics mustBe MergeStatistics.empty + } + + it("should serialize response with conflicts and errors") { + val response = TreeMergeResponse( + success = false, + message = "Completed with warnings", + statistics = 
MergeStatistics.empty, + conflicts = List( + MergeConflict("Node1", "field1", "old", "new", "kept", "A", "B") + ), + splits = List.empty, + errors = List("Warning: some nodes skipped") + ) + + val json = Json.toJson(response) + + (json \ "conflicts").as[List[MergeConflict]] must have size 1 + (json \ "errors").as[List[String]] must have size 1 + } + } + + describe("MergePreviewResponse") { + + it("should serialize preview with all details") { + val response = MergePreviewResponse( + statistics = MergeStatistics(10, 5, 3, 2, 20, 5, 4, 1, 0), + conflicts = List( + MergeConflict("Node1", "formedYbp", "4500", "4800", "will_update", "A", "B") + ), + splits = List.empty, + newNodes = List("NewNode1", "NewNode2"), + updatedNodes = List("UpdatedNode1"), + unchangedNodes = List("UnchangedNode1", "UnchangedNode2") + ) + + val json = Json.toJson(response) + + (json \ "newNodes").as[List[String]] mustBe List("NewNode1", "NewNode2") + (json \ "updatedNodes").as[List[String]] mustBe List("UpdatedNode1") + (json \ "unchangedNodes").as[List[String]] mustBe List("UnchangedNode1", "UnchangedNode2") + (json \ "statistics" \ "nodesCreated").as[Int] mustBe 5 + } + } + + describe("HaplogroupType in requests") { + + it("should accept Y haplogroup type") { + val json = Json.parse("""{ + "haplogroupType": "Y", + "sourceTree": {"name": "R1b"}, + "sourceName": "test" + }""") + + json.validate[TreeMergeRequest] match { + case JsSuccess(request, _) => + request.haplogroupType mustBe HaplogroupType.Y + case JsError(errors) => fail(s"Parse failed: $errors") + } + } + + it("should accept MT haplogroup type") { + val json = Json.parse("""{ + "haplogroupType": "MT", + "sourceTree": {"name": "H"}, + "sourceName": "test" + }""") + + json.validate[TreeMergeRequest] match { + case JsSuccess(request, _) => + request.haplogroupType mustBe HaplogroupType.MT + case JsError(errors) => fail(s"Parse failed: $errors") + } + } + } +} diff --git a/test/models/domain/haplogroups/HaplogroupProvenanceSpec.scala 
b/test/models/domain/haplogroups/HaplogroupProvenanceSpec.scala new file mode 100644 index 0000000..af88352 --- /dev/null +++ b/test/models/domain/haplogroups/HaplogroupProvenanceSpec.scala @@ -0,0 +1,338 @@ +package models.domain.haplogroups + +import org.scalatest.funspec.AnyFunSpec +import org.scalatest.matchers.must.Matchers +import play.api.libs.json.Json + +import java.time.LocalDateTime + +class HaplogroupProvenanceSpec extends AnyFunSpec with Matchers { + + describe("HaplogroupProvenance") { + + describe("factory methods") { + + it("should create provenance for a new node with source") { + val provenance = HaplogroupProvenance.forNewNode("ytree.net", Seq("L21", "S145")) + + provenance.primaryCredit mustBe "ytree.net" + provenance.nodeProvenance mustBe Set("ytree.net") + provenance.variantProvenance mustBe Map( + "L21" -> Set("ytree.net"), + "S145" -> Set("ytree.net") + ) + provenance.lastMergedFrom mustBe Some("ytree.net") + provenance.lastMergedAt mustBe defined + } + + it("should create provenance for a new node without variants") { + val provenance = HaplogroupProvenance.forNewNode("ISOGG") + + provenance.primaryCredit mustBe "ISOGG" + provenance.nodeProvenance mustBe Set("ISOGG") + provenance.variantProvenance mustBe Map.empty + } + + it("should create empty provenance") { + val provenance = HaplogroupProvenance.empty + + provenance.primaryCredit mustBe "" + provenance.nodeProvenance mustBe Set.empty + provenance.variantProvenance mustBe Map.empty + provenance.lastMergedAt mustBe None + provenance.lastMergedFrom mustBe None + } + } + + describe("addNodeSource") { + + it("should add a new source to nodeProvenance") { + val provenance = HaplogroupProvenance.forNewNode("ISOGG") + val updated = provenance.addNodeSource("ytree.net") + + updated.nodeProvenance must contain allOf ("ISOGG", "ytree.net") + updated.primaryCredit mustBe "ISOGG" // Should not change + } + + it("should not duplicate existing sources") { + val provenance = 
HaplogroupProvenance.forNewNode("ISOGG") + val updated = provenance.addNodeSource("ISOGG") + + updated.nodeProvenance mustBe Set("ISOGG") + } + + it("should accumulate multiple sources") { + val provenance = HaplogroupProvenance.forNewNode("source1") + .addNodeSource("source2") + .addNodeSource("source3") + + provenance.nodeProvenance must have size 3 + provenance.nodeProvenance must contain allOf ("source1", "source2", "source3") + } + } + + describe("addVariantSource") { + + it("should add source attribution for a new variant") { + val provenance = HaplogroupProvenance.forNewNode("ISOGG") + val updated = provenance.addVariantSource("M269", "ytree.net") + + updated.variantProvenance must contain key "M269" + updated.variantProvenance("M269") must contain("ytree.net") + } + + it("should add additional sources to existing variants") { + val provenance = HaplogroupProvenance.forNewNode("ISOGG", Seq("L21")) + val updated = provenance.addVariantSource("L21", "ytree.net") + + updated.variantProvenance("L21") must contain allOf ("ISOGG", "ytree.net") + } + + it("should not duplicate sources for the same variant") { + val provenance = HaplogroupProvenance.forNewNode("ISOGG", Seq("L21")) + val updated = provenance.addVariantSource("L21", "ISOGG") + + updated.variantProvenance("L21") mustBe Set("ISOGG") + } + } + + describe("merge") { + + it("should combine nodeProvenance from both records") { + val prov1 = HaplogroupProvenance( + primaryCredit = "ISOGG", + nodeProvenance = Set("ISOGG", "DecodingUs") + ) + val prov2 = HaplogroupProvenance( + primaryCredit = "ytree.net", + nodeProvenance = Set("ytree.net", "researcher") + ) + + val merged = prov1.merge(prov2) + + merged.nodeProvenance must contain allOf ("ISOGG", "DecodingUs", "ytree.net", "researcher") + } + + it("should preserve primary credit from the first provenance") { + val prov1 = HaplogroupProvenance(primaryCredit = "ISOGG") + val prov2 = HaplogroupProvenance(primaryCredit = "ytree.net") + + val merged = 
prov1.merge(prov2) + + merged.primaryCredit mustBe "ISOGG" + } + + it("should combine variantProvenance") { + val prov1 = HaplogroupProvenance( + primaryCredit = "ISOGG", + variantProvenance = Map("L21" -> Set("ISOGG"), "M269" -> Set("ISOGG")) + ) + val prov2 = HaplogroupProvenance( + primaryCredit = "ytree.net", + variantProvenance = Map("L21" -> Set("ytree.net"), "DF13" -> Set("ytree.net")) + ) + + val merged = prov1.merge(prov2) + + merged.variantProvenance("L21") must contain allOf ("ISOGG", "ytree.net") + merged.variantProvenance("M269") mustBe Set("ISOGG") + merged.variantProvenance("DF13") mustBe Set("ytree.net") + } + + it("should take the most recent lastMergedAt timestamp") { + val earlier = LocalDateTime.now().minusDays(1) + val later = LocalDateTime.now() + + val prov1 = HaplogroupProvenance( + primaryCredit = "A", + lastMergedAt = Some(earlier) + ) + val prov2 = HaplogroupProvenance( + primaryCredit = "B", + lastMergedAt = Some(later) + ) + + val merged = prov1.merge(prov2) + + merged.lastMergedAt mustBe Some(later) + } + + it("should prefer lastMergedFrom from the second provenance") { + val prov1 = HaplogroupProvenance( + primaryCredit = "A", + lastMergedFrom = Some("source1") + ) + val prov2 = HaplogroupProvenance( + primaryCredit = "B", + lastMergedFrom = Some("source2") + ) + + val merged = prov1.merge(prov2) + + merged.lastMergedFrom mustBe Some("source2") + } + + it("should handle merging with empty provenance") { + val prov1 = HaplogroupProvenance.forNewNode("ISOGG", Seq("L21")) + val prov2 = HaplogroupProvenance.empty + + val merged = prov1.merge(prov2) + + merged.primaryCredit mustBe "ISOGG" + merged.nodeProvenance mustBe Set("ISOGG") + merged.variantProvenance mustBe Map("L21" -> Set("ISOGG")) + } + } + + describe("withMergeInfo") { + + it("should update merge timestamp and source") { + val provenance = HaplogroupProvenance.forNewNode("ISOGG") + val now = LocalDateTime.now() + val updated = provenance.withMergeInfo("ytree.net", now) + + 
updated.lastMergedAt mustBe Some(now) + updated.lastMergedFrom mustBe Some("ytree.net") + updated.primaryCredit mustBe "ISOGG" // Should not change + } + + it("should overwrite previous merge info") { + val earlier = LocalDateTime.now().minusHours(1) + val later = LocalDateTime.now() + + val provenance = HaplogroupProvenance.forNewNode("ISOGG") + .withMergeInfo("source1", earlier) + .withMergeInfo("source2", later) + + provenance.lastMergedAt mustBe Some(later) + provenance.lastMergedFrom mustBe Some("source2") + } + } + + describe("shouldPreserveCredit") { + + it("should return true for ISOGG credit") { + HaplogroupProvenance.shouldPreserveCredit("ISOGG") mustBe true + } + + it("should be case-insensitive for ISOGG") { + HaplogroupProvenance.shouldPreserveCredit("isogg") mustBe true + HaplogroupProvenance.shouldPreserveCredit("IsoGG") mustBe true + HaplogroupProvenance.shouldPreserveCredit("Isogg") mustBe true + } + + it("should return false for non-ISOGG sources") { + HaplogroupProvenance.shouldPreserveCredit("ytree.net") mustBe false + HaplogroupProvenance.shouldPreserveCredit("DecodingUs") mustBe false + HaplogroupProvenance.shouldPreserveCredit("researcher") mustBe false + } + + it("should return false for empty string") { + HaplogroupProvenance.shouldPreserveCredit("") mustBe false + } + } + + describe("JSON serialization") { + + it("should serialize to JSON correctly") { + val provenance = HaplogroupProvenance( + primaryCredit = "ISOGG", + nodeProvenance = Set("ISOGG", "ytree.net"), + variantProvenance = Map("L21" -> Set("ISOGG", "ytree.net")), + lastMergedAt = Some(LocalDateTime.of(2025, 12, 12, 10, 30, 0)), + lastMergedFrom = Some("ytree.net") + ) + + val json = Json.toJson(provenance) + + (json \ "primaryCredit").as[String] mustBe "ISOGG" + (json \ "nodeProvenance").as[Set[String]] must contain allOf ("ISOGG", "ytree.net") + (json \ "lastMergedFrom").as[String] mustBe "ytree.net" + } + + it("should deserialize from JSON correctly") { + val jsonString = 
"""{ + "primaryCredit": "ISOGG", + "nodeProvenance": ["ISOGG", "ytree.net"], + "variantProvenance": {"L21": ["ISOGG", "ytree.net"]}, + "lastMergedFrom": "ytree.net" + }""" + + val provenance = Json.parse(jsonString).as[HaplogroupProvenance] + + provenance.primaryCredit mustBe "ISOGG" + provenance.nodeProvenance must contain allOf ("ISOGG", "ytree.net") + provenance.variantProvenance("L21") must contain allOf ("ISOGG", "ytree.net") + provenance.lastMergedFrom mustBe Some("ytree.net") + } + + it("should round-trip serialize and deserialize") { + val original = HaplogroupProvenance.forNewNode("test-source", Seq("V1", "V2")) + + val json = Json.toJson(original) + val restored = json.as[HaplogroupProvenance] + + restored.primaryCredit mustBe original.primaryCredit + restored.nodeProvenance mustBe original.nodeProvenance + restored.variantProvenance mustBe original.variantProvenance + restored.lastMergedFrom mustBe original.lastMergedFrom + } + + it("should handle empty collections in JSON") { + val jsonString = """{ + "primaryCredit": "test", + "nodeProvenance": [], + "variantProvenance": {} + }""" + + val provenance = Json.parse(jsonString).as[HaplogroupProvenance] + + provenance.nodeProvenance mustBe Set.empty + provenance.variantProvenance mustBe Map.empty + } + + it("should handle missing optional fields") { + val jsonString = """{ + "primaryCredit": "test" + }""" + + val provenance = Json.parse(jsonString).as[HaplogroupProvenance] + + provenance.primaryCredit mustBe "test" + provenance.nodeProvenance mustBe Set.empty + provenance.variantProvenance mustBe Map.empty + provenance.lastMergedAt mustBe None + provenance.lastMergedFrom mustBe None + } + } + + describe("immutability") { + + it("should not mutate original when adding node source") { + val original = HaplogroupProvenance.forNewNode("ISOGG") + val modified = original.addNodeSource("ytree.net") + + original.nodeProvenance must not contain "ytree.net" + modified.nodeProvenance must contain("ytree.net") + } + + 
      it("should not mutate original when adding variant source") {
        val original = HaplogroupProvenance.forNewNode("ISOGG")
        val modified = original.addVariantSource("L21", "ytree.net")

        // Immutability check: the source instance must be untouched.
        original.variantProvenance must not contain key ("L21")
        modified.variantProvenance must contain key "L21"
      }

      it("should not mutate original when merging") {
        val prov1 = HaplogroupProvenance.forNewNode("A")
        val prov2 = HaplogroupProvenance.forNewNode("B")
        val merged = prov1.merge(prov2)

        // Both operands keep their original provenance; only the result is combined.
        prov1.nodeProvenance mustBe Set("A")
        prov2.nodeProvenance mustBe Set("B")
        merged.nodeProvenance must contain allOf ("A", "B")
      }
    }
  }
}
diff --git a/test/services/HaplogroupTreeMergeServiceSpec.scala b/test/services/HaplogroupTreeMergeServiceSpec.scala
new file mode 100644
index 0000000..75291b9
--- /dev/null
+++ b/test/services/HaplogroupTreeMergeServiceSpec.scala
@@ -0,0 +1,709 @@
package services

import models.HaplogroupType
import models.api.haplogroups.*
import models.dal.domain.genomics.Variant
import models.domain.haplogroups.{Haplogroup, HaplogroupProvenance}
import org.mockito.ArgumentMatchers.{any, anyInt, anyString}
// NOTE(review): `reset` and `Variant` appear unused in this file — confirm before removing.
import org.mockito.Mockito.{never, reset, verify, when}
import org.scalatest.BeforeAndAfterEach
import org.scalatest.concurrent.ScalaFutures
import org.scalatest.time.{Millis, Seconds, Span}
import org.scalatestplus.mockito.MockitoSugar
import org.scalatestplus.play.PlaySpec
import repositories.{HaplogroupCoreRepository, HaplogroupVariantRepository, VariantAliasRepository, VariantRepository}

import java.time.LocalDateTime
import scala.concurrent.{ExecutionContext, Future}

/**
 * Unit tests for [[HaplogroupTreeMergeService]].
 *
 * Covers: merge previews, variant-based (vs. name-based) node matching, provenance
 * credit assignment, source priority configuration, subtree merges under an anchor,
 * dry-run safety, recursive tree traversal, MT haplogroup handling, conflict
 * strategies, edge cases, and merge statistics accuracy.
 *
 * All repository dependencies are Mockito mocks, re-created before each test.
 */
class HaplogroupTreeMergeServiceSpec extends PlaySpec with MockitoSugar with ScalaFutures with BeforeAndAfterEach {

  implicit val ec: ExecutionContext = ExecutionContext.global
  // Futures from the service are awaited via `whenReady` with this patience window.
  implicit val patience: PatienceConfig = PatienceConfig(timeout = Span(5, Seconds), interval = Span(100, Millis))

  // Mocks — assigned in beforeEach so each test starts from clean stubs.
  // NOTE(review): `var x: T = _` is deprecated under Scala 3 (this file uses
  // Scala 3 `import …*` syntax); consider `scala.compiletime.uninitialized` — confirm Scala version.
  var mockHaplogroupRepo: HaplogroupCoreRepository = _
  var mockVariantRepo: HaplogroupVariantRepository = _
  var mockVariantRepository: VariantRepository = _
  var mockVariantAliasRepository: VariantAliasRepository = _
  var service: HaplogroupTreeMergeService = _

  // Test fixtures
  val now: LocalDateTime = LocalDateTime.now()

  /** Builds a persisted-looking Haplogroup fixture with sensible defaults. */
  def createHaplogroup(
    id: Int,
    name: String,
    haplogroupType: HaplogroupType = HaplogroupType.Y,
    source: String = "ISOGG",
    provenance: Option[HaplogroupProvenance] = None
  ): Haplogroup = Haplogroup(
    id = Some(id),
    name = name,
    lineage = None,
    description = None,
    haplogroupType = haplogroupType,
    revisionId = 1,
    source = source,
    confidenceLevel = "high",
    validFrom = now.minusDays(30),
    validUntil = None,
    provenance = provenance
  )

  /** Builds an incoming phylo-tree node; variant names are wrapped as VariantInput. */
  def createPhyloNode(
    name: String,
    variants: List[String] = List.empty,
    children: List[PhyloNodeInput] = List.empty,
    formedYbp: Option[Int] = None
  ): PhyloNodeInput = PhyloNodeInput(
    name = name,
    variants = variants.map(v => VariantInput(v)), // Convert strings to VariantInput
    children = children,
    formedYbp = formedYbp
  )

  override def beforeEach(): Unit = {
    mockHaplogroupRepo = mock[HaplogroupCoreRepository]
    mockVariantRepo = mock[HaplogroupVariantRepository]
    mockVariantRepository = mock[VariantRepository]
    mockVariantAliasRepository = mock[VariantAliasRepository]
    service = new HaplogroupTreeMergeService(
      mockHaplogroupRepo,
      mockVariantRepo,
      mockVariantRepository,
      mockVariantAliasRepository
    )
  }

  "HaplogroupTreeMergeService" should {

    // =========================================================================
    // Preview Tests
    // =========================================================================

    "preview a simple tree merge with no existing haplogroups" in {
      // Setup: Empty existing tree
      when(mockHaplogroupRepo.getAllWithVariantNames(HaplogroupType.Y))
        .thenReturn(Future.successful(Seq.empty))

      val sourceTree = createPhyloNode(
        name = "R1b-L21",
        variants = List("L21", "S145"),
        children = List(
          createPhyloNode("R1b-DF13", variants = List("DF13"))
        )
      )

      val request = MergePreviewRequest(
        haplogroupType = HaplogroupType.Y,
        sourceTree = sourceTree,
        sourceName = "ytree.net"
      )

      whenReady(service.previewMerge(request)) { result =>
        result.statistics.nodesProcessed mustBe 2
        result.statistics.nodesCreated mustBe 2
        result.statistics.nodesUnchanged mustBe 0
        result.newNodes must contain allOf ("R1b-L21", "R1b-DF13")
        result.conflicts mustBe empty
      }
    }

    "preview identifies existing nodes for update" in {
      // Setup: Existing tree with R1b-L21
      val existingHaplogroup = createHaplogroup(1, "R1b-L21", source = "DecodingUs")
      when(mockHaplogroupRepo.getAllWithVariantNames(HaplogroupType.Y))
        .thenReturn(Future.successful(Seq(
          (existingHaplogroup, Seq("L21", "S145"))
        )))

      val sourceTree = createPhyloNode(
        name = "R1b-L21",
        variants = List("L21", "S145"),
        children = List(
          createPhyloNode("R1b-DF13", variants = List("DF13"))
        )
      )

      val request = MergePreviewRequest(
        haplogroupType = HaplogroupType.Y,
        sourceTree = sourceTree,
        sourceName = "ytree.net",
        priorityConfig = Some(SourcePriorityConfig(List("ytree.net", "DecodingUs")))
      )

      whenReady(service.previewMerge(request)) { result =>
        result.statistics.nodesProcessed mustBe 2
        result.statistics.nodesCreated mustBe 1 // DF13 is new
        result.newNodes must contain("R1b-DF13")
        // R1b-L21 exists but ytree.net has higher priority, so it might be marked for update
        // depending on whether there are differences
      }
    }

    "preview detects age estimate conflicts" in {
      // Setup: Existing tree with different age estimate
      val existingHaplogroup = createHaplogroup(1, "R1b-L21").copy(formedYbp = Some(4500))
      when(mockHaplogroupRepo.getAllWithVariantNames(HaplogroupType.Y))
        .thenReturn(Future.successful(Seq(
          (existingHaplogroup, Seq("L21"))
        )))

      val sourceTree = createPhyloNode(
        name = "R1b-L21",
        variants = List("L21"),
        formedYbp = Some(4800) // Different from existing
      )

      val request = MergePreviewRequest(
        haplogroupType = HaplogroupType.Y,
        sourceTree = sourceTree,
        sourceName = "ytree.net",
        priorityConfig = Some(SourcePriorityConfig(List("ytree.net", "ISOGG")))
      )

      whenReady(service.previewMerge(request)) { result =>
        result.conflicts.size mustBe 1
        result.conflicts.head.field mustBe "formedYbp"
        result.conflicts.head.existingValue mustBe "4500"
        result.conflicts.head.newValue mustBe "4800"
      }
    }

    // =========================================================================
    // Variant-Based Matching Tests
    // =========================================================================

    "match nodes by variants, not names" in {
      // Setup: Existing "R-L21" should match incoming "R1b-L21" by variant
      val existingHaplogroup = createHaplogroup(1, "R-L21") // Different name
      when(mockHaplogroupRepo.getAllWithVariantNames(HaplogroupType.Y))
        .thenReturn(Future.successful(Seq(
          (existingHaplogroup, Seq("L21")) // Same variant
        )))

      val sourceTree = createPhyloNode(
        name = "R1b-L21", // Different name but same variant
        variants = List("L21")
      )

      val request = MergePreviewRequest(
        haplogroupType = HaplogroupType.Y,
        sourceTree = sourceTree,
        sourceName = "ytree.net"
      )

      whenReady(service.previewMerge(request)) { result =>
        // Should recognize as existing node (unchanged), not new
        result.statistics.nodesCreated mustBe 0
        result.unchangedNodes must contain("R-L21")
      }
    }

    "fall back to name matching when no variant match found" in {
      // Setup: Existing node with same name but no variants
      val existingHaplogroup = createHaplogroup(1, "R1b-L21")
      when(mockHaplogroupRepo.getAllWithVariantNames(HaplogroupType.Y))
        .thenReturn(Future.successful(Seq(
          (existingHaplogroup, Seq.empty) // No variants
        )))

      val sourceTree = createPhyloNode(
        name = "R1b-L21",
        variants = List("L21", "S145") // Has variants but no match in DB
      )

      val request = MergePreviewRequest(
        haplogroupType = HaplogroupType.Y,
        sourceTree = sourceTree,
        sourceName = "ytree.net"
      )

      whenReady(service.previewMerge(request)) { result =>
        // Should match by name
        result.statistics.nodesCreated mustBe 0
        result.unchangedNodes must contain("R1b-L21")
      }
    }

    // =========================================================================
    // Credit Assignment Tests
    // =========================================================================

    "preserve ISOGG credit on existing nodes" in {
      // Setup: Existing node with ISOGG provenance
      val isoggProvenance = HaplogroupProvenance(
        primaryCredit = "ISOGG",
        nodeProvenance = Set("ISOGG")
      )
      val existingHaplogroup = createHaplogroup(1, "R1b-L21", provenance = Some(isoggProvenance))

      when(mockHaplogroupRepo.getAllWithVariantNames(HaplogroupType.Y))
        .thenReturn(Future.successful(Seq(
          (existingHaplogroup, Seq("L21"))
        )))
      when(mockHaplogroupRepo.updateProvenance(anyInt(), any[HaplogroupProvenance]))
        .thenReturn(Future.successful(true))

      val sourceTree = createPhyloNode(
        name = "R1b-L21",
        variants = List("L21")
      )

      val request = TreeMergeRequest(
        haplogroupType = HaplogroupType.Y,
        sourceTree = sourceTree,
        sourceName = "ytree.net",
        dryRun = true // Use dry run for this test
      )

      whenReady(service.mergeFullTree(request)) { result =>
        result.success mustBe true
        // ISOGG credit should be preserved (verified via mock)
        // NOTE(review): no explicit verify(...) here — the assertion above only checks
        // success; consider verifying updateProvenance arguments to pin the credit behavior.
      }
    }

    "assign incoming source credit for new nodes" in {
      when(mockHaplogroupRepo.getAllWithVariantNames(HaplogroupType.Y))
        .thenReturn(Future.successful(Seq.empty))

      val sourceTree = createPhyloNode(
        name = "R1b-NEW",
        variants = List("NEW123")
      )

      val request = MergePreviewRequest(
        haplogroupType = HaplogroupType.Y,
        sourceTree = sourceTree,
        sourceName = "ytree.net"
      )

      whenReady(service.previewMerge(request)) { result =>
        result.newNodes must contain("R1b-NEW")
        // New nodes get incoming source credit (ytree.net)
      }
    }

    // =========================================================================
    // Priority Configuration Tests
    // =========================================================================

    "respect source priority for conflict resolution" in {
      val existingHaplogroup = createHaplogroup(1, "R1b-L21", source = "DecodingUs")
        .copy(formedYbp = Some(4500))

      when(mockHaplogroupRepo.getAllWithVariantNames(HaplogroupType.Y))
        .thenReturn(Future.successful(Seq(
          (existingHaplogroup, Seq("L21"))
        )))

      val sourceTree = createPhyloNode(
        name = "R1b-L21",
        variants = List("L21"),
        formedYbp = Some(4800)
      )

      // Higher priority = lower index. ytree.net at index 0 beats DecodingUs at index 1
      val request = MergePreviewRequest(
        haplogroupType = HaplogroupType.Y,
        sourceTree = sourceTree,
        sourceName = "ytree.net",
        priorityConfig = Some(SourcePriorityConfig(List("ytree.net", "DecodingUs")))
      )

      whenReady(service.previewMerge(request)) { result =>
        result.conflicts.head.resolution mustBe "will_update"
      }
    }

    "keep existing values when existing source has higher priority" in {
      val existingProvenance = HaplogroupProvenance(primaryCredit = "ISOGG", nodeProvenance = Set("ISOGG"))
      val existingHaplogroup = createHaplogroup(1, "R1b-L21", provenance = Some(existingProvenance))
        .copy(formedYbp = Some(4500))

      when(mockHaplogroupRepo.getAllWithVariantNames(HaplogroupType.Y))
        .thenReturn(Future.successful(Seq(
          (existingHaplogroup, Seq("L21"))
        )))

      val sourceTree = createPhyloNode(
        name = "R1b-L21",
        variants = List("L21"),
        formedYbp = Some(4800)
      )

      // ISOGG at index 0 beats ytree.net at index 1
      val request = MergePreviewRequest(
        haplogroupType = HaplogroupType.Y,
        sourceTree = sourceTree,
        sourceName = "ytree.net",
        priorityConfig = Some(SourcePriorityConfig(List("ISOGG", "ytree.net")))
      )

      whenReady(service.previewMerge(request)) { result =>
        result.conflicts.head.resolution mustBe "will_keep_existing"
      }
    }

    // =========================================================================
    // Subtree Merge Tests
    // =========================================================================

    "merge subtree under specified anchor" in {
      val anchorHaplogroup = createHaplogroup(100, "R1b")

      when(mockHaplogroupRepo.getHaplogroupByName("R1b", HaplogroupType.Y))
        .thenReturn(Future.successful(Some(anchorHaplogroup)))
      when(mockHaplogroupRepo.getAllWithVariantNames(HaplogroupType.Y))
        .thenReturn(Future.successful(Seq(
          (anchorHaplogroup, Seq("M269"))
        )))
      when(mockHaplogroupRepo.createWithParent(any[Haplogroup], any[Option[Int]], anyString()))
        .thenReturn(Future.successful(101))
      when(mockHaplogroupRepo.updateProvenance(anyInt(), any[HaplogroupProvenance]))
        .thenReturn(Future.successful(true))
      when(mockVariantRepository.searchByName(anyString()))
        .thenReturn(Future.successful(Seq.empty))

      val sourceTree = createPhyloNode(
        name = "R1b-L21",
        variants = List("L21")
      )

      val request = SubtreeMergeRequest(
        haplogroupType = HaplogroupType.Y,
        anchorHaplogroupName = "R1b",
        sourceTree = sourceTree,
        sourceName = "ytree.net"
      )

      whenReady(service.mergeSubtree(request)) { result =>
        result.success mustBe true
        result.statistics.nodesCreated mustBe 1
        verify(mockHaplogroupRepo).createWithParent(any[Haplogroup], any[Option[Int]], anyString())
      }
    }

    "fail subtree merge when anchor not found" in {
      when(mockHaplogroupRepo.getHaplogroupByName("NONEXISTENT", HaplogroupType.Y))
        .thenReturn(Future.successful(None))

      val sourceTree = createPhyloNode(name = "Test")

      val request = SubtreeMergeRequest(
        haplogroupType = HaplogroupType.Y,
        anchorHaplogroupName = "NONEXISTENT",
        sourceTree = sourceTree,
        sourceName = "ytree.net"
      )

      // Missing anchor surfaces as a failed Future, not an error response.
      whenReady(service.mergeSubtree(request).failed) { ex =>
        ex mustBe a[IllegalArgumentException]
        ex.getMessage must include("not found")
      }
    }

    // =========================================================================
    // Dry Run Tests
    // =========================================================================

    "not modify database on dry run" in {
      when(mockHaplogroupRepo.getAllWithVariantNames(HaplogroupType.Y))
        .thenReturn(Future.successful(Seq.empty))

      val sourceTree = createPhyloNode(
        name = "R1b-NEW",
        variants = List("NEW123")
      )

      val request = TreeMergeRequest(
        haplogroupType = HaplogroupType.Y,
        sourceTree = sourceTree,
        sourceName = "ytree.net",
        dryRun = true
      )

      whenReady(service.mergeFullTree(request)) { result =>
        result.success mustBe true
        // Verify no write operations were called
        verify(mockHaplogroupRepo, never()).createWithParent(any[Haplogroup], any[Option[Int]], anyString())
        verify(mockHaplogroupRepo, never()).update(any[Haplogroup])
        verify(mockHaplogroupRepo, never()).updateProvenance(anyInt(), any[HaplogroupProvenance])
      }
    }

    // =========================================================================
    // Recursive Tree Processing Tests
    // =========================================================================

    "process deeply nested tree structures" in {
      when(mockHaplogroupRepo.getAllWithVariantNames(HaplogroupType.Y))
        .thenReturn(Future.successful(Seq.empty))

      // Create a 4-level deep tree
      val deepTree = createPhyloNode(
        name = "Level1",
        variants = List("V1"),
        children = List(
          createPhyloNode(
            name = "Level2",
            variants = List("V2"),
            children = List(
              createPhyloNode(
                name = "Level3",
                variants = List("V3"),
                children = List(
                  createPhyloNode("Level4", variants = List("V4"))
                )
              )
            )
          )
        )
      )

      val request = MergePreviewRequest(
        haplogroupType = HaplogroupType.Y,
        sourceTree = deepTree,
        sourceName = "test"
      )

      whenReady(service.previewMerge(request)) { result =>
        result.statistics.nodesProcessed mustBe 4
        result.statistics.nodesCreated mustBe 4
        result.newNodes must have size 4
      }
    }

    "process tree with multiple children at each level" in {
      when(mockHaplogroupRepo.getAllWithVariantNames(HaplogroupType.Y))
        .thenReturn(Future.successful(Seq.empty))

      val wideTree = createPhyloNode(
        name = "Parent",
        variants = List("P1"),
        children = List(
          createPhyloNode("Child1", variants = List("C1")),
          createPhyloNode("Child2", variants = List("C2")),
          createPhyloNode("Child3", variants = List("C3"))
        )
      )

      val request = MergePreviewRequest(
        haplogroupType = HaplogroupType.Y,
        sourceTree = wideTree,
        sourceName = "test"
      )

      whenReady(service.previewMerge(request)) { result =>
        result.statistics.nodesProcessed mustBe 4
        result.statistics.nodesCreated mustBe 4
      }
    }

    // =========================================================================
    // MT DNA Tests
    // =========================================================================

    "handle MT DNA haplogroup type" in {
      when(mockHaplogroupRepo.getAllWithVariantNames(HaplogroupType.MT))
        .thenReturn(Future.successful(Seq.empty))

      val sourceTree = createPhyloNode(
        name = "H1",
        variants = List("H1-defining")
      )

      val request = MergePreviewRequest(
        haplogroupType = HaplogroupType.MT,
        sourceTree = sourceTree,
        sourceName = "mtDNA-tree"
      )

      whenReady(service.previewMerge(request)) { result =>
        result.statistics.nodesCreated mustBe 1
        verify(mockHaplogroupRepo).getAllWithVariantNames(HaplogroupType.MT)
      }
    }

    // =========================================================================
    // Conflict Strategy Tests
    // =========================================================================

    "apply KeepExisting conflict strategy" in {
      val existingHaplogroup = createHaplogroup(1, "R1b-L21")
        .copy(formedYbp = Some(4500))

      when(mockHaplogroupRepo.getAllWithVariantNames(HaplogroupType.Y))
        .thenReturn(Future.successful(Seq(
          (existingHaplogroup, Seq("L21"))
        )))
      when(mockHaplogroupRepo.updateProvenance(anyInt(), any[HaplogroupProvenance]))
        .thenReturn(Future.successful(true))

      val sourceTree = createPhyloNode(
        name = "R1b-L21",
        variants = List("L21"),
        formedYbp = Some(4800)
      )

      val request = TreeMergeRequest(
        haplogroupType = HaplogroupType.Y,
        sourceTree = sourceTree,
        sourceName = "ytree.net",
        conflictStrategy = Some(ConflictStrategy.KeepExisting),
        dryRun = true
      )

      whenReady(service.mergeFullTree(request)) { result =>
        result.success mustBe true
        // With KeepExisting, should not update even with conflicts
        result.statistics.nodesUpdated mustBe 0
      }
    }

    "apply AlwaysUpdate conflict strategy" in {
      // NOTE(review): despite its name, this test never passes
      // ConflictStrategy.AlwaysUpdate to the service — it only previews with the
      // default strategy. Consider driving mergeFullTree with a TreeMergeRequest
      // carrying conflictStrategy = Some(ConflictStrategy.AlwaysUpdate), or rename.
      val existingHaplogroup = createHaplogroup(1, "R1b-L21", source = "low-priority")
        .copy(formedYbp = Some(4500))

      when(mockHaplogroupRepo.getAllWithVariantNames(HaplogroupType.Y))
        .thenReturn(Future.successful(Seq(
          (existingHaplogroup, Seq("L21"))
        )))

      val sourceTree = createPhyloNode(
        name = "R1b-L21",
        variants = List("L21"),
        formedYbp = Some(4800)
      )

      // With AlwaysUpdate, should update regardless of priority
      val request = MergePreviewRequest(
        haplogroupType = HaplogroupType.Y,
        sourceTree = sourceTree,
        sourceName = "ytree.net",
        priorityConfig = Some(SourcePriorityConfig(List("low-priority", "ytree.net"))) // ytree.net is lower priority
      )

      whenReady(service.previewMerge(request)) { result =>
        // Preview shows conflict would be kept (default strategy)
        result.conflicts.nonEmpty mustBe true
      }
    }

    // =========================================================================
    // Edge Cases
    // =========================================================================

    "handle empty source tree gracefully" in {
      when(mockHaplogroupRepo.getAllWithVariantNames(HaplogroupType.Y))
        .thenReturn(Future.successful(Seq.empty))

      val emptyTree = createPhyloNode(name = "SingleNode")

      val request = MergePreviewRequest(
        haplogroupType = HaplogroupType.Y,
        sourceTree = emptyTree,
        sourceName = "test"
      )

      whenReady(service.previewMerge(request)) { result =>
        result.statistics.nodesProcessed mustBe 1
      }
    }

    "handle nodes with no variants" in {
      when(mockHaplogroupRepo.getAllWithVariantNames(HaplogroupType.Y))
        .thenReturn(Future.successful(Seq.empty))

      val noVariantsTree = createPhyloNode(
        name = "NoVariants",
        variants = List.empty
      )

      val request = MergePreviewRequest(
        haplogroupType = HaplogroupType.Y,
        sourceTree = noVariantsTree,
        sourceName = "test"
      )

      whenReady(service.previewMerge(request)) { result =>
        result.statistics.nodesCreated mustBe 1
      }
    }

    "handle case-insensitive variant matching" in {
      val existingHaplogroup = createHaplogroup(1, "R1b-L21")
      when(mockHaplogroupRepo.getAllWithVariantNames(HaplogroupType.Y))
        .thenReturn(Future.successful(Seq(
          (existingHaplogroup, Seq("l21")) // lowercase
        )))

      val sourceTree = createPhyloNode(
        name = "R1b-L21",
        variants = List("L21") // uppercase
      )

      val request = MergePreviewRequest(
        haplogroupType = HaplogroupType.Y,
        sourceTree = sourceTree,
        sourceName = "test"
      )

      whenReady(service.previewMerge(request)) { result =>
        // Should match despite case difference
        result.statistics.nodesCreated mustBe 0
        result.unchangedNodes must contain("R1b-L21")
      }
    }

    // =========================================================================
    // Statistics Accuracy Tests
    // =========================================================================

    "accurately count variant additions for new nodes" in {
      when(mockHaplogroupRepo.getAllWithVariantNames(HaplogroupType.Y))
        .thenReturn(Future.successful(Seq.empty))

      val sourceTree = createPhyloNode(
        name = "Test",
        variants = List("V1", "V2", "V3") // 3 variants
      )

      val request = MergePreviewRequest(
        haplogroupType = HaplogroupType.Y,
        sourceTree = sourceTree,
        sourceName = "test"
      )

      whenReady(service.previewMerge(request)) { result =>
        result.statistics.variantsAdded mustBe 3
      }
    }

    "count relationship creations correctly" in {
      when(mockHaplogroupRepo.getAllWithVariantNames(HaplogroupType.Y))
        .thenReturn(Future.successful(Seq.empty))

      val treeWithChildren = createPhyloNode(
        name = "Parent",
        children = List(
          createPhyloNode("Child1"),
          createPhyloNode("Child2")
        )
      )

      val request = MergePreviewRequest(
        haplogroupType = HaplogroupType.Y,
        sourceTree = treeWithChildren,
        sourceName = "test"
      )

      whenReady(service.previewMerge(request)) { result =>
        // Parent has 1 relationship (to anchor or none)
        // Child1 and Child2 each have 1 relationship to Parent
        result.statistics.relationshipsCreated mustBe 3
      }
    }
  }
}