From 756244098d82e43d60a5fac47a01c8de06ef8451 Mon Sep 17 00:00:00 2001 From: John Muirhead-Gould Date: Mon, 29 Jun 2026 23:39:52 -0400 Subject: [PATCH] fix: locateVariants/predictCoding no longer silently drop large INDELs (#81) Large INDELs (e.g. 2542bp deletions) that span multiple exons were silently dropped by locateVariants() and predictCoding() because the internal call to mapToTranscripts() uses type='within', which requires the variant to fit entirely within a single exon/CDS element. Changes: - locateVariants (.makeResult): after mapToTranscripts, detect variants that overlap the transcript (type='any') but were not mapped. These are now included in results with LOCATION='coding' and LOCSTART/LOCEND set to NA. A warning is emitted identifying the rescued variants. - predictCoding (.localCoordinates): emit a warning when variants overlap CDS regions but cannot be mapped to transcript coordinates, directing users to locateVariants() for identification. This is the minimal fix that prevents silent data loss. Full amino acid consequence prediction for multi-exon spanning variants would require changes to GenomicFeatures::mapToTranscripts itself. Fixes #81 --- R/AllUtilities.R | 31 ++++++++ R/methods-locateVariants.R | 113 ++++++++++++++++++++++++----- inst/unitTests/test_large_indels.R | 103 ++++++++++++++++++++++++++ 3 files changed, 230 insertions(+), 17 deletions(-) create mode 100644 inst/unitTests/test_large_indels.R diff --git a/R/AllUtilities.R b/R/AllUtilities.R index e0478c2..a86aa99 100644 --- a/R/AllUtilities.R +++ b/R/AllUtilities.R @@ -281,6 +281,19 @@ ## 'to' is a GRangesList of cds by transcript map <- mapToTranscripts(unname(from), to, ignore.strand=ignore.strand, ...) if (length(map) == 0) { + ## Check if any variants overlap coding regions but couldn't be mapped + ## (large INDELs spanning multiple exons — issue #81) + fo <- findOverlaps(from, to, type="any", + ignore.strand=ignore.strand) + if (length(fo) > 0L) { + n_dropped <- length(unique(queryHits(fo))) + warning(n_dropped, + " variant(s) overlap coding regions but span multiple ", + "exons and could not be mapped to transcript coordinates. ", + "These are not included in predictCoding() results. ", + "Consider using locateVariants() to identify them.", + call.=FALSE) + } res <- GRanges() mcols(res) <- DataFrame(REF=DNAStringSet(), ALT=DNAStringSetList(), varAllele=DNAStringSet(), CDSLOC=IRanges(), @@ -291,6 +304,24 @@ xHits <- map$xHits txHits <- map$transcriptsHits + + ## Warn about variants that overlap CDS but weren't mapped (issue #81) + all_idx <- seq_along(from) + dropped <- setdiff(all_idx, unique(xHits)) + if (length(dropped) > 0L) { + fo <- findOverlaps(from[dropped], to, type="any", + ignore.strand=ignore.strand) + if (length(fo) > 0L) { + n_dropped <- length(unique(queryHits(fo))) + warning(n_dropped, + " variant(s) overlap coding regions but span multiple ", + "exons and could not be mapped to transcript coordinates. ", + "These are not included in predictCoding() results. ", + "Consider using locateVariants() to identify them.", + call.=FALSE) + } + } + flat_to <- unlist(to) ## names needed for mapping ## FIXME: cdsid is expensive diff --git a/R/methods-locateVariants.R b/R/methods-locateVariants.R index b85e287..aed781b 100644 --- a/R/methods-locateVariants.R +++ b/R/methods-locateVariants.R @@ -545,14 +545,50 @@ setMethod("locateVariants", c("GRanges", "TxDb", "AllVariants"), map <- mapToTranscripts(unname(query), subject, ignore.strand=ignore.strand) - if (length(map) > 0) { - xHits <- map$xHits - txHits <- map$transcriptsHits - tx <- names(subject)[txHits] - if (!is.null(tx)) - txid <- tx - else - txid <- NA_integer_ + + ## Rescue variants that overlap the transcript/CDS but were dropped + ## by mapToTranscripts due to the 'within' overlap requirement. + ## This handles large INDELs spanning multiple exons (issue #81). + mapped_idx <- integer(0) + if (length(map) > 0) + mapped_idx <- unique(map$xHits) + dropped <- setdiff(seq_along(query), mapped_idx) + rescued <- integer(0) + rescued_tx <- integer(0) + if (length(dropped) > 0L) { + ## Use type="any" to find variants that overlap the subject features + fo <- findOverlaps(query[dropped], subject, type="any", + ignore.strand=ignore.strand) + if (length(fo) > 0L) { + rescued <- dropped[queryHits(fo)] + rescued_tx <- subjectHits(fo) + ## De-duplicate: keep first transcript hit per query + dup <- duplicated(rescued) + rescued <- rescued[!dup] + rescued_tx <- rescued_tx[!dup] + warning(length(rescued), + " variant(s) span multiple exons/CDS and could not be ", + "mapped to transcript coordinates. These are included in ", + "results but LOCSTART/LOCEND are set to NA.", + call.=FALSE) + } + } + + if (length(map) > 0 || length(rescued) > 0L) { + ## --- Standard mapped results --- + if (length(map) > 0) { + xHits <- map$xHits + txHits <- map$transcriptsHits + tx <- names(subject)[txHits] + if (!is.null(tx)) + txid <- tx + else + txid <- NA_integer_ + } else { + xHits <- integer(0) + txHits <- integer(0) + txid <- character(0) + } ## FIXME: cdsid is expensive cdsid <- IntegerList(integer(0)) ## CodingVariants() must fall within a coding region. @@ -563,7 +599,7 @@ setMethod("locateVariants", c("GRanges", "TxDb", "AllVariants"), ## 'map' is a GRangesList and in 'map2' it's unlisted.) ## Only ranges identified by 'map' and 'map2' are kept. ## Ranges identified by 'map' only are discarded. - if (vtype == "coding") { + if (vtype == "coding" && length(map) > 0) { usub <- unlist(subject) ## names needed for mapping map2 <- mapToTranscripts(unname(query)[xHits], usub, ignore.strand=ignore.strand) @@ -581,14 +617,17 @@ setMethod("locateVariants", c("GRanges", "TxDb", "AllVariants"), } } - ss <- runValue(strand(subject)[txHits]) - if (any(elementNROWS(ss) > 1L)) { - warning("'subject' has multiple strands per list element; ", - "setting strand to '*'") - sstrand <- Rle("*", length(txHits)) - } - sstrand <- unlist(ss, use.names=FALSE) - GRanges(seqnames=seqnames(query)[xHits], + ## Build result for standard mapped variants + if (length(xHits) > 0L) { + ss <- runValue(strand(subject)[txHits]) + if (any(elementNROWS(ss) > 1L)) { + warning("'subject' has multiple strands per list element; ", + "setting strand to '*'") + sstrand <- Rle("*", length(txHits)) + } + sstrand <- unlist(ss, use.names=FALSE) + mapped_result <- GRanges( + seqnames=seqnames(query)[xHits], ranges=IRanges(ranges(query)[xHits]), strand=sstrand, LOCATION=.location(length(xHits), vtype), @@ -600,6 +639,46 @@ setMethod("locateVariants", c("GRanges", "TxDb", "AllVariants"), GENEID=NA_character_, PRECEDEID=CharacterList(character(0)), FOLLOWID=CharacterList(character(0))) + } else { + mapped_result <- GRanges() + mcols(mapped_result) <- DataFrame( + LOCATION=.location(), LOCSTART=integer(), LOCEND=integer(), + QUERYID=integer(), TXID=integer(), CDSID=IntegerList(), + GENEID=character(), PRECEDEID=CharacterList(), + FOLLOWID=CharacterList()) + } + + ## Build result for rescued multi-exon-spanning variants + if (length(rescued) > 0L) { + rtx <- names(subject)[rescued_tx] + if (is.null(rtx)) + rtx <- rep(NA_integer_, length(rescued)) + ## Use strand from the subject transcript + rss <- runValue(strand(subject)[rescued_tx]) + rsstrand <- unlist(rss, use.names=FALSE) + rescued_result <- GRanges( + seqnames=seqnames(query)[rescued], + ranges=IRanges(ranges(query)[rescued]), + strand=rsstrand, + LOCATION=.location(length(rescued), vtype), + LOCSTART=rep(NA_integer_, length(rescued)), + LOCEND=rep(NA_integer_, length(rescued)), + QUERYID=rescued, + TXID=rtx, + CDSID=IntegerList(rep(list(integer(0)), length(rescued))), + GENEID=NA_character_, + PRECEDEID=CharacterList(rep(list(character(0)), length(rescued))), + FOLLOWID=CharacterList(rep(list(character(0)), length(rescued)))) + } else { + rescued_result <- GRanges() + mcols(rescued_result) <- DataFrame( + LOCATION=.location(), LOCSTART=integer(), LOCEND=integer(), + QUERYID=integer(), TXID=integer(), CDSID=IntegerList(), + GENEID=character(), PRECEDEID=CharacterList(), + FOLLOWID=CharacterList()) + } + + c(mapped_result, rescued_result) } else { res <- GRanges() mcols(res) <- DataFrame(LOCATION=.location(), diff --git a/inst/unitTests/test_large_indels.R b/inst/unitTests/test_large_indels.R new file mode 100644 index 0000000..5736f54 --- /dev/null +++ b/inst/unitTests/test_large_indels.R @@ -0,0 +1,103 @@ +## Tests for issue #81: locateVariants() and predictCoding() silently drop +## large INDELs that span multiple exons. + +## Create a synthetic transcript with 3 exons separated by introns: +## Exon 1: 1000-1200 (201 bp) +## Intron: 1201-1499 +## Exon 2: 1500-1700 (201 bp) +## Intron: 1701-1999 +## Exon 3: 2000-2200 (201 bp) +## +## A 2542bp deletion starting at position 1000 will span all 3 exons. + +cdsbytx <- GRangesList( + tx1 = GRanges(seqnames = "chr1", + ranges = IRanges(start = c(1000, 1500, 2000), + end = c(1200, 1700, 2200)), + strand = "+", + cds_id = c(1L, 2L, 3L), + exon_rank = c(1L, 2L, 3L)) +) + +test_locateVariants_large_indel_not_dropped <- function() +{ + ## A deletion spanning all 3 exons (width > any single exon) + ## This was previously silently dropped by mapToTranscripts. + query <- GRanges("chr1", IRanges(start = 1000, width = 2543)) + + ## locateVariants with CodingVariants should now include this variant + ## (previously it returned 0 rows) + result <- withCallingHandlers( + VariantAnnotation:::.makeResult(query, cdsbytx, "coding", + ignore.strand = TRUE, asHits = FALSE), + warning = function(w) { + ## Verify that a warning about multi-exon spanning is emitted + checkTrue(grepl("span multiple exons", conditionMessage(w))) + invokeRestart("muffleWarning") + } + ) + + ## The variant should appear in results + checkTrue(length(result) > 0L) + ## QUERYID should reference our variant (index 1) + checkTrue(1L %in% mcols(result)$QUERYID) + ## LOCATION should be "coding" + checkTrue(all(mcols(result)$LOCATION == "coding")) + ## LOCSTART/LOCEND should be NA (can't map to transcript coords) + rescued <- result[mcols(result)$QUERYID == 1L] + checkTrue(all(is.na(mcols(rescued)$LOCSTART))) + checkTrue(all(is.na(mcols(rescued)$LOCEND))) +} + +test_locateVariants_small_indel_still_works <- function() +{ + ## A small variant within a single exon should still work normally + query <- GRanges("chr1", IRanges(start = 1050, width = 10)) + result <- VariantAnnotation:::.makeResult(query, cdsbytx, "coding", + ignore.strand = TRUE, + asHits = FALSE) + ## Should find the variant + checkTrue(length(result) > 0L) + ## LOCSTART/LOCEND should NOT be NA (normal mapping works) + checkTrue(!any(is.na(mcols(result)$LOCSTART))) +} + +test_predictCoding_large_indel_warning <- function() +{ + ## A large deletion spanning multiple exons should emit a warning + ## from .localCoordinates, not silently return empty. + query <- GRanges("chr1", IRanges(start = 1000, width = 2543)) + + warned <- FALSE + result <- withCallingHandlers( + VariantAnnotation:::.localCoordinates(query, cdsbytx, + ignore.strand = TRUE), + warning = function(w) { + if (grepl("span multiple exons", conditionMessage(w))) { + warned <<- TRUE + invokeRestart("muffleWarning") + } + } + ) + + ## A warning should have been emitted + checkTrue(warned) +} + +test_locateVariants_mixed_variants <- function() +{ + ## Mix of a small variant (fits in one exon) and a large spanning deletion. + ## Both should appear in results. + query <- GRanges("chr1", + IRanges(start = c(1050, 1000), width = c(10, 2543))) + + result <- suppressWarnings( + VariantAnnotation:::.makeResult(query, cdsbytx, "coding", + ignore.strand = TRUE, asHits = FALSE) + ) + + ## Both variants should be in results + checkTrue(1L %in% mcols(result)$QUERYID) + checkTrue(2L %in% mcols(result)$QUERYID) + checkIdentical(length(unique(mcols(result)$QUERYID)), 2L) +}