diff --git a/R/AllUtilities.R b/R/AllUtilities.R index e0478c2..a86aa99 100644 --- a/R/AllUtilities.R +++ b/R/AllUtilities.R @@ -281,6 +281,19 @@ ## 'to' is a GRangesList of cds by transcript map <- mapToTranscripts(unname(from), to, ignore.strand=ignore.strand, ...) if (length(map) == 0) { + ## Check if any variants overlap coding regions but couldn't be mapped + ## (large INDELs spanning multiple exons — issue #81) + fo <- findOverlaps(from, to, type="any", + ignore.strand=ignore.strand) + if (length(fo) > 0L) { + n_dropped <- length(unique(queryHits(fo))) + warning(n_dropped, + " variant(s) overlap coding regions but span multiple ", + "exons and could not be mapped to transcript coordinates. ", + "These are not included in predictCoding() results. ", + "Consider using locateVariants() to identify them.", + call.=FALSE) + } res <- GRanges() mcols(res) <- DataFrame(REF=DNAStringSet(), ALT=DNAStringSetList(), varAllele=DNAStringSet(), CDSLOC=IRanges(), @@ -291,6 +304,24 @@ xHits <- map$xHits txHits <- map$transcriptsHits + + ## Warn about variants that overlap CDS but weren't mapped (issue #81) + all_idx <- seq_along(from) + dropped <- setdiff(all_idx, unique(xHits)) + if (length(dropped) > 0L) { + fo <- findOverlaps(from[dropped], to, type="any", + ignore.strand=ignore.strand) + if (length(fo) > 0L) { + n_dropped <- length(unique(queryHits(fo))) + warning(n_dropped, + " variant(s) overlap coding regions but span multiple ", + "exons and could not be mapped to transcript coordinates. ", + "These are not included in predictCoding() results. ", + "Consider using locateVariants() to identify them.", + call.=FALSE) + } + } + flat_to <- unlist(to) ## names needed for mapping ## FIXME: cdsid is expensive diff --git a/R/methods-locateVariants.R b/R/methods-locateVariants.R index b85e287..aed781b 100644 --- a/R/methods-locateVariants.R +++ b/R/methods-locateVariants.R @@ -545,14 +545,50 @@ setMethod("locateVariants", c("GRanges", "TxDb", "AllVariants"), map <- mapToTranscripts(unname(query), subject, ignore.strand=ignore.strand) - if (length(map) > 0) { - xHits <- map$xHits - txHits <- map$transcriptsHits - tx <- names(subject)[txHits] - if (!is.null(tx)) - txid <- tx - else - txid <- NA_integer_ + + ## Rescue variants that overlap the transcript/CDS but were dropped + ## by mapToTranscripts due to the 'within' overlap requirement. + ## This handles large INDELs spanning multiple exons (issue #81). + mapped_idx <- integer(0) + if (length(map) > 0) + mapped_idx <- unique(map$xHits) + dropped <- setdiff(seq_along(query), mapped_idx) + rescued <- integer(0) + rescued_tx <- integer(0) + if (length(dropped) > 0L) { + ## Use type="any" to find variants that overlap the subject features + fo <- findOverlaps(query[dropped], subject, type="any", + ignore.strand=ignore.strand) + if (length(fo) > 0L) { + rescued <- dropped[queryHits(fo)] + rescued_tx <- subjectHits(fo) + ## De-duplicate: keep first transcript hit per query + dup <- duplicated(rescued) + rescued <- rescued[!dup] + rescued_tx <- rescued_tx[!dup] + warning(length(rescued), + " variant(s) span multiple exons/CDS and could not be ", + "mapped to transcript coordinates. These are included in ", + "results but LOCSTART/LOCEND are set to NA.", + call.=FALSE) + } + } + + if (length(map) > 0 || length(rescued) > 0L) { + ## --- Standard mapped results --- + if (length(map) > 0) { + xHits <- map$xHits + txHits <- map$transcriptsHits + tx <- names(subject)[txHits] + if (!is.null(tx)) + txid <- tx + else + txid <- NA_integer_ + } else { + xHits <- integer(0) + txHits <- integer(0) + txid <- character(0) + } ## FIXME: cdsid is expensive cdsid <- IntegerList(integer(0)) ## CodingVariants() must fall within a coding region. @@ -563,7 +599,7 @@ setMethod("locateVariants", c("GRanges", "TxDb", "AllVariants"), ## 'map' is a GRangesList and in 'map2' it's unlisted.) ## Only ranges identified by 'map' and 'map2' are kept. ## Ranges identified by 'map' only are discarded. - if (vtype == "coding") { + if (vtype == "coding" && length(map) > 0) { usub <- unlist(subject) ## names needed for mapping map2 <- mapToTranscripts(unname(query)[xHits], usub, ignore.strand=ignore.strand) @@ -581,14 +617,17 @@ setMethod("locateVariants", c("GRanges", "TxDb", "AllVariants"), } } - ss <- runValue(strand(subject)[txHits]) - if (any(elementNROWS(ss) > 1L)) { - warning("'subject' has multiple strands per list element; ", - "setting strand to '*'") - sstrand <- Rle("*", length(txHits)) - } - sstrand <- unlist(ss, use.names=FALSE) - GRanges(seqnames=seqnames(query)[xHits], + ## Build result for standard mapped variants + if (length(xHits) > 0L) { + ss <- runValue(strand(subject)[txHits]) + if (any(elementNROWS(ss) > 1L)) { + warning("'subject' has multiple strands per list element; ", + "setting strand to '*'") + sstrand <- Rle("*", length(txHits)) + } + sstrand <- unlist(ss, use.names=FALSE) + mapped_result <- GRanges( + seqnames=seqnames(query)[xHits], ranges=IRanges(ranges(query)[xHits]), strand=sstrand, LOCATION=.location(length(xHits), vtype), @@ -600,6 +639,46 @@ setMethod("locateVariants", c("GRanges", "TxDb", "AllVariants"), GENEID=NA_character_, PRECEDEID=CharacterList(character(0)), FOLLOWID=CharacterList(character(0))) + } else { + mapped_result <- GRanges() + mcols(mapped_result) <- DataFrame( + LOCATION=.location(), LOCSTART=integer(), LOCEND=integer(), + QUERYID=integer(), TXID=integer(), CDSID=IntegerList(), + GENEID=character(), PRECEDEID=CharacterList(), + FOLLOWID=CharacterList()) + } + + ## Build result for rescued multi-exon-spanning variants + if (length(rescued) > 0L) { + rtx <- names(subject)[rescued_tx] + if (is.null(rtx)) + rtx <- rep(NA_integer_, length(rescued)) + ## Use strand from the subject transcript + rss <- runValue(strand(subject)[rescued_tx]) + rsstrand <- unlist(rss, use.names=FALSE) + rescued_result <- GRanges( + seqnames=seqnames(query)[rescued], + ranges=IRanges(ranges(query)[rescued]), + strand=rsstrand, + LOCATION=.location(length(rescued), vtype), + LOCSTART=rep(NA_integer_, length(rescued)), + LOCEND=rep(NA_integer_, length(rescued)), + QUERYID=rescued, + TXID=rtx, + CDSID=IntegerList(rep(list(integer(0)), length(rescued))), + GENEID=NA_character_, + PRECEDEID=CharacterList(rep(list(character(0)), length(rescued))), + FOLLOWID=CharacterList(rep(list(character(0)), length(rescued)))) + } else { + rescued_result <- GRanges() + mcols(rescued_result) <- DataFrame( + LOCATION=.location(), LOCSTART=integer(), LOCEND=integer(), + QUERYID=integer(), TXID=integer(), CDSID=IntegerList(), + GENEID=character(), PRECEDEID=CharacterList(), + FOLLOWID=CharacterList()) + } + + c(mapped_result, rescued_result) } else { res <- GRanges() mcols(res) <- DataFrame(LOCATION=.location(), diff --git a/inst/unitTests/test_large_indels.R b/inst/unitTests/test_large_indels.R new file mode 100644 index 0000000..5736f54 --- /dev/null +++ b/inst/unitTests/test_large_indels.R @@ -0,0 +1,103 @@ +## Tests for issue #81: locateVariants() and predictCoding() silently drop +## large INDELs that span multiple exons. + +## Create a synthetic transcript with 3 exons separated by introns: +## Exon 1: 1000-1200 (201 bp) +## Intron: 1201-1499 +## Exon 2: 1500-1700 (201 bp) +## Intron: 1701-1999 +## Exon 3: 2000-2200 (201 bp) +## +## A 2542bp deletion starting at position 1000 will span all 3 exons. + +cdsbytx <- GRangesList( + tx1 = GRanges(seqnames = "chr1", + ranges = IRanges(start = c(1000, 1500, 2000), + end = c(1200, 1700, 2200)), + strand = "+", + cds_id = c(1L, 2L, 3L), + exon_rank = c(1L, 2L, 3L)) +) + +test_locateVariants_large_indel_not_dropped <- function() +{ + ## A deletion spanning all 3 exons (width > any single exon) + ## This was previously silently dropped by mapToTranscripts. + query <- GRanges("chr1", IRanges(start = 1000, width = 2543)) + + ## locateVariants with CodingVariants should now include this variant + ## (previously it returned 0 rows) + result <- withCallingHandlers( + VariantAnnotation:::.makeResult(query, cdsbytx, "coding", + ignore.strand = TRUE, asHits = FALSE), + warning = function(w) { + ## Verify that a warning about multi-exon spanning is emitted + checkTrue(grepl("span multiple exons", conditionMessage(w))) + invokeRestart("muffleWarning") + } + ) + + ## The variant should appear in results + checkTrue(length(result) > 0L) + ## QUERYID should reference our variant (index 1) + checkTrue(1L %in% mcols(result)$QUERYID) + ## LOCATION should be "coding" + checkTrue(all(mcols(result)$LOCATION == "coding")) + ## LOCSTART/LOCEND should be NA (can't map to transcript coords) + rescued <- result[mcols(result)$QUERYID == 1L] + checkTrue(all(is.na(mcols(rescued)$LOCSTART))) + checkTrue(all(is.na(mcols(rescued)$LOCEND))) +} + +test_locateVariants_small_indel_still_works <- function() +{ + ## A small variant within a single exon should still work normally + query <- GRanges("chr1", IRanges(start = 1050, width = 10)) + result <- VariantAnnotation:::.makeResult(query, cdsbytx, "coding", + ignore.strand = TRUE, + asHits = FALSE) + ## Should find the variant + checkTrue(length(result) > 0L) + ## LOCSTART/LOCEND should NOT be NA (normal mapping works) + checkTrue(!any(is.na(mcols(result)$LOCSTART))) +} + +test_predictCoding_large_indel_warning <- function() +{ + ## A large deletion spanning multiple exons should emit a warning + ## from .localCoordinates, not silently return empty. + query <- GRanges("chr1", IRanges(start = 1000, width = 2543)) + + warned <- FALSE + result <- withCallingHandlers( + VariantAnnotation:::.localCoordinates(query, cdsbytx, + ignore.strand = TRUE), + warning = function(w) { + if (grepl("span multiple exons", conditionMessage(w))) { + warned <<- TRUE + invokeRestart("muffleWarning") + } + } + ) + + ## A warning should have been emitted + checkTrue(warned) +} + +test_locateVariants_mixed_variants <- function() +{ + ## Mix of a small variant (fits in one exon) and a large spanning deletion. + ## Both should appear in results. + query <- GRanges("chr1", + IRanges(start = c(1050, 1000), width = c(10, 2543))) + + result <- suppressWarnings( + VariantAnnotation:::.makeResult(query, cdsbytx, "coding", + ignore.strand = TRUE, asHits = FALSE) + ) + + ## Both variants should be in results + checkTrue(1L %in% mcols(result)$QUERYID) + checkTrue(2L %in% mcols(result)$QUERYID) + checkIdentical(length(unique(mcols(result)$QUERYID)), 2L) +}