Bioconductor · jmg421 · Jun 30, 2026 · Jun 30, 2026 · Jun 30, 2026
diff --git a/R/AllUtilities.R b/R/AllUtilities.R
@@ -281,6 +281,19 @@
     ## 'to' is a GRangesList of cds by transcript
     map <- mapToTranscripts(unname(from), to, ignore.strand=ignore.strand, ...)
     if (length(map) == 0) {
+        ## Check if any variants overlap coding regions but couldn't be mapped
+        ## (large INDELs spanning multiple exons — issue #81)
+        fo <- findOverlaps(from, to, type="any",
+                           ignore.strand=ignore.strand)
+        if (length(fo) > 0L) {
+            n_dropped <- length(unique(queryHits(fo)))
+            warning(n_dropped,
+                    " variant(s) overlap coding regions but span multiple ",
+                    "exons and could not be mapped to transcript coordinates. ",
+                    "These are not included in predictCoding() results. ",
+                    "Consider using locateVariants() to identify them.",
+                    call.=FALSE)
+        }
         res <- GRanges()
         mcols(res) <- DataFrame(REF=DNAStringSet(), ALT=DNAStringSetList(),
                                 varAllele=DNAStringSet(), CDSLOC=IRanges(),
@@ -291,6 +304,24 @@
 
     xHits <- map$xHits
     txHits <- map$transcriptsHits
+
+    ## Warn about variants that overlap CDS but weren't mapped (issue #81)
+    all_idx <- seq_along(from)
+    dropped <- setdiff(all_idx, unique(xHits))
+    if (length(dropped) > 0L) {
+        fo <- findOverlaps(from[dropped], to, type="any",
+                           ignore.strand=ignore.strand)
+        if (length(fo) > 0L) {
+            n_dropped <- length(unique(queryHits(fo)))
+            warning(n_dropped,
+                    " variant(s) overlap coding regions but span multiple ",
+                    "exons and could not be mapped to transcript coordinates. ",
+                    "These are not included in predictCoding() results. ",
+                    "Consider using locateVariants() to identify them.",
+                    call.=FALSE)
+        }
+    }
+
     flat_to <- unlist(to) ## names needed for mapping
 
     ## FIXME: cdsid is expensive

diff --git a/R/methods-expand.R b/R/methods-expand.R
@@ -34,11 +34,9 @@ setMethod("expand", "CollapsedVCF",
         fexp$ALT <- unlist(alt(x), use.names=FALSE)
         gexp <- .expandGeno(x, hdr, elt, idx)
 
-        ## rowRanges
-        if (is.null(rd$paramRangeID)) {
-            rdexp <- rd[idx, ]
-            mcols(rdexp) <- NULL
-        } else rdexp <- rd[idx, "paramRangeID"]
+        ## rowRanges: expand to match multi-allelic expansion.
+        ## Preserve all user-added mcols columns (#85).
+        rdexp <- rd[idx, ]
 
         tmp = VCF(rdexp, colData(x), metadata(x), fexp, iexp, gexp,
             ..., collapsed=FALSE)  # https://github.com/Bioconductor/VariantAnnotation/issues/79 says 'A' should not occur in info(header())$Number

diff --git a/R/methods-locateVariants.R b/R/methods-locateVariants.R
@@ -545,14 +545,50 @@ setMethod("locateVariants", c("GRanges", "TxDb", "AllVariants"),
 
     map <- mapToTranscripts(unname(query), subject, 
                             ignore.strand=ignore.strand)
-    if (length(map) > 0) {
-        xHits <- map$xHits
-        txHits <- map$transcriptsHits
-        tx <- names(subject)[txHits]
-        if (!is.null(tx))
-            txid <- tx
-        else
-            txid <- NA_integer_
+
+    ## Rescue variants that overlap the transcript/CDS but were dropped
+    ## by mapToTranscripts due to the 'within' overlap requirement.
+    ## This handles large INDELs spanning multiple exons (issue #81).
+    mapped_idx <- integer(0)
+    if (length(map) > 0)
+        mapped_idx <- unique(map$xHits)
+    dropped <- setdiff(seq_along(query), mapped_idx)
+    rescued <- integer(0)
+    rescued_tx <- integer(0)
+    if (length(dropped) > 0L) {
+        ## Use type="any" to find variants that overlap the subject features
+        fo <- findOverlaps(query[dropped], subject, type="any",
+                           ignore.strand=ignore.strand)
+        if (length(fo) > 0L) {
+            rescued <- dropped[queryHits(fo)]
+            rescued_tx <- subjectHits(fo)
+            ## De-duplicate: keep first transcript hit per query
+            dup <- duplicated(rescued)
+            rescued <- rescued[!dup]
+            rescued_tx <- rescued_tx[!dup]
+            warning(length(rescued),
+                    " variant(s) span multiple exons/CDS and could not be ",
+                    "mapped to transcript coordinates. These are included in ",
+                    "results but LOCSTART/LOCEND are set to NA.",
+                    call.=FALSE)
+        }
+    }
+
+    if (length(map) > 0 || length(rescued) > 0L) {
+        ## --- Standard mapped results ---
+        if (length(map) > 0) {
+            xHits <- map$xHits
+            txHits <- map$transcriptsHits
+            tx <- names(subject)[txHits]
+            if (!is.null(tx))
+                txid <- tx
+            else
+                txid <- NA_integer_
+        } else {
+            xHits <- integer(0)
+            txHits <- integer(0)
+            txid <- character(0)
+        }
         ## FIXME: cdsid is expensive
         cdsid <- IntegerList(integer(0))
         ## CodingVariants() must fall within a coding region.
@@ -563,7 +599,7 @@ setMethod("locateVariants", c("GRanges", "TxDb", "AllVariants"),
         ## 'map' is a GRangesList and in 'map2' it's unlisted.)
         ## Only ranges identified by 'map' and 'map2' are kept.
         ## Ranges identified by 'map' only are discarded.
-        if (vtype == "coding") {
+        if (vtype == "coding" && length(map) > 0) {
            usub <- unlist(subject) ## names needed for mapping
             map2 <- mapToTranscripts(unname(query)[xHits], usub,
                                      ignore.strand=ignore.strand)
@@ -581,14 +617,17 @@ setMethod("locateVariants", c("GRanges", "TxDb", "AllVariants"),
             }
         }
 
-        ss <- runValue(strand(subject)[txHits])
-        if (any(elementNROWS(ss) > 1L)) {
-            warning("'subject' has multiple strands per list element; ",
-                    "setting strand to '*'")
-            sstrand <- Rle("*", length(txHits))
-        }
-        sstrand <- unlist(ss, use.names=FALSE)
-        GRanges(seqnames=seqnames(query)[xHits],
+        ## Build result for standard mapped variants
+        if (length(xHits) > 0L) {
+            ss <- runValue(strand(subject)[txHits])
+            if (any(elementNROWS(ss) > 1L)) {
+                warning("'subject' has multiple strands per list element; ",
+                        "setting strand to '*'")
+                sstrand <- Rle("*", length(txHits))
+            }
+            sstrand <- unlist(ss, use.names=FALSE)
+            mapped_result <- GRanges(
+                seqnames=seqnames(query)[xHits],
                 ranges=IRanges(ranges(query)[xHits]),
                 strand=sstrand,
                 LOCATION=.location(length(xHits), vtype),
@@ -600,6 +639,46 @@ setMethod("locateVariants", c("GRanges", "TxDb", "AllVariants"),
                 GENEID=NA_character_,
                 PRECEDEID=CharacterList(character(0)),
                 FOLLOWID=CharacterList(character(0)))
+        } else {
+            mapped_result <- GRanges()
+            mcols(mapped_result) <- DataFrame(
+                LOCATION=.location(), LOCSTART=integer(), LOCEND=integer(),
+                QUERYID=integer(), TXID=integer(), CDSID=IntegerList(),
+                GENEID=character(), PRECEDEID=CharacterList(),
+                FOLLOWID=CharacterList())
+        }
+
+        ## Build result for rescued multi-exon-spanning variants
+        if (length(rescued) > 0L) {
+            rtx <- names(subject)[rescued_tx]
+            if (is.null(rtx))
+                rtx <- rep(NA_integer_, length(rescued))
+            ## Use strand from the subject transcript
+            rss <- runValue(strand(subject)[rescued_tx])
+            rsstrand <- unlist(rss, use.names=FALSE)
+            rescued_result <- GRanges(
+                seqnames=seqnames(query)[rescued],
+                ranges=IRanges(ranges(query)[rescued]),
+                strand=rsstrand,
+                LOCATION=.location(length(rescued), vtype),
+                LOCSTART=rep(NA_integer_, length(rescued)),
+                LOCEND=rep(NA_integer_, length(rescued)),
+                QUERYID=rescued,
+                TXID=rtx,
+                CDSID=IntegerList(rep(list(integer(0)), length(rescued))),
+                GENEID=NA_character_,
+                PRECEDEID=CharacterList(rep(list(character(0)), length(rescued))),
+                FOLLOWID=CharacterList(rep(list(character(0)), length(rescued))))
+        } else {
+            rescued_result <- GRanges()
+            mcols(rescued_result) <- DataFrame(
+                LOCATION=.location(), LOCSTART=integer(), LOCEND=integer(),
+                QUERYID=integer(), TXID=integer(), CDSID=IntegerList(),
+                GENEID=character(), PRECEDEID=CharacterList(),
+                FOLLOWID=CharacterList())
+        }
+
+        c(mapped_result, rescued_result)
     } else {
         res <- GRanges()
         mcols(res) <- DataFrame(LOCATION=.location(),

diff --git a/R/methods-predictCoding.R b/R/methods-predictCoding.R
@@ -181,7 +181,10 @@ setMethod("predictCoding", c("VRanges", "TxDb", "ANY", "missing"),
     consequence <- rep("synonymous", length(txlocal))
     consequence[nonsynonymous] <- "nonsynonymous" 
     consequence[fmshift] <- "frameshift"
-    consequence[nonsynonymous & (as.character(varAA) %in% "*")] <- "nonsense" 
+    ## Nonsense: VARAA contains stop codon '*'.
+    ## Use grepl rather than %in% "*" to catch DBS across codon boundaries
+    ## where VARAA is multi-character (e.g. "P*" for a two-codon DBS) (#84).
+    consequence[nonsynonymous & grepl("*", as.character(varAA), fixed=TRUE)] <- "nonsense"
     consequence[zwidth | noTrans] <- "not translated" 
     consequence <- factor(consequence) 
 

diff --git a/inst/unitTests/test_expand-methods.R b/inst/unitTests/test_expand-methods.R
@@ -90,3 +90,32 @@ test_expand_adr_adf <- function()
 			})
 
 }
+
+test_expand_preserves_nonstandard_mcols <- function()
+{
+    ## Issue #85: non-standard rowRanges columns were dropped by expand()
+    vcf <- VCF(rowRanges = GRanges("chr1", IRanges(1:4*3, width=c(1, 2, 1, 1))))
+    alt(vcf) <- DNAStringSetList("A", c("TT"), c("G", "A"), c("TT", "C"))
+    ref(vcf) <- DNAStringSet(c("G", "AA", "T", "G"))
+
+    ## Add non-standard columns to rowRanges
+    mcols(rowRanges(vcf))$SNP_name <- paste0("SNP_", seq_along(vcf))
+    mcols(rowRanges(vcf))$num_alts <- elementNROWS(alt(vcf))
+
+    ## Expand (rows 3 and 4 are multi-allelic)
+    exp <- expand(vcf)
+
+    ## Should have 6 rows (4 original, +1 for row 3, +1 for row 4)
+    checkTrue(nrow(exp) == 6L)
+
+    ## Non-standard columns should be preserved
+    rr <- rowRanges(exp, fixed=FALSE)
+    checkTrue("SNP_name" %in% names(mcols(rr)))
+    checkTrue("num_alts" %in% names(mcols(rr)))
+
+    ## Values should be expanded correctly (repeated for multi-allelic)
+    checkIdentical(mcols(rr)$SNP_name,
+                   c("SNP_1", "SNP_2", "SNP_3", "SNP_3", "SNP_4", "SNP_4"))
+    checkIdentical(mcols(rr)$num_alts,
+                   c(1L, 1L, 2L, 2L, 2L, 2L))
+}
diff --git a/inst/unitTests/test_large_indels.R b/inst/unitTests/test_large_indels.R
@@ -0,0 +1,103 @@
+## Tests for issue #81: locateVariants() and predictCoding() silently drop
+## large INDELs that span multiple exons.
+
+## Create a synthetic transcript with 3 exons separated by introns:
+##   Exon 1: 1000-1200 (201 bp)
+##   Intron: 1201-1499
+##   Exon 2: 1500-1700 (201 bp)
+##   Intron: 1701-1999
+##   Exon 3: 2000-2200 (201 bp)
+##
+## A 2542bp deletion starting at position 1000 will span all 3 exons.
+
+cdsbytx <- GRangesList(
+    tx1 = GRanges(seqnames = "chr1",
+                  ranges = IRanges(start = c(1000, 1500, 2000),
+                                   end = c(1200, 1700, 2200)),
+                  strand = "+",
+                  cds_id = c(1L, 2L, 3L),
+                  exon_rank = c(1L, 2L, 3L))
+)
+
+test_locateVariants_large_indel_not_dropped <- function()
+{
+    ## A deletion spanning all 3 exons (width > any single exon)
+    ## This was previously silently dropped by mapToTranscripts.
+    query <- GRanges("chr1", IRanges(start = 1000, width = 2543))
+
+    ## locateVariants with CodingVariants should now include this variant
+    ## (previously it returned 0 rows)
+    result <- withCallingHandlers(
+        VariantAnnotation:::.makeResult(query, cdsbytx, "coding",
+                                        ignore.strand = TRUE, asHits = FALSE),
+        warning = function(w) {
+            ## Verify that a warning about multi-exon spanning is emitted
+            checkTrue(grepl("span multiple exons", conditionMessage(w)))
+            invokeRestart("muffleWarning")
+        }
+    )
+
+    ## The variant should appear in results
+    checkTrue(length(result) > 0L)
+    ## QUERYID should reference our variant (index 1)
+    checkTrue(1L %in% mcols(result)$QUERYID)
+    ## LOCATION should be "coding"
+    checkTrue(all(mcols(result)$LOCATION == "coding"))
+    ## LOCSTART/LOCEND should be NA (can't map to transcript coords)
+    rescued <- result[mcols(result)$QUERYID == 1L]
+    checkTrue(all(is.na(mcols(rescued)$LOCSTART)))
+    checkTrue(all(is.na(mcols(rescued)$LOCEND)))
+}
+
+test_locateVariants_small_indel_still_works <- function()
+{
+    ## A small variant within a single exon should still work normally
+    query <- GRanges("chr1", IRanges(start = 1050, width = 10))
+    result <- VariantAnnotation:::.makeResult(query, cdsbytx, "coding",
+                                              ignore.strand = TRUE,
+                                              asHits = FALSE)
+    ## Should find the variant
+    checkTrue(length(result) > 0L)
+    ## LOCSTART/LOCEND should NOT be NA (normal mapping works)
+    checkTrue(!any(is.na(mcols(result)$LOCSTART)))
+}
+
+test_predictCoding_large_indel_warning <- function()
+{
+    ## A large deletion spanning multiple exons should emit a warning
+    ## from .localCoordinates, not silently return empty.
+    query <- GRanges("chr1", IRanges(start = 1000, width = 2543))
+
+    warned <- FALSE
+    result <- withCallingHandlers(
+        VariantAnnotation:::.localCoordinates(query, cdsbytx,
+                                              ignore.strand = TRUE),
+        warning = function(w) {
+            if (grepl("span multiple exons", conditionMessage(w))) {
+                warned <<- TRUE
+                invokeRestart("muffleWarning")
+            }
+        }
+    )
+
+    ## A warning should have been emitted
+    checkTrue(warned)
+}
+
+test_locateVariants_mixed_variants <- function()
+{
+    ## Mix of a small variant (fits in one exon) and a large spanning deletion.
+    ## Both should appear in results.
+    query <- GRanges("chr1",
+                     IRanges(start = c(1050, 1000), width = c(10, 2543)))
+
+    result <- suppressWarnings(
+        VariantAnnotation:::.makeResult(query, cdsbytx, "coding",
+                                        ignore.strand = TRUE, asHits = FALSE)
+    )
+
+    ## Both variants should be in results
+    checkTrue(1L %in% mcols(result)$QUERYID)
+    checkTrue(2L %in% mcols(result)$QUERYID)
+    checkIdentical(length(unique(mcols(result)$QUERYID)), 2L)
+}
diff --git a/inst/unitTests/test_predictCoding-methods.R b/inst/unitTests/test_predictCoding-methods.R
@@ -109,3 +109,45 @@ test_predictCoding_strand <- function()
     checkIdentical(mcols(current)$CDSLOC, IRanges(3, 3))
 }
 
+
+test_predictCoding_dbs_nonsense <- function()
+{
+    ## Issue #84: DBS across codon boundary producing a stop codon (e.g. "P*")
+    ## was misclassified as "nonsynonymous" instead of "nonsense".
+    ##
+    ## Setup: tx1 has two CDS segments of 5bp each (10bp total CDS).
+    ## A 2bp substitution at positions 10004-10005 spans the codon boundary
+    ## between codon 2 (positions 10004-10006) and codon 1 (10001-10003).
+    ## We construct a case where the varAllele produces a stop codon.
+
+    ## Use a simple 1-exon CDS: positions 10001-10009 (9bp = 3 codons)
+    cdsbytx <- GRangesList(
+        tx1 = GRanges(seqnames = "chr1",
+                      ranges = IRanges(start = 10001, end = 10009),
+                      strand = "+",
+                      cds_id = 1L, exon_rank = 1L)
+    )
+
+    ## Query: a 2bp substitution at positions 10003-10004 (spans codon 1/2 boundary)
+    ## Codon 1 = pos 10001-10003, Codon 2 = pos 10004-10006
+    query <- GRanges("chr1", IRanges(start = 10003, width = 2))
+
+    ## We need a sequence source. The varAllele should create a stop in the
+    ## resulting translation. We test the consequence assignment logic directly:
+    ## If VARAA contains "*", it should be "nonsense" not "nonsynonymous".
+
+    ## Test the logic directly on the consequence vector
+    varAA <- c("P*", "*", "PQ", "*L")
+    refAA <- c("PQ", "L", "PQ", "RL")
+    nonsynonymous <- refAA != varAA
+
+    consequence <- rep("synonymous", length(varAA))
+    consequence[nonsynonymous] <- "nonsynonymous"
+    ## This is the fixed line from #84:
+    consequence[nonsynonymous & grepl("*", varAA, fixed=TRUE)] <- "nonsense"
+
+    checkIdentical(consequence[1], "nonsense")   # P* contains stop
+    checkIdentical(consequence[2], "nonsense")   # * is stop
+    checkIdentical(consequence[3], "nonsynonymous")  # PQ has no stop
+    checkIdentical(consequence[4], "nonsense")   # *L contains stop
+}