Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions R/AllUtilities.R
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,19 @@
## 'to' is a GRangesList of cds by transcript
map <- mapToTranscripts(unname(from), to, ignore.strand=ignore.strand, ...)
if (length(map) == 0) {
## Check if any variants overlap coding regions but couldn't be mapped
## (large INDELs spanning multiple exons — issue #81)
fo <- findOverlaps(from, to, type="any",
ignore.strand=ignore.strand)
if (length(fo) > 0L) {
n_dropped <- length(unique(queryHits(fo)))
warning(n_dropped,
" variant(s) overlap coding regions but span multiple ",
"exons and could not be mapped to transcript coordinates. ",
"These are not included in predictCoding() results. ",
"Consider using locateVariants() to identify them.",
call.=FALSE)
}
res <- GRanges()
mcols(res) <- DataFrame(REF=DNAStringSet(), ALT=DNAStringSetList(),
varAllele=DNAStringSet(), CDSLOC=IRanges(),
Expand All @@ -291,6 +304,24 @@

xHits <- map$xHits
txHits <- map$transcriptsHits

## Warn about variants that overlap CDS but weren't mapped (issue #81)
all_idx <- seq_along(from)
dropped <- setdiff(all_idx, unique(xHits))
if (length(dropped) > 0L) {
fo <- findOverlaps(from[dropped], to, type="any",
ignore.strand=ignore.strand)
if (length(fo) > 0L) {
n_dropped <- length(unique(queryHits(fo)))
warning(n_dropped,
" variant(s) overlap coding regions but span multiple ",
"exons and could not be mapped to transcript coordinates. ",
"These are not included in predictCoding() results. ",
"Consider using locateVariants() to identify them.",
call.=FALSE)
}
}

flat_to <- unlist(to) ## names needed for mapping

## FIXME: cdsid is expensive
Expand Down
113 changes: 96 additions & 17 deletions R/methods-locateVariants.R
Original file line number Diff line number Diff line change
Expand Up @@ -545,14 +545,50 @@ setMethod("locateVariants", c("GRanges", "TxDb", "AllVariants"),

map <- mapToTranscripts(unname(query), subject,
ignore.strand=ignore.strand)
if (length(map) > 0) {
xHits <- map$xHits
txHits <- map$transcriptsHits
tx <- names(subject)[txHits]
if (!is.null(tx))
txid <- tx
else
txid <- NA_integer_

## Rescue variants that overlap the transcript/CDS but were dropped
## by mapToTranscripts due to the 'within' overlap requirement.
## This handles large INDELs spanning multiple exons (issue #81).
mapped_idx <- integer(0)
if (length(map) > 0)
mapped_idx <- unique(map$xHits)
dropped <- setdiff(seq_along(query), mapped_idx)
rescued <- integer(0)
rescued_tx <- integer(0)
if (length(dropped) > 0L) {
## Use type="any" to find variants that overlap the subject features
fo <- findOverlaps(query[dropped], subject, type="any",
ignore.strand=ignore.strand)
if (length(fo) > 0L) {
rescued <- dropped[queryHits(fo)]
rescued_tx <- subjectHits(fo)
## De-duplicate: keep first transcript hit per query
dup <- duplicated(rescued)
rescued <- rescued[!dup]
rescued_tx <- rescued_tx[!dup]
warning(length(rescued),
" variant(s) span multiple exons/CDS and could not be ",
"mapped to transcript coordinates. These are included in ",
"results but LOCSTART/LOCEND are set to NA.",
call.=FALSE)
}
}

if (length(map) > 0 || length(rescued) > 0L) {
## --- Standard mapped results ---
if (length(map) > 0) {
xHits <- map$xHits
txHits <- map$transcriptsHits
tx <- names(subject)[txHits]
if (!is.null(tx))
txid <- tx
else
txid <- NA_integer_
} else {
xHits <- integer(0)
txHits <- integer(0)
txid <- character(0)
}
## FIXME: cdsid is expensive
cdsid <- IntegerList(integer(0))
## CodingVariants() must fall within a coding region.
Expand All @@ -563,7 +599,7 @@ setMethod("locateVariants", c("GRanges", "TxDb", "AllVariants"),
## 'map' is a GRangesList and in 'map2' it's unlisted.)
## Only ranges identified by 'map' and 'map2' are kept.
## Ranges identified by 'map' only are discarded.
if (vtype == "coding") {
if (vtype == "coding" && length(map) > 0) {
usub <- unlist(subject) ## names needed for mapping
map2 <- mapToTranscripts(unname(query)[xHits], usub,
ignore.strand=ignore.strand)
Expand All @@ -581,14 +617,17 @@ setMethod("locateVariants", c("GRanges", "TxDb", "AllVariants"),
}
}

ss <- runValue(strand(subject)[txHits])
if (any(elementNROWS(ss) > 1L)) {
warning("'subject' has multiple strands per list element; ",
"setting strand to '*'")
sstrand <- Rle("*", length(txHits))
}
sstrand <- unlist(ss, use.names=FALSE)
GRanges(seqnames=seqnames(query)[xHits],
## Build result for standard mapped variants
if (length(xHits) > 0L) {
ss <- runValue(strand(subject)[txHits])
if (any(elementNROWS(ss) > 1L)) {
warning("'subject' has multiple strands per list element; ",
"setting strand to '*'")
sstrand <- Rle("*", length(txHits))
}
sstrand <- unlist(ss, use.names=FALSE)
mapped_result <- GRanges(
seqnames=seqnames(query)[xHits],
ranges=IRanges(ranges(query)[xHits]),
strand=sstrand,
LOCATION=.location(length(xHits), vtype),
Expand All @@ -600,6 +639,46 @@ setMethod("locateVariants", c("GRanges", "TxDb", "AllVariants"),
GENEID=NA_character_,
PRECEDEID=CharacterList(character(0)),
FOLLOWID=CharacterList(character(0)))
} else {
mapped_result <- GRanges()
mcols(mapped_result) <- DataFrame(
LOCATION=.location(), LOCSTART=integer(), LOCEND=integer(),
QUERYID=integer(), TXID=integer(), CDSID=IntegerList(),
GENEID=character(), PRECEDEID=CharacterList(),
FOLLOWID=CharacterList())
}

## Build result for rescued multi-exon-spanning variants
if (length(rescued) > 0L) {
rtx <- names(subject)[rescued_tx]
if (is.null(rtx))
rtx <- rep(NA_integer_, length(rescued))
## Use strand from the subject transcript
rss <- runValue(strand(subject)[rescued_tx])
rsstrand <- unlist(rss, use.names=FALSE)
rescued_result <- GRanges(
seqnames=seqnames(query)[rescued],
ranges=IRanges(ranges(query)[rescued]),
strand=rsstrand,
LOCATION=.location(length(rescued), vtype),
LOCSTART=rep(NA_integer_, length(rescued)),
LOCEND=rep(NA_integer_, length(rescued)),
QUERYID=rescued,
TXID=rtx,
CDSID=IntegerList(rep(list(integer(0)), length(rescued))),
GENEID=NA_character_,
PRECEDEID=CharacterList(rep(list(character(0)), length(rescued))),
FOLLOWID=CharacterList(rep(list(character(0)), length(rescued))))
} else {
rescued_result <- GRanges()
mcols(rescued_result) <- DataFrame(
LOCATION=.location(), LOCSTART=integer(), LOCEND=integer(),
QUERYID=integer(), TXID=integer(), CDSID=IntegerList(),
GENEID=character(), PRECEDEID=CharacterList(),
FOLLOWID=CharacterList())
}

c(mapped_result, rescued_result)
} else {
res <- GRanges()
mcols(res) <- DataFrame(LOCATION=.location(),
Expand Down
103 changes: 103 additions & 0 deletions inst/unitTests/test_large_indels.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
## Tests for issue #81: locateVariants() and predictCoding() silently drop
## large INDELs that span multiple exons.

## Create a synthetic transcript with 3 exons separated by introns:
## Exon 1: 1000-1200 (201 bp)
## Intron: 1201-1499
## Exon 2: 1500-1700 (201 bp)
## Intron: 1701-1999
## Exon 3: 2000-2200 (201 bp)
##
## A 2542bp deletion starting at position 1000 will span all 3 exons.

cdsbytx <- GRangesList(
tx1 = GRanges(seqnames = "chr1",
ranges = IRanges(start = c(1000, 1500, 2000),
end = c(1200, 1700, 2200)),
strand = "+",
cds_id = c(1L, 2L, 3L),
exon_rank = c(1L, 2L, 3L))
)

test_locateVariants_large_indel_not_dropped <- function()
{
## A deletion spanning all 3 exons (width > any single exon)
## This was previously silently dropped by mapToTranscripts.
query <- GRanges("chr1", IRanges(start = 1000, width = 2543))

## locateVariants with CodingVariants should now include this variant
## (previously it returned 0 rows)
result <- withCallingHandlers(
VariantAnnotation:::.makeResult(query, cdsbytx, "coding",
ignore.strand = TRUE, asHits = FALSE),
warning = function(w) {
## Verify that a warning about multi-exon spanning is emitted
checkTrue(grepl("span multiple exons", conditionMessage(w)))
invokeRestart("muffleWarning")
}
)

## The variant should appear in results
checkTrue(length(result) > 0L)
## QUERYID should reference our variant (index 1)
checkTrue(1L %in% mcols(result)$QUERYID)
## LOCATION should be "coding"
checkTrue(all(mcols(result)$LOCATION == "coding"))
## LOCSTART/LOCEND should be NA (can't map to transcript coords)
rescued <- result[mcols(result)$QUERYID == 1L]
checkTrue(all(is.na(mcols(rescued)$LOCSTART)))
checkTrue(all(is.na(mcols(rescued)$LOCEND)))
}

test_locateVariants_small_indel_still_works <- function()
{
## A small variant within a single exon should still work normally
query <- GRanges("chr1", IRanges(start = 1050, width = 10))
result <- VariantAnnotation:::.makeResult(query, cdsbytx, "coding",
ignore.strand = TRUE,
asHits = FALSE)
## Should find the variant
checkTrue(length(result) > 0L)
## LOCSTART/LOCEND should NOT be NA (normal mapping works)
checkTrue(!any(is.na(mcols(result)$LOCSTART)))
}

test_predictCoding_large_indel_warning <- function()
{
## A large deletion spanning multiple exons should emit a warning
## from .localCoordinates, not silently return empty.
query <- GRanges("chr1", IRanges(start = 1000, width = 2543))

warned <- FALSE
result <- withCallingHandlers(
VariantAnnotation:::.localCoordinates(query, cdsbytx,
ignore.strand = TRUE),
warning = function(w) {
if (grepl("span multiple exons", conditionMessage(w))) {
warned <<- TRUE
invokeRestart("muffleWarning")
}
}
)

## A warning should have been emitted
checkTrue(warned)
}

test_locateVariants_mixed_variants <- function()
{
## Mix of a small variant (fits in one exon) and a large spanning deletion.
## Both should appear in results.
query <- GRanges("chr1",
IRanges(start = c(1050, 1000), width = c(10, 2543)))

result <- suppressWarnings(
VariantAnnotation:::.makeResult(query, cdsbytx, "coding",
ignore.strand = TRUE, asHits = FALSE)
)

## Both variants should be in results
checkTrue(1L %in% mcols(result)$QUERYID)
checkTrue(2L %in% mcols(result)$QUERYID)
checkIdentical(length(unique(mcols(result)$QUERYID)), 2L)
}