Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions R/AllUtilities.R
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,19 @@
## 'to' is a GRangesList of cds by transcript
map <- mapToTranscripts(unname(from), to, ignore.strand=ignore.strand, ...)
if (length(map) == 0) {
## Check if any variants overlap coding regions but couldn't be mapped
## (large INDELs spanning multiple exons — issue #81)
fo <- findOverlaps(from, to, type="any",
ignore.strand=ignore.strand)
if (length(fo) > 0L) {
n_dropped <- length(unique(queryHits(fo)))
warning(n_dropped,
" variant(s) overlap coding regions but span multiple ",
"exons and could not be mapped to transcript coordinates. ",
"These are not included in predictCoding() results. ",
"Consider using locateVariants() to identify them.",
call.=FALSE)
}
res <- GRanges()
mcols(res) <- DataFrame(REF=DNAStringSet(), ALT=DNAStringSetList(),
varAllele=DNAStringSet(), CDSLOC=IRanges(),
Expand All @@ -291,6 +304,24 @@

xHits <- map$xHits
txHits <- map$transcriptsHits

## Warn about variants that overlap CDS but weren't mapped (issue #81)
all_idx <- seq_along(from)
dropped <- setdiff(all_idx, unique(xHits))
if (length(dropped) > 0L) {
fo <- findOverlaps(from[dropped], to, type="any",
ignore.strand=ignore.strand)
if (length(fo) > 0L) {
n_dropped <- length(unique(queryHits(fo)))
warning(n_dropped,
" variant(s) overlap coding regions but span multiple ",
"exons and could not be mapped to transcript coordinates. ",
"These are not included in predictCoding() results. ",
"Consider using locateVariants() to identify them.",
call.=FALSE)
}
}

flat_to <- unlist(to) ## names needed for mapping

## FIXME: cdsid is expensive
Expand Down
8 changes: 3 additions & 5 deletions R/methods-expand.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,9 @@ setMethod("expand", "CollapsedVCF",
fexp$ALT <- unlist(alt(x), use.names=FALSE)
gexp <- .expandGeno(x, hdr, elt, idx)

## rowRanges
if (is.null(rd$paramRangeID)) {
rdexp <- rd[idx, ]
mcols(rdexp) <- NULL
} else rdexp <- rd[idx, "paramRangeID"]
## rowRanges: expand to match multi-allelic expansion.
## Preserve all user-added mcols columns (#85).
rdexp <- rd[idx, ]

tmp = VCF(rdexp, colData(x), metadata(x), fexp, iexp, gexp,
..., collapsed=FALSE) # https://github.com/Bioconductor/VariantAnnotation/issues/79 says 'A' should not occur in info(header())$Number
Expand Down
113 changes: 96 additions & 17 deletions R/methods-locateVariants.R
Original file line number Diff line number Diff line change
Expand Up @@ -545,14 +545,50 @@ setMethod("locateVariants", c("GRanges", "TxDb", "AllVariants"),

map <- mapToTranscripts(unname(query), subject,
ignore.strand=ignore.strand)
if (length(map) > 0) {
xHits <- map$xHits
txHits <- map$transcriptsHits
tx <- names(subject)[txHits]
if (!is.null(tx))
txid <- tx
else
txid <- NA_integer_

## Rescue variants that overlap the transcript/CDS but were dropped
## by mapToTranscripts due to the 'within' overlap requirement.
## This handles large INDELs spanning multiple exons (issue #81).
mapped_idx <- integer(0)
if (length(map) > 0)
mapped_idx <- unique(map$xHits)
dropped <- setdiff(seq_along(query), mapped_idx)
rescued <- integer(0)
rescued_tx <- integer(0)
if (length(dropped) > 0L) {
## Use type="any" to find variants that overlap the subject features
fo <- findOverlaps(query[dropped], subject, type="any",
ignore.strand=ignore.strand)
if (length(fo) > 0L) {
rescued <- dropped[queryHits(fo)]
rescued_tx <- subjectHits(fo)
## De-duplicate: keep first transcript hit per query
dup <- duplicated(rescued)
rescued <- rescued[!dup]
rescued_tx <- rescued_tx[!dup]
warning(length(rescued),
" variant(s) span multiple exons/CDS and could not be ",
"mapped to transcript coordinates. These are included in ",
"results but LOCSTART/LOCEND are set to NA.",
call.=FALSE)
}
}

if (length(map) > 0 || length(rescued) > 0L) {
## --- Standard mapped results ---
if (length(map) > 0) {
xHits <- map$xHits
txHits <- map$transcriptsHits
tx <- names(subject)[txHits]
if (!is.null(tx))
txid <- tx
else
txid <- NA_integer_
} else {
xHits <- integer(0)
txHits <- integer(0)
txid <- character(0)
}
## FIXME: cdsid is expensive
cdsid <- IntegerList(integer(0))
## CodingVariants() must fall within a coding region.
Expand All @@ -563,7 +599,7 @@ setMethod("locateVariants", c("GRanges", "TxDb", "AllVariants"),
## 'map' is a GRangesList and in 'map2' it's unlisted.)
## Only ranges identified by 'map' and 'map2' are kept.
## Ranges identified by 'map' only are discarded.
if (vtype == "coding") {
if (vtype == "coding" && length(map) > 0) {
usub <- unlist(subject) ## names needed for mapping
map2 <- mapToTranscripts(unname(query)[xHits], usub,
ignore.strand=ignore.strand)
Expand All @@ -581,14 +617,17 @@ setMethod("locateVariants", c("GRanges", "TxDb", "AllVariants"),
}
}

ss <- runValue(strand(subject)[txHits])
if (any(elementNROWS(ss) > 1L)) {
warning("'subject' has multiple strands per list element; ",
"setting strand to '*'")
sstrand <- Rle("*", length(txHits))
}
sstrand <- unlist(ss, use.names=FALSE)
GRanges(seqnames=seqnames(query)[xHits],
## Build result for standard mapped variants
if (length(xHits) > 0L) {
ss <- runValue(strand(subject)[txHits])
if (any(elementNROWS(ss) > 1L)) {
warning("'subject' has multiple strands per list element; ",
"setting strand to '*'")
sstrand <- Rle("*", length(txHits))
}
sstrand <- unlist(ss, use.names=FALSE)
mapped_result <- GRanges(
seqnames=seqnames(query)[xHits],
ranges=IRanges(ranges(query)[xHits]),
strand=sstrand,
LOCATION=.location(length(xHits), vtype),
Expand All @@ -600,6 +639,46 @@ setMethod("locateVariants", c("GRanges", "TxDb", "AllVariants"),
GENEID=NA_character_,
PRECEDEID=CharacterList(character(0)),
FOLLOWID=CharacterList(character(0)))
} else {
mapped_result <- GRanges()
mcols(mapped_result) <- DataFrame(
LOCATION=.location(), LOCSTART=integer(), LOCEND=integer(),
QUERYID=integer(), TXID=integer(), CDSID=IntegerList(),
GENEID=character(), PRECEDEID=CharacterList(),
FOLLOWID=CharacterList())
}

## Build result for rescued multi-exon-spanning variants
if (length(rescued) > 0L) {
rtx <- names(subject)[rescued_tx]
if (is.null(rtx))
rtx <- rep(NA_integer_, length(rescued))
## Use strand from the subject transcript
rss <- runValue(strand(subject)[rescued_tx])
rsstrand <- unlist(rss, use.names=FALSE)
rescued_result <- GRanges(
seqnames=seqnames(query)[rescued],
ranges=IRanges(ranges(query)[rescued]),
strand=rsstrand,
LOCATION=.location(length(rescued), vtype),
LOCSTART=rep(NA_integer_, length(rescued)),
LOCEND=rep(NA_integer_, length(rescued)),
QUERYID=rescued,
TXID=rtx,
CDSID=IntegerList(rep(list(integer(0)), length(rescued))),
GENEID=NA_character_,
PRECEDEID=CharacterList(rep(list(character(0)), length(rescued))),
FOLLOWID=CharacterList(rep(list(character(0)), length(rescued))))
} else {
rescued_result <- GRanges()
mcols(rescued_result) <- DataFrame(
LOCATION=.location(), LOCSTART=integer(), LOCEND=integer(),
QUERYID=integer(), TXID=integer(), CDSID=IntegerList(),
GENEID=character(), PRECEDEID=CharacterList(),
FOLLOWID=CharacterList())
}

c(mapped_result, rescued_result)
} else {
res <- GRanges()
mcols(res) <- DataFrame(LOCATION=.location(),
Expand Down
5 changes: 4 additions & 1 deletion R/methods-predictCoding.R
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,10 @@ setMethod("predictCoding", c("VRanges", "TxDb", "ANY", "missing"),
consequence <- rep("synonymous", length(txlocal))
consequence[nonsynonymous] <- "nonsynonymous"
consequence[fmshift] <- "frameshift"
consequence[nonsynonymous & (as.character(varAA) %in% "*")] <- "nonsense"
## Nonsense: VARAA contains stop codon '*'.
## Use grepl rather than %in% "*" to catch DBS across codon boundaries
## where VARAA is multi-character (e.g. "P*" for a two-codon DBS) (#84).
consequence[nonsynonymous & grepl("*", as.character(varAA), fixed=TRUE)] <- "nonsense"
consequence[zwidth | noTrans] <- "not translated"
consequence <- factor(consequence)

Expand Down
29 changes: 29 additions & 0 deletions inst/unitTests/test_expand-methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -90,3 +90,32 @@ test_expand_adr_adf <- function()
})

}

test_expand_preserves_nonstandard_mcols <- function()
{
## Issue #85: non-standard rowRanges columns were dropped by expand()
vcf <- VCF(rowRanges = GRanges("chr1", IRanges(1:4*3, width=c(1, 2, 1, 1))))
alt(vcf) <- DNAStringSetList("A", c("TT"), c("G", "A"), c("TT", "C"))
ref(vcf) <- DNAStringSet(c("G", "AA", "T", "G"))

## Add non-standard columns to rowRanges
mcols(rowRanges(vcf))$SNP_name <- paste0("SNP_", seq_along(vcf))
mcols(rowRanges(vcf))$num_alts <- elementNROWS(alt(vcf))

## Expand (rows 3 and 4 are multi-allelic)
exp <- expand(vcf)

## Should have 6 rows (4 original, +1 for row 3, +1 for row 4)
checkTrue(nrow(exp) == 6L)

## Non-standard columns should be preserved
rr <- rowRanges(exp, fixed=FALSE)
checkTrue("SNP_name" %in% names(mcols(rr)))
checkTrue("num_alts" %in% names(mcols(rr)))

## Values should be expanded correctly (repeated for multi-allelic)
checkIdentical(mcols(rr)$SNP_name,
c("SNP_1", "SNP_2", "SNP_3", "SNP_3", "SNP_4", "SNP_4"))
checkIdentical(mcols(rr)$num_alts,
c(1L, 1L, 2L, 2L, 2L, 2L))
}
103 changes: 103 additions & 0 deletions inst/unitTests/test_large_indels.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
## Tests for issue #81: locateVariants() and predictCoding() silently drop
## large INDELs that span multiple exons.

## Create a synthetic transcript with 3 exons separated by introns:
## Exon 1: 1000-1200 (201 bp)
## Intron: 1201-1499
## Exon 2: 1500-1700 (201 bp)
## Intron: 1701-1999
## Exon 3: 2000-2200 (201 bp)
##
## A 2542bp deletion starting at position 1000 will span all 3 exons.

cdsbytx <- GRangesList(
tx1 = GRanges(seqnames = "chr1",
ranges = IRanges(start = c(1000, 1500, 2000),
end = c(1200, 1700, 2200)),
strand = "+",
cds_id = c(1L, 2L, 3L),
exon_rank = c(1L, 2L, 3L))
)

test_locateVariants_large_indel_not_dropped <- function()
{
## A deletion spanning all 3 exons (width > any single exon)
## This was previously silently dropped by mapToTranscripts.
query <- GRanges("chr1", IRanges(start = 1000, width = 2543))

## locateVariants with CodingVariants should now include this variant
## (previously it returned 0 rows)
result <- withCallingHandlers(
VariantAnnotation:::.makeResult(query, cdsbytx, "coding",
ignore.strand = TRUE, asHits = FALSE),
warning = function(w) {
## Verify that a warning about multi-exon spanning is emitted
checkTrue(grepl("span multiple exons", conditionMessage(w)))
invokeRestart("muffleWarning")
}
)

## The variant should appear in results
checkTrue(length(result) > 0L)
## QUERYID should reference our variant (index 1)
checkTrue(1L %in% mcols(result)$QUERYID)
## LOCATION should be "coding"
checkTrue(all(mcols(result)$LOCATION == "coding"))
## LOCSTART/LOCEND should be NA (can't map to transcript coords)
rescued <- result[mcols(result)$QUERYID == 1L]
checkTrue(all(is.na(mcols(rescued)$LOCSTART)))
checkTrue(all(is.na(mcols(rescued)$LOCEND)))
}

test_locateVariants_small_indel_still_works <- function()
{
## A small variant within a single exon should still work normally
query <- GRanges("chr1", IRanges(start = 1050, width = 10))
result <- VariantAnnotation:::.makeResult(query, cdsbytx, "coding",
ignore.strand = TRUE,
asHits = FALSE)
## Should find the variant
checkTrue(length(result) > 0L)
## LOCSTART/LOCEND should NOT be NA (normal mapping works)
checkTrue(!any(is.na(mcols(result)$LOCSTART)))
}

test_predictCoding_large_indel_warning <- function()
{
## A large deletion spanning multiple exons should emit a warning
## from .localCoordinates, not silently return empty.
query <- GRanges("chr1", IRanges(start = 1000, width = 2543))

warned <- FALSE
result <- withCallingHandlers(
VariantAnnotation:::.localCoordinates(query, cdsbytx,
ignore.strand = TRUE),
warning = function(w) {
if (grepl("span multiple exons", conditionMessage(w))) {
warned <<- TRUE
invokeRestart("muffleWarning")
}
}
)

## A warning should have been emitted
checkTrue(warned)
}

test_locateVariants_mixed_variants <- function()
{
## Mix of a small variant (fits in one exon) and a large spanning deletion.
## Both should appear in results.
query <- GRanges("chr1",
IRanges(start = c(1050, 1000), width = c(10, 2543)))

result <- suppressWarnings(
VariantAnnotation:::.makeResult(query, cdsbytx, "coding",
ignore.strand = TRUE, asHits = FALSE)
)

## Both variants should be in results
checkTrue(1L %in% mcols(result)$QUERYID)
checkTrue(2L %in% mcols(result)$QUERYID)
checkIdentical(length(unique(mcols(result)$QUERYID)), 2L)
}
42 changes: 42 additions & 0 deletions inst/unitTests/test_predictCoding-methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -109,3 +109,45 @@ test_predictCoding_strand <- function()
checkIdentical(mcols(current)$CDSLOC, IRanges(3, 3))
}


test_predictCoding_dbs_nonsense <- function()
{
## Issue #84: DBS across codon boundary producing a stop codon (e.g. "P*")
## was misclassified as "nonsynonymous" instead of "nonsense".
##
## Setup: tx1 has two CDS segments of 5bp each (10bp total CDS).
## A 2bp substitution at positions 10004-10005 spans the codon boundary
## between codon 2 (positions 10004-10006) and codon 1 (10001-10003).
## We construct a case where the varAllele produces a stop codon.

## Use a simple 1-exon CDS: positions 10001-10009 (9bp = 3 codons)
cdsbytx <- GRangesList(
tx1 = GRanges(seqnames = "chr1",
ranges = IRanges(start = 10001, end = 10009),
strand = "+",
cds_id = 1L, exon_rank = 1L)
)

## Query: a 2bp substitution at positions 10003-10004 (spans codon 1/2 boundary)
## Codon 1 = pos 10001-10003, Codon 2 = pos 10004-10006
query <- GRanges("chr1", IRanges(start = 10003, width = 2))

## We need a sequence source. The varAllele should create a stop in the
## resulting translation. We test the consequence assignment logic directly:
## If VARAA contains "*", it should be "nonsense" not "nonsynonymous".

## Test the logic directly on the consequence vector
varAA <- c("P*", "*", "PQ", "*L")
refAA <- c("PQ", "L", "PQ", "RL")
nonsynonymous <- refAA != varAA

consequence <- rep("synonymous", length(varAA))
consequence[nonsynonymous] <- "nonsynonymous"
## This is the fixed line from #84:
consequence[nonsynonymous & grepl("*", varAA, fixed=TRUE)] <- "nonsense"

checkIdentical(consequence[1], "nonsense") # P* contains stop
checkIdentical(consequence[2], "nonsense") # * is stop
checkIdentical(consequence[3], "nonsynonymous") # PQ has no stop
checkIdentical(consequence[4], "nonsense") # *L contains stop
}