Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ Type: Package
Title: Annotation of Genetic Variants
Description: Annotate variants, compute amino acid coding changes,
predict coding outcomes.
Version: 1.59.0
Version: 1.59.1
Authors@R: c(
person("Valerie", "Oberchain", role="aut"),
person("Martin", "Morgan", role="aut"),
Expand Down
17 changes: 17 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,3 +1,20 @@
CHANGES IN VERSION 1.59.1
-------------------------

BUG FIXES

o locateVariants() with IntergenicVariants now correctly classifies
flanking genes into PRECEDEID and FOLLOWID based on gene strand.
Previously, classification was purely positional (genes to the right
went to PRECEDEID, genes to the left went to FOLLOWID) regardless
of the gene's transcriptional orientation. Now:
- PRECEDEID contains genes the variant is upstream of (i.e., the
variant precedes the gene's TSS in transcriptional direction)
- FOLLOWID contains genes the variant is downstream of (i.e., the
variant follows the gene's TSS in transcriptional direction)
This matches Ensembl VEP's upstream_gene_variant/downstream_gene_variant
classification. (GitHub issue #55)

CHANGES IN VERSION 1.36.0
-------------------------

Expand Down
54 changes: 44 additions & 10 deletions R/methods-locateVariants.R
Original file line number Diff line number Diff line change
Expand Up @@ -484,19 +484,53 @@ setMethod("locateVariants", c("GRanges", "TxDb", "AllVariants"),
q_range <- query[intergenic]
s_genes <- rep(names(subject), elementNROWS(s_range))
s_unlist <- unlist(s_range, use.names=FALSE)
s_strand <- as.character(strand(s_unlist))

## ID all genes that fall in upstream / downstream range.
## upstream == follow:
## Find genes in upstream window (left of variant for +/* queries)
f_range <- .shiftRangeUpDown(q_range, upstream(region), TRUE)
f_fo <- findOverlaps(f_range, s_unlist, ignore.strand=ignore.strand)
f_factor <- factor(queryHits(f_fo), seq_len(queryLength(f_fo)))
f_genes <- unname(splitAsList(s_genes[subjectHits(f_fo)], f_factor))

## downstream == precede:
f_fo <- findOverlaps(f_range, s_unlist, ignore.strand=TRUE)

## Find genes in downstream window (right of variant for +/* queries)
p_range <- .shiftRangeUpDown(q_range, downstream(region), FALSE)
p_fo <- findOverlaps(p_range, s_unlist, ignore.strand=ignore.strand)
p_factor <- factor(queryHits(p_fo), seq_len(queryLength(p_fo)))
p_genes <- unname(splitAsList(s_genes[subjectHits(p_fo)], p_factor))
p_fo <- findOverlaps(p_range, s_unlist, ignore.strand=TRUE)

## Strand-aware classification (GitHub issue #55):
## PRECEDEID = genes the variant is upstream of (variant precedes gene TSS)
## - '+' strand gene to the RIGHT of variant (downstream window)
## - '-' strand gene to the LEFT of variant (upstream window)
## FOLLOWID = genes the variant is downstream of (variant follows gene TSS)
## - '+' strand gene to the LEFT of variant (upstream window)
## - '-' strand gene to the RIGHT of variant (downstream window)

## From upstream window: '+' genes → FOLLOWID, '-' genes → PRECEDEID
f_qhits <- queryHits(f_fo)
f_shits <- subjectHits(f_fo)
f_strands <- s_strand[f_shits]
f_gnames <- s_genes[f_shits]

## From downstream window: '+' genes → PRECEDEID, '-' genes → FOLLOWID
p_qhits <- queryHits(p_fo)
p_shits <- subjectHits(p_fo)
p_strands <- s_strand[p_shits]
p_gnames <- s_genes[p_shits]

nq <- length(q_range)

## Build PRECEDEID: '+' from downstream + '-' from upstream
prec_qhits <- c(p_qhits[p_strands != "-"],
f_qhits[f_strands == "-"])
prec_genes <- c(p_gnames[p_strands != "-"],
f_gnames[f_strands == "-"])
prec_factor <- factor(prec_qhits, seq_len(nq))
p_genes <- unname(splitAsList(prec_genes, prec_factor))

## Build FOLLOWID: '+' from upstream + '-' from downstream
foll_qhits <- c(f_qhits[f_strands != "-"],
p_qhits[p_strands == "-"])
foll_genes <- c(f_gnames[f_strands != "-"],
p_gnames[p_strands == "-"])
foll_factor <- factor(foll_qhits, seq_len(nq))
f_genes <- unname(splitAsList(foll_genes, foll_factor))

if (ignore.strand)
strand(q_range) <- "*"
Expand Down
31 changes: 31 additions & 0 deletions inst/unitTests/test_locateVariants-methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -178,3 +178,34 @@ test_locateVariants_match_predictCoding <- function()
start=c(5, 77054, 77054, 77058, 77057, 77057, 77055),
end=c(55, 77055, 77055, 77058, 77058, 77058, 77054)),
paramRangeID=rep(NA, 7))

## ---------------------------------------------------------------
## Test for GitHub issue #55: PRECEDEID/FOLLOWID should be
## strand-aware for intergenic variants.
## ---------------------------------------------------------------
test_locateVariants_intergenic_strand_issue55 <- function()
{
## Variant at position 500.
## Gene A: + strand, right of variant (700-1000)
## → variant is upstream of Gene A → PRECEDEID
## Gene B: - strand, right of variant (700-1000)
## → variant is downstream of Gene B → FOLLOWID
## Gene C: + strand, left of variant (100-300)
## → variant is downstream of Gene C → FOLLOWID
## Gene D: - strand, left of variant (100-300)
## → variant is upstream of Gene D → PRECEDEID
query <- GRanges("chr1", IRanges(500, 500))
subject <- GRangesList(
"GeneA" = GRanges("chr1", IRanges(700, 1000), strand="+"),
"GeneB" = GRanges("chr1", IRanges(700, 1000), strand="-"),
"GeneC" = GRanges("chr1", IRanges(100, 300), strand="+"),
"GeneD" = GRanges("chr1", IRanges(100, 300), strand="-")
)
loc <- locateVariants(query, subject, IntergenicVariants(500, 500))
checkTrue(length(loc) == 1L)

prec <- sort(as.character(loc$PRECEDEID[[1]]))
foll <- sort(as.character(loc$FOLLOWID[[1]]))
checkIdentical(prec, c("GeneA", "GeneD"))
checkIdentical(foll, c("GeneB", "GeneC"))
}