diff --git a/DESCRIPTION b/DESCRIPTION index 9bdc489..7967a72 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,7 +3,7 @@ Type: Package Title: Annotation of Genetic Variants Description: Annotate variants, compute amino acid coding changes, predict coding outcomes. -Version: 1.59.0 +Version: 1.59.1 Authors@R: c( person("Valerie", "Oberchain", role="aut"), person("Martin", "Morgan", role="aut"), diff --git a/NEWS b/NEWS index 6fcc401..a54ba1e 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,20 @@ +CHANGES IN VERSION 1.59.1 +------------------------- + +BUG FIXES + + o locateVariants() with IntergenicVariants now correctly classifies + flanking genes into PRECEDEID and FOLLOWID based on gene strand. + Previously, classification was purely positional (genes to the right + went to PRECEDEID, genes to the left went to FOLLOWID) regardless + of the gene's transcriptional orientation. Now: + - PRECEDEID contains genes the variant is upstream of (i.e., the + variant precedes the gene's TSS in transcriptional direction) + - FOLLOWID contains genes the variant is downstream of (i.e., the + variant follows the gene's TSS in transcriptional direction) + This matches Ensembl VEP's upstream_gene_variant/downstream_gene_variant + classification. (GitHub issue #55) + CHANGES IN VERSION 1.36.0 ------------------------- diff --git a/R/methods-locateVariants.R b/R/methods-locateVariants.R index b85e287..77fa59e 100644 --- a/R/methods-locateVariants.R +++ b/R/methods-locateVariants.R @@ -484,19 +484,53 @@ setMethod("locateVariants", c("GRanges", "TxDb", "AllVariants"), q_range <- query[intergenic] s_genes <- rep(names(subject), elementNROWS(s_range)) s_unlist <- unlist(s_range, use.names=FALSE) + s_strand <- as.character(strand(s_unlist)) - ## ID all genes that fall in upstream / downstream range. - ## upstream == follow: + ## Find genes in upstream window (left of variant for +/* queries) f_range <- .shiftRangeUpDown(q_range, upstream(region), TRUE) - f_fo <- findOverlaps(f_range, s_unlist, ignore.strand=ignore.strand) - f_factor <- factor(queryHits(f_fo), seq_len(queryLength(f_fo))) - f_genes <- unname(splitAsList(s_genes[subjectHits(f_fo)], f_factor)) - - ## downstream == precede: + f_fo <- findOverlaps(f_range, s_unlist, ignore.strand=TRUE) + + ## Find genes in downstream window (right of variant for +/* queries) p_range <- .shiftRangeUpDown(q_range, downstream(region), FALSE) - p_fo <- findOverlaps(p_range, s_unlist, ignore.strand=ignore.strand) - p_factor <- factor(queryHits(p_fo), seq_len(queryLength(p_fo))) - p_genes <- unname(splitAsList(s_genes[subjectHits(p_fo)], p_factor)) + p_fo <- findOverlaps(p_range, s_unlist, ignore.strand=TRUE) + + ## Strand-aware classification (GitHub issue #55): + ## PRECEDEID = genes the variant is upstream of (variant precedes gene TSS) + ## - '+' strand gene to the RIGHT of variant (downstream window) + ## - '-' strand gene to the LEFT of variant (upstream window) + ## FOLLOWID = genes the variant is downstream of (variant follows gene TSS) + ## - '+' strand gene to the LEFT of variant (upstream window) + ## - '-' strand gene to the RIGHT of variant (downstream window) + + ## From upstream window: '+' genes → FOLLOWID, '-' genes → PRECEDEID + f_qhits <- queryHits(f_fo) + f_shits <- subjectHits(f_fo) + f_strands <- s_strand[f_shits] + f_gnames <- s_genes[f_shits] + + ## From downstream window: '+' genes → PRECEDEID, '-' genes → FOLLOWID + p_qhits <- queryHits(p_fo) + p_shits <- subjectHits(p_fo) + p_strands <- s_strand[p_shits] + p_gnames <- s_genes[p_shits] + + nq <- length(q_range) + + ## Build PRECEDEID: '+' from downstream + '-' from upstream + prec_qhits <- c(p_qhits[p_strands != "-"], + f_qhits[f_strands == "-"]) + prec_genes <- c(p_gnames[p_strands != "-"], + f_gnames[f_strands == "-"]) + prec_factor <- factor(prec_qhits, seq_len(nq)) + p_genes <- unname(splitAsList(prec_genes, prec_factor)) + + ## Build FOLLOWID: '+' from upstream + '-' from downstream + foll_qhits <- c(f_qhits[f_strands != "-"], + p_qhits[p_strands == "-"]) + foll_genes <- c(f_gnames[f_strands != "-"], + p_gnames[p_strands == "-"]) + foll_factor <- factor(foll_qhits, seq_len(nq)) + f_genes <- unname(splitAsList(foll_genes, foll_factor)) if (ignore.strand) strand(q_range) <- "*" diff --git a/inst/unitTests/test_locateVariants-methods.R b/inst/unitTests/test_locateVariants-methods.R index f2f181d..db7f7f3 100644 --- a/inst/unitTests/test_locateVariants-methods.R +++ b/inst/unitTests/test_locateVariants-methods.R @@ -178,3 +178,34 @@ test_locateVariants_match_predictCoding <- function() start=c(5, 77054, 77054, 77058, 77057, 77057, 77055), end=c(55, 77055, 77055, 77058, 77058, 77058, 77054)), paramRangeID=rep(NA, 7)) + +## --------------------------------------------------------------- +## Test for GitHub issue #55: PRECEDEID/FOLLOWID should be +## strand-aware for intergenic variants. +## --------------------------------------------------------------- +test_locateVariants_intergenic_strand_issue55 <- function() +{ + ## Variant at position 500. + ## Gene A: + strand, right of variant (700-1000) + ## → variant is upstream of Gene A → PRECEDEID + ## Gene B: - strand, right of variant (700-1000) + ## → variant is downstream of Gene B → FOLLOWID + ## Gene C: + strand, left of variant (100-300) + ## → variant is downstream of Gene C → FOLLOWID + ## Gene D: - strand, left of variant (100-300) + ## → variant is upstream of Gene D → PRECEDEID + query <- GRanges("chr1", IRanges(500, 500)) + subject <- GRangesList( + "GeneA" = GRanges("chr1", IRanges(700, 1000), strand="+"), + "GeneB" = GRanges("chr1", IRanges(700, 1000), strand="-"), + "GeneC" = GRanges("chr1", IRanges(100, 300), strand="+"), + "GeneD" = GRanges("chr1", IRanges(100, 300), strand="-") + ) + loc <- locateVariants(query, subject, IntergenicVariants(500, 500)) + checkTrue(length(loc) == 1L) + + prec <- sort(as.character(loc$PRECEDEID[[1]])) + foll <- sort(as.character(loc$FOLLOWID[[1]])) + checkIdentical(prec, c("GeneA", "GeneD")) + checkIdentical(foll, c("GeneB", "GeneC")) +}