From 3d1a308e1bbe3bb8e126ca78c16012959616d4d0 Mon Sep 17 00:00:00 2001 From: John Muirhead-Gould Date: Wed, 1 Jul 2026 19:41:49 -0400 Subject: [PATCH] fix: preserve '*' spanning deletion alleles at read time (#65) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VCFs containing '*' (spanning deletion) alleles are now read with a CharacterList ALT column, preserving the '*' character. Previously, .formatALT() converted '*' to an empty string to fit into DNAStringSet, but this made it impossible to write valid VCF output — IGV/htsjdk would reject files with empty alleles. The fix treats '*' as a structural/special allele (like symbolic alleles such as '<DEL>'), which causes .isStructural() to return TRUE and the ALT to be stored as a CharacterList. This is semantically correct per the VCF spec: '*' represents a spanning deletion and is not a DNA sequence. For VCFs without '*' alleles, behavior is unchanged (DNAStringSetList). This supersedes the write-path-only fix in PR #103, which could only handle the multi-allele case. With this read-time fix, both single '*' and multi-allele '*' are correctly round-tripped. Closes #65 --- DESCRIPTION | 2 +- NEWS | 15 +++++++++++++++ R/AllUtilities.R | 6 ++++-- 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 9bdc489..7967a72 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,7 +3,7 @@ Type: Package Title: Annotation of Genetic Variants Description: Annotate variants, compute amino acid coding changes, predict coding outcomes. -Version: 1.59.0 +Version: 1.59.1 Authors@R: c( person("Valerie", "Oberchain", role="aut"), person("Martin", "Morgan", role="aut"), diff --git a/NEWS b/NEWS index 6fcc401..ce2e22f 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,18 @@ +CHANGES IN VERSION 1.59.1 +------------------------- + +BUG FIXES + + o readVcf() now preserves '*' (spanning deletion) alleles instead of + converting them to empty strings. 
VCFs containing '*' alleles are + returned with a CharacterList ALT column (same as structural variant + VCFs), allowing faithful round-trip through writeVcf(). Previously, + '*' was erased at read time making it impossible to write valid VCF + output for files containing spanning deletions — IGV/htsjdk would + reject the output with "empty alleles are not permitted". VCFs + without '*' alleles continue to use DNAStringSetList as before. + (GitHub issue #65) + CHANGES IN VERSION 1.36.0 ------------------------- diff --git a/R/AllUtilities.R b/R/AllUtilities.R index e0478c2..bd7485b 100644 --- a/R/AllUtilities.R +++ b/R/AllUtilities.R @@ -110,7 +110,6 @@ CharacterList(x) } else { flat[grepl("I", flat, fixed=TRUE)] <- "." - flat[grepl("*", flat, fixed=TRUE)] <- "" relist(DNAStringSet(flat), x) } } @@ -118,12 +117,15 @@ ## The grep for '.' here is looking for '.' as *part* of the ALT field. ## If the ALT were '.' only, with no other characters, it would have been ## converted to an empty string in the C code before it reached this point. +## '*' is the VCF spanning deletion allele — not valid DNA, treated as +## structural so it's preserved in a CharacterList (GitHub issue #65). .isStructural <- function(x) { grepl("<", x, fixed=TRUE) | grepl("[", x, fixed=TRUE) | grepl("]", x, fixed=TRUE) | - grepl(".", x, fixed=TRUE) + grepl(".", x, fixed=TRUE) | + grepl("*", x, fixed=TRUE) } .formatInfo <- function(x, hdr, nrecords)