diff --git a/DESCRIPTION b/DESCRIPTION index 5dff4ca..5bca7b7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: natcpp Title: Fast C++ Primitives for the 'NeuroAnatomy Toolbox' -Version: 0.2 +Version: 0.3.0 Authors@R: person(given = "Gregory", family = "Jefferis", @@ -18,14 +18,17 @@ Description: Fast functions implemented in C++ via 'Rcpp' to support the License: GPL (>= 3) URL: https://github.com/natverse/natcpp, https://natverse.org/natcpp/ BugReports: https://github.com/natverse/natcpp/issues -Imports: +Imports: + Matrix, + methods, Rcpp (>= 1.0.6) -Suggests: +Suggests: spelling, testthat (>= 3.0.0) -LinkingTo: - Rcpp +LinkingTo: + Rcpp, + RcppThread Config/testthat/edition: 3 Encoding: UTF-8 Language: en-GB -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 diff --git a/NAMESPACE b/NAMESPACE index 504bfff..b0708b5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -10,5 +10,8 @@ export(c_sub2ind) export(c_topntail) export(c_topntail_list) export(c_total_cable) +export(c_weighted_jaccard_dense) +export(c_weighted_jaccard_sparse) import(Rcpp) +importFrom(methods,as) useDynLib(natcpp, .registration = TRUE) diff --git a/R/RcppExports.R b/R/RcppExports.R index c90d42d..c113355 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -135,3 +135,32 @@ c_EdgeListFromSegList <- function(L) { .Call(`_natcpp_c_EdgeListFromSegList`, L) } +#' @noRd +weighted_jaccard_sparse_fill <- function(x, pattern, transpose = FALSE, display_progress = TRUE, threads = 4L) { + .Call(`_natcpp_weighted_jaccard_sparse_fill`, x, pattern, transpose, display_progress, threads) +} + +#' Dense weighted Jaccard similarity via C++ +#' +#' Compute the full weighted Jaccard similarity matrix for a +#' \link[Matrix:dgCMatrix-class]{dgCMatrix} using +#' an adaptive dense accumulation strategy. For small output matrices, uses a +#' feature-oriented loop; for large outputs, switches to a column-oriented +#' loop for better cache performance. +#' +#' @param x A \link[Matrix:dgCMatrix-class]{dgCMatrix} (sparse column-compressed matrix) +#' @param transpose If \code{FALSE}, compare columns; if \code{TRUE}, compare +#' rows +#' @param threads Number of threads (default 4). Set to 0 for all cores. +#' @param triangle If \code{TRUE}, return only the lower triangle as a flat +#' numeric vector in \code{\link{dist}} layout. If \code{FALSE} (default), +#' return a full square matrix. +#' @param distance If \code{TRUE}, return distance (\code{1 - similarity}) +#' instead of similarity. Default \code{FALSE}. +#' @return A dense numeric similarity matrix, or a numeric vector in +#' \code{\link{dist}} layout when \code{triangle = TRUE}. +#' @export +c_weighted_jaccard_dense <- function(x, transpose = FALSE, threads = 4L, triangle = FALSE, distance = FALSE) { + .Call(`_natcpp_c_weighted_jaccard_dense`, x, transpose, threads, triangle, distance) +} + diff --git a/R/weighted_jaccard.R b/R/weighted_jaccard.R new file mode 100644 index 0000000..870a61e --- /dev/null +++ b/R/weighted_jaccard.R @@ -0,0 +1,87 @@ +#' Sparse weighted Jaccard similarity via C++ +#' +#' Compute the weighted Jaccard similarity matrix for a +#' \link[Matrix:dgCMatrix-class]{dgCMatrix}, returning a +#' sparse result. Uses \code{weighted_jaccard_sparse_fill} to compute min-sums +#' only for column (or row) pairs that share at least one non-zero feature, then +#' normalises to similarity. Only the upper triangle is computed, taking +#' advantage of the symmetry of the Jaccard index. +#' +#' @param x A \link[Matrix:dgCMatrix-class]{dgCMatrix} (sparse column-compressed matrix) +#' @param transpose If \code{FALSE} (default), compare columns; if +#' \code{TRUE}, compare rows. +#' @param display_progress Whether to show a text progress bar (default +#' \code{TRUE}). +#' @param threads Number of threads for parallel computation (default 4). +#' Set to 0 to use all available cores. +#' @param triangle If \code{TRUE}, return a symmetric \code{dsCMatrix} +#' (upper triangle only). If \code{FALSE} (default), return a general +#' \link[Matrix:dgCMatrix-class]{dgCMatrix}. +#' @param distance If \code{TRUE}, return distance (\code{1 - similarity}) +#' instead of similarity. Default \code{FALSE}. A warning is issued since +#' sparse distance matrices are typically dense. +#' @return A sparse similarity (or distance) matrix: \code{dsCMatrix} when +#' \code{triangle = TRUE}, \link[Matrix:dgCMatrix-class]{dgCMatrix} otherwise. +#' @importFrom methods as +#' @export +#' @seealso \code{\link{c_weighted_jaccard_dense}} for the dense equivalent +#' @examples +#' \dontrun{ +#' library(Matrix) +#' m <- sparseMatrix(i = c(1,2,1,2,3,3), j = c(1,1,2,2,2,3), +#' x = c(4,2,1,3,3,1), dims = c(3,3)) +#' c_weighted_jaccard_sparse(m) +#' } +c_weighted_jaccard_sparse <- function(x, transpose = FALSE, display_progress = TRUE, + threads = 4L, triangle = FALSE, + distance = FALSE) { + if (distance) + warning("distance=TRUE with sparse output produces a mostly-dense matrix; ", + "consider using c_weighted_jaccard_dense() with triangle=TRUE instead.") + + crossfun <- if (transpose) Matrix::tcrossprod else Matrix::crossprod + n <- if (transpose) nrow(x) else ncol(x) + + if (length(x@x) == 0L) { + sim_val <- if (distance) 0 else 1 + m <- Matrix::sparseMatrix(i = seq_len(n), j = seq_len(n), x = sim_val, + dims = c(n, n), symmetric = triangle) + if (!triangle) + m <- as(m, "generalMatrix") + return(m) + } + + # Sparsity pattern from binarised crossprod — dsCMatrix (upper triangle) + b <- x + b@x <- rep(1, length(b@x)) + A <- crossfun(b) + + # Column/row totals for denominator + totals <- if (transpose) Matrix::rowSums(x) else Matrix::colSums(x) + + # Fill in min_sums using C++ (only upper triangle entries) + A@x <- weighted_jaccard_sparse_fill(x, A, transpose = transpose, + display_progress = display_progress, + threads = threads) + + # Convert min_sums to similarity: sim = ms / (s_i + s_j - ms) + col_idx <- rep(seq_along(diff(A@p)), diff(A@p)) + row_idx <- A@i + 1L + denom <- totals[row_idx] + totals[col_idx] - A@x + nonzero <- denom != 0 + A@x[nonzero] <- A@x[nonzero] / denom[nonzero] + A@x[!nonzero] <- 0 + + # Set diagonal to 1 for similarity + diag_pos <- which(A@i + 1L == col_idx) + A@x[diag_pos] <- 1 + + if (distance) { + A@x <- 1 - A@x # diagonal 1→0, off-diag sim→1-sim + } + + if (!triangle) + A <- as(A, "generalMatrix") + + A +} diff --git a/inst/WORDLIST b/inst/WORDLIST index 93c16a1..29595b8 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -1,4 +1,6 @@ CMD +Codecov +Jaccard Lifecycle NeuroAnatomy Nx diff --git a/man/c_weighted_jaccard_dense.Rd b/man/c_weighted_jaccard_dense.Rd new file mode 100644 index 0000000..e1b1f9c --- /dev/null +++ b/man/c_weighted_jaccard_dense.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{c_weighted_jaccard_dense} +\alias{c_weighted_jaccard_dense} +\title{Dense weighted Jaccard similarity via C++} +\usage{ +c_weighted_jaccard_dense( + x, + transpose = FALSE, + threads = 4L, + triangle = FALSE, + distance = FALSE +) +} +\arguments{ +\item{x}{A \link[Matrix:dgCMatrix-class]{dgCMatrix} (sparse column-compressed matrix)} + +\item{transpose}{If \code{FALSE}, compare columns; if \code{TRUE}, compare +rows} + +\item{threads}{Number of threads (default 4). Set to 0 for all cores.} + +\item{triangle}{If \code{TRUE}, return only the lower triangle as a flat +numeric vector in \code{\link{dist}} layout. If \code{FALSE} (default), +return a full square matrix.} + +\item{distance}{If \code{TRUE}, return distance (\code{1 - similarity}) +instead of similarity. Default \code{FALSE}.} +} +\value{ +A dense numeric similarity matrix, or a numeric vector in + \code{\link{dist}} layout when \code{triangle = TRUE}. +} +\description{ +Compute the full weighted Jaccard similarity matrix for a +\link[Matrix:dgCMatrix-class]{dgCMatrix} using +an adaptive dense accumulation strategy. For small output matrices, uses a +feature-oriented loop; for large outputs, switches to a column-oriented +loop for better cache performance. +} diff --git a/man/c_weighted_jaccard_sparse.Rd b/man/c_weighted_jaccard_sparse.Rd new file mode 100644 index 0000000..cadadda --- /dev/null +++ b/man/c_weighted_jaccard_sparse.Rd @@ -0,0 +1,58 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/weighted_jaccard.R +\name{c_weighted_jaccard_sparse} +\alias{c_weighted_jaccard_sparse} +\title{Sparse weighted Jaccard similarity via C++} +\usage{ +c_weighted_jaccard_sparse( + x, + transpose = FALSE, + display_progress = TRUE, + threads = 4L, + triangle = FALSE, + distance = FALSE +) +} +\arguments{ +\item{x}{A \link[Matrix:dgCMatrix-class]{dgCMatrix} (sparse column-compressed matrix)} + +\item{transpose}{If \code{FALSE} (default), compare columns; if +\code{TRUE}, compare rows.} + +\item{display_progress}{Whether to show a text progress bar (default +\code{TRUE}).} + +\item{threads}{Number of threads for parallel computation (default 4). +Set to 0 to use all available cores.} + +\item{triangle}{If \code{TRUE}, return a symmetric \code{dsCMatrix} +(upper triangle only). If \code{FALSE} (default), return a general +\link[Matrix:dgCMatrix-class]{dgCMatrix}.} + +\item{distance}{If \code{TRUE}, return distance (\code{1 - similarity}) +instead of similarity. Default \code{FALSE}. A warning is issued since +sparse distance matrices are typically dense.} +} +\value{ +A sparse similarity (or distance) matrix: \code{dsCMatrix} when + \code{triangle = TRUE}, \link[Matrix:dgCMatrix-class]{dgCMatrix} otherwise. +} +\description{ +Compute the weighted Jaccard similarity matrix for a +\link[Matrix:dgCMatrix-class]{dgCMatrix}, returning a +sparse result. Uses \code{weighted_jaccard_sparse_fill} to compute min-sums +only for column (or row) pairs that share at least one non-zero feature, then +normalises to similarity. Only the upper triangle is computed, taking +advantage of the symmetry of the Jaccard index. +} +\examples{ +\dontrun{ +library(Matrix) +m <- sparseMatrix(i = c(1,2,1,2,3,3), j = c(1,1,2,2,2,3), + x = c(4,2,1,3,3,1), dims = c(3,3)) +c_weighted_jaccard_sparse(m) +} +} +\seealso{ +\code{\link{c_weighted_jaccard_dense}} for the dense equivalent +} diff --git a/src/Makevars b/src/Makevars new file mode 100644 index 0000000..4c6d109 --- /dev/null +++ b/src/Makevars @@ -0,0 +1 @@ +PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 3415f85..e6ff2f2 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -1,6 +1,7 @@ // Generated by using Rcpp::compileAttributes() -> do not edit by hand // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 +#include #include using namespace Rcpp; @@ -135,6 +136,36 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// weighted_jaccard_sparse_fill +NumericVector weighted_jaccard_sparse_fill(const S4& x, const S4& pattern, bool transpose, bool display_progress, int threads); +RcppExport SEXP _natcpp_weighted_jaccard_sparse_fill(SEXP xSEXP, SEXP patternSEXP, SEXP transposeSEXP, SEXP display_progressSEXP, SEXP threadsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const S4& >::type x(xSEXP); + Rcpp::traits::input_parameter< const S4& >::type pattern(patternSEXP); + Rcpp::traits::input_parameter< bool >::type transpose(transposeSEXP); + Rcpp::traits::input_parameter< bool >::type display_progress(display_progressSEXP); + Rcpp::traits::input_parameter< int >::type threads(threadsSEXP); + rcpp_result_gen = Rcpp::wrap(weighted_jaccard_sparse_fill(x, pattern, transpose, display_progress, threads)); + return rcpp_result_gen; +END_RCPP +} +// c_weighted_jaccard_dense +SEXP c_weighted_jaccard_dense(const S4& x, bool transpose, int threads, bool triangle, bool distance); +RcppExport SEXP _natcpp_c_weighted_jaccard_dense(SEXP xSEXP, SEXP transposeSEXP, SEXP threadsSEXP, SEXP triangleSEXP, SEXP distanceSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const S4& >::type x(xSEXP); + Rcpp::traits::input_parameter< bool >::type transpose(transposeSEXP); + Rcpp::traits::input_parameter< int >::type threads(threadsSEXP); + Rcpp::traits::input_parameter< bool >::type triangle(triangleSEXP); + Rcpp::traits::input_parameter< bool >::type distance(distanceSEXP); + rcpp_result_gen = Rcpp::wrap(c_weighted_jaccard_dense(x, transpose, threads, triangle, distance)); + return rcpp_result_gen; +END_RCPP +} static const R_CallMethodDef CallEntries[] = { {"_natcpp_c_seglengths", (DL_FUNC) &_natcpp_c_seglengths, 4}, @@ -147,6 +178,8 @@ static const R_CallMethodDef CallEntries[] = { {"_natcpp_c_topntail", (DL_FUNC) &_natcpp_c_topntail, 1}, {"_natcpp_c_topntail_list", (DL_FUNC) &_natcpp_c_topntail_list, 1}, {"_natcpp_c_EdgeListFromSegList", (DL_FUNC) &_natcpp_c_EdgeListFromSegList, 1}, + {"_natcpp_weighted_jaccard_sparse_fill", (DL_FUNC) &_natcpp_weighted_jaccard_sparse_fill, 5}, + {"_natcpp_c_weighted_jaccard_dense", (DL_FUNC) &_natcpp_c_weighted_jaccard_dense, 5}, {NULL, NULL, 0} }; diff --git a/src/weighted_jaccard.cpp b/src/weighted_jaccard.cpp new file mode 100644 index 0000000..396b542 --- /dev/null +++ b/src/weighted_jaccard.cpp @@ -0,0 +1,481 @@ +#include +#include +#include +#include +#include +#include + +using namespace Rcpp; + +// Progress bar that suppresses output for the first `delaySecs` seconds, +// then shows a bar with time estimate (using RcppThread's Rcout). +class DelayedProgressBar : public RcppThread::ProgressPrinter { +public: + DelayedProgressBar(size_t numIt, size_t printEvery, size_t delaySecs, + bool display = true) + : ProgressPrinter(numIt, printEvery), delaySecs_(delaySecs), + display_(display) {} + +private: + size_t delaySecs_; + bool display_; + bool started_{false}; + + void printProgress() override { + if (!display_ || isDone_) return; + if (it_ == numIt_) isDone_ = true; + + using namespace std::chrono; + auto elapsed = duration(steady_clock::now() - startTime_).count(); + if (!started_ && elapsed < delaySecs_) return; + started_ = true; + + double pct = std::round(it_ * 100.0 / numIt_); + std::ostringstream msg; + msg << "\rComputing: " << makeBar(pct) << progressString(); + RcppThread::Rcout << msg.str(); + } + + std::string makeBar(size_t pct, size_t numBars = 40) { + std::ostringstream msg; + msg << "["; + size_t i = 0; + for (; i < pct / 100.0 * numBars; i++) msg << "="; + for (; i < numBars; i++) msg << " "; + msg << "] "; + return msg.str(); + } +}; + +//' @noRd +// [[Rcpp::export]] +NumericVector weighted_jaccard_sparse_fill( + const S4& x, const S4& pattern, bool transpose = false, + bool display_progress = true, int threads = 4) { + + // Original matrix slots (dgCMatrix: nr x nc) + IntegerVector dims = x.slot("Dim"); + IntegerVector xi = x.slot("i"); + IntegerVector xp = x.slot("p"); + NumericVector xx = x.slot("x"); + + const int nr = dims[0]; + const int nc = dims[1]; + const int nfeat = transpose ? nc : nr; + + // Pattern matrix slots (dgCMatrix: ncomp x ncomp) + IntegerVector Ai = pattern.slot("i"); + IntegerVector Ap = pattern.slot("p"); + IntegerVector Adims = pattern.slot("Dim"); + const int ncomp = Adims[0]; + const int annz = Ai.size(); + + // Output: min_sums values aligned with pattern's (i, p) structure + NumericVector out(annz, 0.0); + + // Build CSR for feature dimension: for each feature f, the items (columns + // or rows being compared) that are nonzero in that feature, with their values. + std::vector feat_count(nfeat, 0); + if (!transpose) { + for (int col = 0; col < nc; ++col) + for (int idx = xp[col]; idx < xp[col + 1]; ++idx) + feat_count[xi[idx]]++; + } else { + for (int col = 0; col < nc; ++col) + feat_count[col] = xp[col + 1] - xp[col]; + } + + std::vector feat_offsets(nfeat + 1, 0); + for (int f = 0; f < nfeat; ++f) + feat_offsets[f + 1] = feat_offsets[f] + feat_count[f]; + + const int total_nnz = feat_offsets[nfeat]; + std::vector feat_items(total_nnz); // which item (col/row being compared) + std::vector feat_vals(total_nnz); // value of x at that position + std::vector feat_fill(feat_offsets.begin(), feat_offsets.end()); + + if (!transpose) { + for (int col = 0; col < nc; ++col) { + for (int idx = xp[col]; idx < xp[col + 1]; ++idx) { + const int row = xi[idx]; + const int dest = feat_fill[row]++; + feat_items[dest] = col; + feat_vals[dest] = xx[idx]; + } + } + } else { + for (int col = 0; col < nc; ++col) { + for (int idx = xp[col]; idx < xp[col + 1]; ++idx) { + const int dest = feat_fill[col]++; + feat_items[dest] = xi[idx]; + feat_vals[dest] = xx[idx]; + } + } + } + + // Also need item->features CSR: for each compared item, which features + // it participates in and with what value. + std::vector item_count(ncomp, 0); + for (int f = 0; f < nfeat; ++f) { + for (int idx = feat_offsets[f]; idx < feat_offsets[f + 1]; ++idx) + item_count[feat_items[idx]]++; + } + + std::vector item_offsets(ncomp + 1, 0); + for (int c = 0; c < ncomp; ++c) + item_offsets[c + 1] = item_offsets[c] + item_count[c]; + + std::vector item_feats(total_nnz); // which feature + std::vector item_vals(total_nnz); // value + std::vector item_fill(item_offsets.begin(), item_offsets.end()); + + for (int f = 0; f < nfeat; ++f) { + for (int idx = feat_offsets[f]; idx < feat_offsets[f + 1]; ++idx) { + const int item = feat_items[idx]; + const int dest = item_fill[item]++; + item_feats[dest] = f; + item_vals[dest] = feat_vals[idx]; + } + } + + // Raw pointer for thread-safe access (Rcpp NumericVector not thread-safe) + double* out_ptr = out.begin(); + const int* Ai_ptr = &Ai[0]; + const int* Ap_ptr = &Ap[0]; + const int* item_feats_ptr = item_feats.data(); + const double* item_vals_ptr = item_vals.data(); + const int* feat_items_ptr = feat_items.data(); + const double* feat_vals_ptr = feat_vals.data(); + const int* feat_offsets_ptr = feat_offsets.data(); + const int* item_offsets_ptr = item_offsets.data(); + + DelayedProgressBar bar(ncomp, 1, 2, display_progress); + + const size_t nThreads = (threads > 0) ? static_cast(threads) + : std::thread::hardware_concurrency(); + + // Pre-allocate per-thread row_to_pos vectors (freed when function exits) + std::vector> per_thread_r2p(nThreads, std::vector(ncomp, -1)); + + // Thread-local index into per_thread_r2p, assigned once per thread via atomic + static thread_local int tl_tid = -1; + static thread_local int tl_gen = -1; + static std::atomic generation{0}; + const int cur_gen = ++generation; + std::atomic next_tid{0}; + + // Parallel over output columns — each column writes to its own slice of out[] + RcppThread::parallelFor(0, ncomp, [&](int cb) { + if (tl_gen != cur_gen) { + tl_tid = next_tid++; + tl_gen = cur_gen; + } + auto& row_to_pos = per_thread_r2p[tl_tid]; + + // Populate row_to_pos for pattern column cb + for (int idx = Ap_ptr[cb]; idx < Ap_ptr[cb + 1]; ++idx) + row_to_pos[Ai_ptr[idx]] = idx; + + // For each feature f where cb is nonzero + for (int fi = item_offsets_ptr[cb]; fi < item_offsets_ptr[cb + 1]; ++fi) { + const int f = item_feats_ptr[fi]; + const double vb = item_vals_ptr[fi]; + + // For each other item ca also in this feature, accumulate min + for (int idx = feat_offsets_ptr[f]; idx < feat_offsets_ptr[f + 1]; ++idx) { + const int ca = feat_items_ptr[idx]; + const int pos = row_to_pos[ca]; + if (pos >= 0) + out_ptr[pos] += std::min(feat_vals_ptr[idx], vb); + } + } + + // Clear row_to_pos + for (int idx = Ap_ptr[cb]; idx < Ap_ptr[cb + 1]; ++idx) + row_to_pos[Ai_ptr[idx]] = -1; + + ++bar; + }, nThreads); + + return out; +} + +// Helper: build feature CSR and totals from dgCMatrix slots. +// Shared by both dense matrix and dense triangle functions. +struct FeatureCSR { + std::vector totals; + std::vector feat_offsets; + std::vector feat_items; + std::vector feat_vals; + int ncomp; + int nfeat; + int total_nnz; +}; + +static FeatureCSR build_feature_csr(const IntegerVector& xi, const IntegerVector& xp, + const NumericVector& xx, int nr, int nc, + bool transpose) { + FeatureCSR csr; + csr.ncomp = transpose ? nr : nc; + csr.nfeat = transpose ? nc : nr; + csr.totals.resize(csr.ncomp, 0.0); + + std::vector feat_count(csr.nfeat, 0); + if (!transpose) { + for (int col = 0; col < nc; ++col) { + for (int idx = xp[col]; idx < xp[col + 1]; ++idx) { + csr.totals[col] += xx[idx]; + feat_count[xi[idx]]++; + } + } + } else { + for (int col = 0; col < nc; ++col) { + feat_count[col] = xp[col + 1] - xp[col]; + for (int idx = xp[col]; idx < xp[col + 1]; ++idx) { + csr.totals[xi[idx]] += xx[idx]; + } + } + } + + csr.feat_offsets.resize(csr.nfeat + 1, 0); + for (int f = 0; f < csr.nfeat; ++f) + csr.feat_offsets[f + 1] = csr.feat_offsets[f] + feat_count[f]; + + csr.total_nnz = csr.feat_offsets[csr.nfeat]; + csr.feat_items.resize(csr.total_nnz); + csr.feat_vals.resize(csr.total_nnz); + std::vector feat_fill(csr.feat_offsets.begin(), csr.feat_offsets.end()); + + if (!transpose) { + for (int col = 0; col < nc; ++col) { + for (int idx = xp[col]; idx < xp[col + 1]; ++idx) { + const int row = xi[idx]; + const int dest = feat_fill[row]++; + csr.feat_items[dest] = col; + csr.feat_vals[dest] = xx[idx]; + } + } + } else { + for (int col = 0; col < nc; ++col) { + for (int idx = xp[col]; idx < xp[col + 1]; ++idx) { + const int dest = feat_fill[col]++; + csr.feat_items[dest] = xi[idx]; + csr.feat_vals[dest] = xx[idx]; + } + } + } + + return csr; +} + +//' Dense weighted Jaccard similarity via C++ +//' +//' Compute the full weighted Jaccard similarity matrix for a +//' \link[Matrix:dgCMatrix-class]{dgCMatrix} using +//' an adaptive dense accumulation strategy. For small output matrices, uses a +//' feature-oriented loop; for large outputs, switches to a column-oriented +//' loop for better cache performance. +//' +//' @param x A \link[Matrix:dgCMatrix-class]{dgCMatrix} (sparse column-compressed matrix) +//' @param transpose If \code{FALSE}, compare columns; if \code{TRUE}, compare +//' rows +//' @param threads Number of threads (default 4). Set to 0 for all cores. +//' @param triangle If \code{TRUE}, return only the lower triangle as a flat +//' numeric vector in \code{\link{dist}} layout. If \code{FALSE} (default), +//' return a full square matrix. +//' @param distance If \code{TRUE}, return distance (\code{1 - similarity}) +//' instead of similarity. Default \code{FALSE}. +//' @return A dense numeric similarity matrix, or a numeric vector in +//' \code{\link{dist}} layout when \code{triangle = TRUE}. +//' @export +// [[Rcpp::export]] +SEXP c_weighted_jaccard_dense(const S4& x, bool transpose = false, + int threads = 4, bool triangle = false, + bool distance = false) { + IntegerVector dims = x.slot("Dim"); + IntegerVector xi = x.slot("i"); + IntegerVector xp = x.slot("p"); + NumericVector xx = x.slot("x"); + + const int nr = dims[0]; + const int nc = dims[1]; + + FeatureCSR csr = build_feature_csr(xi, xp, xx, nr, nc, transpose); + const int ncomp = csr.ncomp; + const int nfeat = csr.nfeat; + const size_t nT = (threads > 0) ? static_cast(threads) + : std::thread::hardware_concurrency(); + + if (triangle) { + // --- Triangle path: output is a flat vector in dist layout --- + // dist position for pair (r,c) with r>c (1-based): + // pos = (c-1)*n - c*(c-1)/2 + (r-c) [1-based] + // We use 0-based: for (ca, cb) with ca > cb: + // pos = cb*n - cb*(cb+1)/2 + (ca-cb) - 1 [0-based] + // which simplifies to: cb*(ncomp-1) - cb*(cb-1)/2 + (ca-1) ... let's use a + // direct formula. + + const std::size_t dist_len = static_cast(ncomp) * (ncomp - 1) / 2; + NumericVector out(dist_len, 0.0); + double* out_ptr = out.begin(); + const double* totals_ptr = csr.totals.data(); + + // Build item->features CSR (always needed for colwise triangle) + std::vector item_count(ncomp, 0); + for (int f = 0; f < nfeat; ++f) + for (int idx = csr.feat_offsets[f]; idx < csr.feat_offsets[f + 1]; ++idx) + item_count[csr.feat_items[idx]]++; + + std::vector item_offsets(ncomp + 1, 0); + for (int c = 0; c < ncomp; ++c) + item_offsets[c + 1] = item_offsets[c] + item_count[c]; + + std::vector item_feats(csr.total_nnz); + std::vector item_vals(csr.total_nnz); + std::vector item_fill(item_offsets.begin(), item_offsets.end()); + + for (int f = 0; f < nfeat; ++f) { + for (int idx = csr.feat_offsets[f]; idx < csr.feat_offsets[f + 1]; ++idx) { + const int item = csr.feat_items[idx]; + const int dest = item_fill[item]++; + item_feats[dest] = f; + item_vals[dest] = csr.feat_vals[idx]; + } + } + + const int* feat_items_ptr = csr.feat_items.data(); + const double* feat_vals_ptr = csr.feat_vals.data(); + const int* feat_offsets_ptr = csr.feat_offsets.data(); + const int* item_feats_ptr = item_feats.data(); + const double* item_vals_ptr = item_vals.data(); + const int* item_offsets_ptr = item_offsets.data(); + + // For column cb, accumulate min-sums for rows ca > cb only, + // writing directly into dist layout. + // dist index for (ca, cb) with ca > cb (0-based items, 0-based output): + // pos = cb * ncomp - cb*(cb+1)/2 + (ca - cb - 1) + RcppThread::parallelFor(0, ncomp - 1, [=](int cb) { + const std::size_t base = static_cast(cb) * ncomp + - static_cast(cb) * (cb + 1) / 2 + - static_cast(cb) - 1; + // Thread-local accumulator for this column's min-sums (only rows > cb) + std::vector col_ms(ncomp, 0.0); + + for (int fi = item_offsets_ptr[cb]; fi < item_offsets_ptr[cb + 1]; ++fi) { + const int f = item_feats_ptr[fi]; + const double vb = item_vals_ptr[fi]; + for (int idx = feat_offsets_ptr[f]; idx < feat_offsets_ptr[f + 1]; ++idx) { + const int ca = feat_items_ptr[idx]; + if (ca > cb) + col_ms[ca] += std::min(feat_vals_ptr[idx], vb); + } + } + + // Convert to similarity or distance and store + const double tb = totals_ptr[cb]; + for (int ca = cb + 1; ca < ncomp; ++ca) { + const double ms = col_ms[ca]; + const double denom = totals_ptr[ca] + tb - ms; + double sim = (denom > 0.0) ? ms / denom : 0.0; + out_ptr[base + ca] = distance ? (1.0 - sim) : sim; + } + }, nT); + + // Return as a dist object + out.attr("class") = "dist"; + out.attr("Size") = ncomp; + out.attr("Diag") = false; + out.attr("Upper") = false; + return out; + + } else { + // --- Full matrix path (existing) --- + NumericMatrix out(ncomp, ncomp); + + // Threshold: ncomp^2 * 8 bytes > ~12MB (typical L3 share per core) + const bool use_colwise = (static_cast(ncomp) * ncomp * 8 > 12 * 1024 * 1024); + + if (use_colwise) { + std::vector item_count(ncomp, 0); + for (int f = 0; f < nfeat; ++f) + for (int idx = csr.feat_offsets[f]; idx < csr.feat_offsets[f + 1]; ++idx) + item_count[csr.feat_items[idx]]++; + + std::vector item_offsets(ncomp + 1, 0); + for (int c = 0; c < ncomp; ++c) + item_offsets[c + 1] = item_offsets[c] + item_count[c]; + + std::vector item_feats(csr.total_nnz); + std::vector item_vals(csr.total_nnz); + std::vector item_fill(item_offsets.begin(), item_offsets.end()); + + for (int f = 0; f < nfeat; ++f) { + for (int idx = csr.feat_offsets[f]; idx < csr.feat_offsets[f + 1]; ++idx) { + const int item = csr.feat_items[idx]; + const int dest = item_fill[item]++; + item_feats[dest] = f; + item_vals[dest] = csr.feat_vals[idx]; + } + } + + double* out_ptr = &out[0]; + const int* feat_items_ptr = csr.feat_items.data(); + const double* feat_vals_ptr = csr.feat_vals.data(); + const int* feat_offsets_ptr = csr.feat_offsets.data(); + const int* item_feats_ptr = item_feats.data(); + const double* item_vals_ptr = item_vals.data(); + const int* item_offsets_ptr = item_offsets.data(); + + RcppThread::parallelFor(0, ncomp, [&](int cb) { + double* col = out_ptr + static_cast(cb) * ncomp; + for (int fi = item_offsets_ptr[cb]; fi < item_offsets_ptr[cb + 1]; ++fi) { + const int f = item_feats_ptr[fi]; + const double vb = item_vals_ptr[fi]; + for (int idx = feat_offsets_ptr[f]; idx < feat_offsets_ptr[f + 1]; ++idx) { + col[feat_items_ptr[idx]] += std::min(feat_vals_ptr[idx], vb); + } + } + }, nT); + } else { + // Feature-oriented: iterate features, scatter into output + for (int f = 0; f < nfeat; ++f) { + const int start = csr.feat_offsets[f]; + const int end = csr.feat_offsets[f + 1]; + const int k = end - start; + + for (int a = 0; a < k; ++a) { + const int ca = csr.feat_items[start + a]; + const double va = csr.feat_vals[start + a]; + out[static_cast(ca) * ncomp + ca] += va; + for (int b = a + 1; b < k; ++b) { + const int cb = csr.feat_items[start + b]; + const double mv = std::min(va, csr.feat_vals[start + b]); + out[static_cast(ca) * ncomp + cb] += mv; + out[static_cast(cb) * ncomp + ca] += mv; + } + } + } + } + + // Convert min_sums to similarity (or distance) + { + double* out_ptr = &out[0]; + const double* totals_ptr = csr.totals.data(); + RcppThread::parallelFor(0, ncomp, [=](int cb) { + for (int ca = 0; ca < ncomp; ++ca) { + const std::size_t pos = static_cast(cb) * ncomp + ca; + if (ca == cb) { + out_ptr[pos] = distance ? 0.0 : 1.0; + } else { + const double ms = out_ptr[pos]; + const double denom = totals_ptr[ca] + totals_ptr[cb] - ms; + double sim = (denom > 0.0) ? ms / denom : 0.0; + out_ptr[pos] = distance ? (1.0 - sim) : sim; + } + } + }, nT); + } + + return out; + } +} diff --git a/tests/testthat/test-weighted-jaccard.R b/tests/testthat/test-weighted-jaccard.R new file mode 100644 index 0000000..06875ca --- /dev/null +++ b/tests/testthat/test-weighted-jaccard.R @@ -0,0 +1,135 @@ +test_that("c_weighted_jaccard_dense computes correct similarity", { + m <- Matrix::sparseMatrix( + i = c(1L, 2L, 1L, 2L, 3L, 3L), + j = c(1L, 1L, 2L, 2L, 2L, 3L), + x = c(4, 2, 1, 3, 3, 1), + dims = c(3L, 3L) + ) + + # Naive reference: sim[a,b] = sum(pmin(col_a, col_b)) / sum(pmax(col_a, col_b)) + dm <- as.matrix(m) + n <- ncol(dm) + ref <- matrix(0, n, n) + for (a in seq_len(n)) { + for (b in seq_len(n)) { + mins <- sum(pmin(dm[, a], dm[, b])) + maxs <- sum(pmax(dm[, a], dm[, b])) + ref[a, b] <- if (maxs > 0) mins / maxs else 0 + } + } + diag(ref) <- 1 + + sim <- c_weighted_jaccard_dense(m, transpose = FALSE) + expect_equal(sim, ref, tolerance = 1e-12) +}) + +test_that("c_weighted_jaccard_dense transpose works", { + m <- Matrix::sparseMatrix( + i = c(1L, 2L, 1L, 2L, 3L, 3L), + j = c(1L, 1L, 2L, 2L, 2L, 3L), + x = c(4, 2, 1, 3, 3, 1), + dims = c(3L, 3L) + ) + + dm <- as.matrix(m) + nr <- nrow(dm) + ref_t <- matrix(0, nr, nr) + for (a in seq_len(nr)) { + for (b in seq_len(nr)) { + mins <- sum(pmin(dm[a, ], dm[b, ])) + maxs <- sum(pmax(dm[a, ], dm[b, ])) + ref_t[a, b] <- if (maxs > 0) mins / maxs else 0 + } + } + diag(ref_t) <- 1 + + sim_t <- c_weighted_jaccard_dense(m, transpose = TRUE) + expect_equal(sim_t, ref_t, tolerance = 1e-12) +}) + +test_that("c_weighted_jaccard_sparse matches dense", { + m <- Matrix::sparseMatrix( + i = c(1L, 2L, 1L, 2L, 3L, 3L), + j = c(1L, 1L, 2L, 2L, 2L, 3L), + x = c(4, 2, 1, 3, 3, 1), + dims = c(3L, 3L) + ) + + sim_dense <- c_weighted_jaccard_dense(m, transpose = FALSE) + sim_sparse <- c_weighted_jaccard_sparse(m, transpose = FALSE, display_progress = FALSE) + expect_equal(as.matrix(sim_sparse), sim_dense, tolerance = 1e-12) + + sim_dense_t <- c_weighted_jaccard_dense(m, transpose = TRUE) + sim_sparse_t <- c_weighted_jaccard_sparse(m, transpose = TRUE, display_progress = FALSE) + expect_equal(as.matrix(sim_sparse_t), sim_dense_t, tolerance = 1e-12) +}) + +test_that("c_weighted_jaccard_dense triangle returns dist object", { + m <- Matrix::sparseMatrix( + i = c(1L, 2L, 1L, 2L, 3L, 3L), + j = c(1L, 1L, 2L, 2L, 2L, 3L), + x = c(4, 2, 1, 3, 3, 1), + dims = c(3L, 3L) + ) + + full <- c_weighted_jaccard_dense(m) + + # triangle=TRUE returns a dist object + tri <- c_weighted_jaccard_dense(m, triangle = TRUE) + expect_s3_class(tri, "dist") + expect_equal(attr(tri, "Size"), ncol(m)) + + # triangle similarity matches as.dist of full similarity + # (as.dist extracts lower triangle; dist stores 1-sim by convention, + # but here we just compare the raw values) + ref_lower <- full[lower.tri(full)] + expect_equal(as.numeric(tri), ref_lower, tolerance = 1e-12) + + # triangle + distance matches as.dist(1 - full) + tri_d <- c_weighted_jaccard_dense(m, triangle = TRUE, distance = TRUE) + expect_s3_class(tri_d, "dist") + expect_equal(as.numeric(tri_d), 1 - ref_lower, tolerance = 1e-12) +}) + +test_that("c_weighted_jaccard_dense distance works for full matrix", { + m <- Matrix::sparseMatrix( + i = c(1L, 2L, 1L, 2L, 3L, 3L), + j = c(1L, 1L, 2L, 2L, 2L, 3L), + x = c(4, 2, 1, 3, 3, 1), + dims = c(3L, 3L) + ) + + full <- c_weighted_jaccard_dense(m) + full_d <- c_weighted_jaccard_dense(m, distance = TRUE) + expect_equal(full_d, 1 - full, tolerance = 1e-12) +}) + +test_that("c_weighted_jaccard_sparse triangle and distance args", { + m <- Matrix::sparseMatrix( + i = c(1L, 2L, 1L, 2L, 3L, 3L), + j = c(1L, 1L, 2L, 2L, 2L, 3L), + x = c(4, 2, 1, 3, 3, 1), + dims = c(3L, 3L) + ) + + # Default: dgCMatrix + sp <- c_weighted_jaccard_sparse(m, display_progress = FALSE) + expect_true(inherits(sp, "dgCMatrix")) + + # triangle=TRUE: dsCMatrix + sp_tri <- c_weighted_jaccard_sparse(m, display_progress = FALSE, triangle = TRUE) + expect_true(inherits(sp_tri, "dsCMatrix")) + expect_equal(as.matrix(sp_tri), as.matrix(sp), tolerance = 1e-12) + + # distance=TRUE warns + expect_warning( + sp_d <- c_weighted_jaccard_sparse(m, display_progress = FALSE, distance = TRUE), + "distance=TRUE" + ) + # Stored entries are correct (1-sim); structural zeros remain 0 instead of 1 + # — this is the inherent limitation the warning describes + full <- c_weighted_jaccard_dense(m) + ref_d <- 1 - full + ref_d[full == 0 & row(full) != col(full)] <- 0 # zero out unstored positions + expect_equal(as.matrix(sp_d), ref_d, tolerance = 1e-12) +})