diff --git a/datasketches/src/theta/jaccard_similarity.rs b/datasketches/src/theta/jaccard_similarity.rs new file mode 100644 index 0000000..e80c8a1 --- /dev/null +++ b/datasketches/src/theta/jaccard_similarity.rs @@ -0,0 +1,298 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Jaccard similarity for Theta sketches. +//! +//! The Jaccard similarity index is `J(A, B) = |A intersection B| / |A union B|`. +//! It measures how similar two sketches are: `1.0` means they are considered equal, +//! `0.0` means they are disjoint, and `0.95` means the overlap is 95% of the union. + +use crate::error::Error; +use crate::hash::DEFAULT_UPDATE_SEED; +use crate::theta::CompactThetaSketch; +use crate::theta::ThetaIntersection; +use crate::theta::ThetaSketchView; +use crate::theta::union::ThetaUnion; + +const NUM_STD_DEVS: f64 = 2.0; + +/// Jaccard similarity result for two Theta sketches. +/// +/// The entries are lower bound, estimate, and upper bound, matching the C++ +/// `theta_jaccard_similarity::jaccard` result order. The bounds use a 95.4% +/// confidence interval, equivalent to +/- 2 standard deviations. +#[derive(Clone, Copy, Debug, PartialEq)] +pub struct JaccardSimilarity { + /// Approximate lower bound for the Jaccard index. + pub lower_bound: f64, + /// Estimate of the Jaccard index. + pub estimate: f64, + /// Approximate upper bound for the Jaccard index. + pub upper_bound: f64, +} + +impl JaccardSimilarity { + fn exact(value: f64) -> Self { + Self { + lower_bound: value, + estimate: value, + upper_bound: value, + } + } +} + +/// Computes Jaccard similarity between Theta sketches. +pub struct ThetaJaccardSimilarity; + +impl ThetaJaccardSimilarity { + /// Computes the Jaccard similarity index with the default update seed. + /// + /// The returned value contains lower bound, estimate, and upper bound. For very large + /// sketches, where the configured nominal entries are `2^25` or `2^26`, this method may + /// produce unstable results. + pub fn jaccard( + sketch_a: &A, + sketch_b: &B, + ) -> Result { + Self::jaccard_with_seed(sketch_a, sketch_b, DEFAULT_UPDATE_SEED) + } + + /// Computes the Jaccard similarity index with an explicit update seed. + /// + /// The returned value contains lower bound, estimate, and upper bound. For very large + /// sketches, where the configured nominal entries are `2^25` or `2^26`, this method may + /// produce unstable results. + /// + /// Returns an error if a non-empty sketch was built with a different seed. + pub fn jaccard_with_seed( + sketch_a: &A, + sketch_b: &B, + seed: u64, + ) -> Result { + if sketch_a.is_empty() && sketch_b.is_empty() { + return Ok(JaccardSimilarity::exact(1.0)); + } + if sketch_a.is_empty() || sketch_b.is_empty() { + return Ok(JaccardSimilarity::exact(0.0)); + } + + let union = ThetaUnion::compute(sketch_a, sketch_b, seed)?; + if identical_sets(sketch_a, sketch_b, &union) { + return Ok(JaccardSimilarity::exact(1.0)); + } + + let mut intersection = ThetaIntersection::new(seed); + intersection.update(sketch_a)?; + intersection.update(sketch_b)?; + // Ensure the numerator sketch is a subset of the denominator sketch used by + // the ratio bounds calculation. + intersection.update(&union)?; + let intersection = intersection.result_with_ordered(false); + + ratio_bounds(&union, &intersection) + } +} + +fn identical_sets( + sketch_a: &A, + sketch_b: &B, + union: &CompactThetaSketch, +) -> bool { + union.num_retained() == sketch_a.num_retained() + && union.num_retained() == sketch_b.num_retained() + && union.theta64() == sketch_a.theta64() + && union.theta64() == sketch_b.theta64() +} + +fn ratio_bounds( + sketch_a: &CompactThetaSketch, + sketch_b: &CompactThetaSketch, +) -> Result { + let theta_a = sketch_a.theta64(); + let theta_b = sketch_b.theta64(); + if theta_b > theta_a { + return Err(Error::invalid_argument(format!( + "theta_a must be <= theta_b: theta_a={theta_a}, theta_b={theta_b}" + ))); + } + + let count_b = sketch_b.num_retained() as u64; + let count_a = if theta_a == theta_b { + sketch_a.num_retained() as u64 + } else { + sketch_a.iter().filter(|&hash| hash < theta_b).count() as u64 + }; + + if count_a == 0 { + return Ok(JaccardSimilarity { + lower_bound: 0.0, + estimate: 0.5, + upper_bound: 1.0, + }); + } + + let f = sketch_b.theta(); + Ok(JaccardSimilarity { + lower_bound: lower_bound_for_b_over_a(count_a, count_b, f)?, + estimate: count_b as f64 / count_a as f64, + upper_bound: upper_bound_for_b_over_a(count_a, count_b, f)?, + }) +} + +fn lower_bound_for_b_over_a(a: u64, b: u64, f: f64) -> Result { + check_ratio_inputs(a, b, f)?; + if a == 0 { + return Ok(0.0); + } + if f == 1.0 { + return Ok(b as f64 / a as f64); + } + Ok(approximate_lower_bound_on_p( + a, + b, + NUM_STD_DEVS * hacky_adjuster(f), + )) +} + +fn upper_bound_for_b_over_a(a: u64, b: u64, f: f64) -> Result { + check_ratio_inputs(a, b, f)?; + if a == 0 { + return Ok(1.0); + } + if f == 1.0 { + return Ok(b as f64 / a as f64); + } + Ok(approximate_upper_bound_on_p( + a, + b, + NUM_STD_DEVS * hacky_adjuster(f), + )) +} + +fn check_ratio_inputs(a: u64, b: u64, f: f64) -> Result<(), Error> { + if a < b { + return Err(Error::invalid_argument(format!( + "a must be >= b: a = {a}, b = {b}" + ))); + } + if !(0.0..=1.0).contains(&f) || f == 0.0 { + return Err(Error::invalid_argument(format!( + "f must be in the range (0.0, 1.0], got {f}" + ))); + } + Ok(()) +} + +fn hacky_adjuster(f: f64) -> f64 { + let tmp = (1.0 - f).sqrt(); + if f <= 0.5 { + tmp + } else { + tmp + (0.01 * (f - 0.5)) + } +} + +fn approximate_lower_bound_on_p(n: u64, k: u64, num_std_devs: f64) -> f64 { + if n == 0 || k == 0 { + 0.0 + } else if k == 1 { + exact_lower_bound_on_p_k_eq_1(n, delta_of_num_stdevs(num_std_devs)) + } else if k == n { + exact_lower_bound_on_p_k_eq_n(n, delta_of_num_stdevs(num_std_devs)) + } else { + let x = abramowitz_stegun_formula_26p5p22((n - k) as f64 + 1.0, k as f64, -num_std_devs); + 1.0 - x + } +} + +fn approximate_upper_bound_on_p(n: u64, k: u64, num_std_devs: f64) -> f64 { + if n == 0 || k == n { + 1.0 + } else if k == n - 1 { + exact_upper_bound_on_p_k_eq_minusone(n, delta_of_num_stdevs(num_std_devs)) + } else if k == 0 { + exact_upper_bound_on_p_k_eq_zero(n, delta_of_num_stdevs(num_std_devs)) + } else { + let x = abramowitz_stegun_formula_26p5p22((n - k) as f64, k as f64 + 1.0, num_std_devs); + 1.0 - x + } +} + +fn delta_of_num_stdevs(kappa: f64) -> f64 { + normal_cdf(-kappa) +} + +fn normal_cdf(x: f64) -> f64 { + 0.5 * (1.0 + erf(x / 2.0_f64.sqrt())) +} + +fn erf(x: f64) -> f64 { + if x < 0.0 { + -erf_of_nonneg(-x) + } else { + erf_of_nonneg(x) + } +} + +fn erf_of_nonneg(x: f64) -> f64 { + let a1 = 0.0705230784; + let a2 = 0.0422820123; + let a3 = 0.0092705272; + let a4 = 0.0001520143; + let a5 = 0.0002765672; + let a6 = 0.0000430638; + let x2 = x * x; + let x3 = x2 * x; + let x4 = x2 * x2; + let x5 = x2 * x3; + let x6 = x3 * x3; + let sum = 1.0 + (a1 * x) + (a2 * x2) + (a3 * x3) + (a4 * x4) + (a5 * x5) + (a6 * x6); + let sum2 = sum * sum; + let sum4 = sum2 * sum2; + let sum8 = sum4 * sum4; + let sum16 = sum8 * sum8; + 1.0 - (1.0 / sum16) +} + +fn abramowitz_stegun_formula_26p5p22(a: f64, b: f64, yp: f64) -> f64 { + let b2m1 = (2.0 * b) - 1.0; + let a2m1 = (2.0 * a) - 1.0; + let lambda = ((yp * yp) - 3.0) / 6.0; + let htmp = (1.0 / a2m1) + (1.0 / b2m1); + let h = 2.0 / htmp; + let term1 = (yp * (h + lambda).sqrt()) / h; + let term2 = (1.0 / b2m1) - (1.0 / a2m1); + let term3 = (lambda + (5.0 / 6.0)) - (2.0 / (3.0 * h)); + let w = term1 - (term2 * term3); + a / (a + (b * (2.0 * w).exp())) +} + +fn exact_upper_bound_on_p_k_eq_zero(n: u64, delta: f64) -> f64 { + 1.0 - delta.powf(1.0 / n as f64) +} + +fn exact_lower_bound_on_p_k_eq_n(n: u64, delta: f64) -> f64 { + delta.powf(1.0 / n as f64) +} + +fn exact_lower_bound_on_p_k_eq_1(n: u64, delta: f64) -> f64 { + 1.0 - (1.0 - delta).powf(1.0 / n as f64) +} + +fn exact_upper_bound_on_p_k_eq_minusone(n: u64, delta: f64) -> f64 { + (1.0 - delta).powf(1.0 / n as f64) +} diff --git a/datasketches/src/theta/mod.rs b/datasketches/src/theta/mod.rs index 03b5e2a..4bd466e 100644 --- a/datasketches/src/theta/mod.rs +++ b/datasketches/src/theta/mod.rs @@ -42,10 +42,14 @@ mod bit_pack; mod hash_table; mod intersection; +mod jaccard_similarity; mod serialization; mod sketch; +mod union; pub use self::intersection::ThetaIntersection; +pub use self::jaccard_similarity::JaccardSimilarity; +pub use self::jaccard_similarity::ThetaJaccardSimilarity; pub use self::sketch::CompactThetaSketch; pub use self::sketch::ThetaSketch; pub use self::sketch::ThetaSketchBuilder; diff --git a/datasketches/src/theta/union.rs b/datasketches/src/theta/union.rs new file mode 100644 index 0000000..cbf5ad1 --- /dev/null +++ b/datasketches/src/theta/union.rs @@ -0,0 +1,113 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::collections::BTreeSet; + +use crate::error::Error; +use crate::hash::compute_seed_hash; +use crate::theta::CompactThetaSketch; +use crate::theta::ThetaSketchView; + +pub(super) struct ThetaUnion; + +impl ThetaUnion { + pub(super) fn compute( + sketch_a: &A, + sketch_b: &B, + seed: u64, + ) -> Result { + let seed_hash = compute_seed_hash(seed); + validate_seed_hash(sketch_a, seed_hash, "sketch A")?; + validate_seed_hash(sketch_b, seed_hash, "sketch B")?; + + let theta = sketch_a.theta64().min(sketch_b.theta64()); + let mut entries = BTreeSet::new(); + entries.extend(sketch_a.iter().filter(|&hash| hash < theta)); + entries.extend(sketch_b.iter().filter(|&hash| hash < theta)); + + Ok(CompactThetaSketch::from_parts( + entries.into_iter().collect(), + theta, + seed_hash, + false, + sketch_a.is_empty() && sketch_b.is_empty(), + )) + } +} + +fn validate_seed_hash( + sketch: &S, + expected_seed_hash: u16, + label: &str, +) -> Result<(), Error> { + if !sketch.is_empty() && sketch.seed_hash() != expected_seed_hash { + return Err(Error::invalid_argument(format!( + "incompatible seed hash for {label}: expected {}, got {}", + expected_seed_hash, + sketch.seed_hash() + ))); + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use crate::hash::DEFAULT_UPDATE_SEED; + use crate::theta::ThetaSketch; + use crate::theta::union::ThetaUnion; + + fn sketch_with_range(start: u64, count: u64) -> ThetaSketch { + let mut sketch = ThetaSketch::builder().build(); + for value in start..start + count { + sketch.update(value); + } + sketch + } + + #[test] + fn exact_mode_half_overlap() { + let sketch_a = sketch_with_range(0, 1000); + let sketch_b = sketch_with_range(500, 1000); + + let union = ThetaUnion::compute(&sketch_a, &sketch_b, DEFAULT_UPDATE_SEED).unwrap(); + + assert!(!union.is_empty()); + assert!(!union.is_estimation_mode()); + assert_eq!(union.estimate(), 1500.0); + } + + #[test] + fn empty_inputs_produce_empty_union() { + let sketch_a = ThetaSketch::builder().build(); + let sketch_b = ThetaSketch::builder().build(); + + let union = ThetaUnion::compute(&sketch_a, &sketch_b, DEFAULT_UPDATE_SEED).unwrap(); + + assert!(union.is_empty()); + assert!(!union.is_estimation_mode()); + assert_eq!(union.num_retained(), 0); + } + + #[test] + fn seed_mismatch_on_non_empty_sketch_returns_error() { + let mut sketch_a = ThetaSketch::builder().seed(123).build(); + sketch_a.update(1u64); + let sketch_b = ThetaSketch::builder().build(); + + assert!(ThetaUnion::compute(&sketch_a, &sketch_b, DEFAULT_UPDATE_SEED).is_err()); + } +} diff --git a/datasketches/tests/theta_jaccard_similarity_test.rs b/datasketches/tests/theta_jaccard_similarity_test.rs new file mode 100644 index 0000000..aa808d1 --- /dev/null +++ b/datasketches/tests/theta_jaccard_similarity_test.rs @@ -0,0 +1,147 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#![cfg(feature = "theta")] + +use datasketches::theta::ThetaJaccardSimilarity; +use datasketches::theta::ThetaSketch; + +fn assert_jaccard_exact(actual: datasketches::theta::JaccardSimilarity, expected: f64) { + assert_eq!(actual.lower_bound, expected); + assert_eq!(actual.estimate, expected); + assert_eq!(actual.upper_bound, expected); +} + +fn assert_close(actual: f64, expected: f64, margin: f64) { + assert!( + (actual - expected).abs() <= margin, + "actual={actual}, expected={expected}, margin={margin}" + ); +} + +fn sketch_with_range(start: u64, count: u64) -> ThetaSketch { + let mut sketch = ThetaSketch::builder().build(); + for value in start..start + count { + sketch.update(value); + } + sketch +} + +fn sketch_with_range_and_seed(start: u64, count: u64, seed: u64) -> ThetaSketch { + let mut sketch = ThetaSketch::builder().seed(seed).build(); + for value in start..start + count { + sketch.update(value); + } + sketch +} + +#[test] +fn test_empty() { + let sketch_a = ThetaSketch::builder().build(); + let sketch_b = ThetaSketch::builder().build(); + + let jaccard = ThetaJaccardSimilarity::jaccard(&sketch_a, &sketch_b).unwrap(); + + assert_jaccard_exact(jaccard, 1.0); +} + +#[test] +fn test_same_sketch_exact_mode() { + let sketch = sketch_with_range(0, 1000); + + let jaccard = ThetaJaccardSimilarity::jaccard(&sketch, &sketch).unwrap(); + assert_jaccard_exact(jaccard, 1.0); + + let jaccard = + ThetaJaccardSimilarity::jaccard(&sketch.compact(true), &sketch.compact(true)).unwrap(); + assert_jaccard_exact(jaccard, 1.0); +} + +#[test] +fn test_full_overlap_exact_mode() { + let sketch_a = sketch_with_range(0, 1000); + let sketch_b = sketch_with_range(0, 1000); + + let jaccard = ThetaJaccardSimilarity::jaccard(&sketch_a, &sketch_b).unwrap(); + assert_jaccard_exact(jaccard, 1.0); + + let jaccard = + ThetaJaccardSimilarity::jaccard(&sketch_a.compact(true), &sketch_b.compact(true)).unwrap(); + assert_jaccard_exact(jaccard, 1.0); +} + +#[test] +fn test_disjoint_exact_mode() { + let sketch_a = sketch_with_range(0, 1000); + let sketch_b = sketch_with_range(1000, 1000); + + let jaccard = ThetaJaccardSimilarity::jaccard(&sketch_a, &sketch_b).unwrap(); + assert_jaccard_exact(jaccard, 0.0); + + let jaccard = + ThetaJaccardSimilarity::jaccard(&sketch_a.compact(true), &sketch_b.compact(true)).unwrap(); + assert_jaccard_exact(jaccard, 0.0); +} + +#[test] +fn test_half_overlap_estimation_mode() { + let sketch_a = sketch_with_range(0, 10000); + let sketch_b = sketch_with_range(5000, 10000); + + let jaccard = ThetaJaccardSimilarity::jaccard(&sketch_a, &sketch_b).unwrap(); + assert_close(jaccard.lower_bound, 0.33, 0.01); + assert_close(jaccard.estimate, 0.33, 0.01); + assert_close(jaccard.upper_bound, 0.33, 0.01); + + let jaccard = + ThetaJaccardSimilarity::jaccard(&sketch_a.compact(true), &sketch_b.compact(true)).unwrap(); + assert_close(jaccard.lower_bound, 0.33, 0.01); + assert_close(jaccard.estimate, 0.33, 0.01); + assert_close(jaccard.upper_bound, 0.33, 0.01); +} + +#[test] +fn test_half_overlap_estimation_mode_custom_seed() { + let seed = 123; + let sketch_a = sketch_with_range_and_seed(0, 10000, seed); + let sketch_b = sketch_with_range_and_seed(5000, 10000, seed); + + let jaccard = ThetaJaccardSimilarity::jaccard_with_seed(&sketch_a, &sketch_b, seed).unwrap(); + assert_close(jaccard.lower_bound, 0.33, 0.01); + assert_close(jaccard.estimate, 0.33, 0.01); + assert_close(jaccard.upper_bound, 0.33, 0.01); + + let jaccard = ThetaJaccardSimilarity::jaccard_with_seed( + &sketch_a.compact(true), + &sketch_b.compact(true), + seed, + ) + .unwrap(); + assert_close(jaccard.lower_bound, 0.33, 0.01); + assert_close(jaccard.estimate, 0.33, 0.01); + assert_close(jaccard.upper_bound, 0.33, 0.01); +} + +#[test] +fn test_seed_mismatch() { + let mut sketch_a = ThetaSketch::builder().build(); + sketch_a.update(1u64); + let mut sketch_b = ThetaSketch::builder().seed(123).build(); + sketch_b.update(1u64); + + assert!(ThetaJaccardSimilarity::jaccard(&sketch_a, &sketch_b).is_err()); +} diff --git a/typos.toml b/typos.toml index c9b22da..8d4c3e4 100644 --- a/typos.toml +++ b/typos.toml @@ -16,8 +16,8 @@ # under the License. [default.extend-words] -# False-Positive Abbreviations "PREINTS" = "PREINTS" +"htmp" = "htmp" [files] extend-exclude = []