Compare commits

..

1 Commits

Author SHA1 Message Date
Fred Navruzov
6229a8fe7d docs/vmanomaly: release v1.29.6 (#11132)
Update vmanomaly docs (/anomaly-detection) for patch release v1.29.6

PR https://github.com/VictoriaMetrics/VictoriaMetrics/pull/11132
2026-06-17 20:09:17 +03:00
42 changed files with 21 additions and 4508 deletions

View File

@@ -1,49 +0,0 @@
package main
import (
"bytes"
"flag"
"io"
"sync"
"time"
"github.com/VictoriaMetrics/metrics"
)
// Self-monitoring counters, cache state and command-line flags used when
// serving cardinality metrics.
var (
// cardinalityMetricsWrites is incremented once per writeCardinalityMetrics call.
cardinalityMetricsWrites = metrics.NewCounter(`vmestimator_write_cardinality_metrics_total`)
// cardinalityMetricsWriteDuration accumulates the wall-clock seconds spent in writeCardinalityMetrics.
cardinalityMetricsWriteDuration = metrics.NewFloatCounter(`vmestimator_write_cardinality_metrics_duration_seconds_total`)
// cardinalityMetricsWriteBytes accumulates the size of the payloads passed to the writer.
cardinalityMetricsWriteBytes = metrics.NewCounter(`vmestimator_write_cardinality_metrics_size_bytes_total`)
// cardinalityCacheMu is held by writeCardinalityMetrics while it reads or rebuilds the cache fields below.
cardinalityCacheMu sync.Mutex
// cardinalityMetricsCacheAt is the time of the last cache rebuild; the zero value means "never built".
cardinalityMetricsCacheAt time.Time
// cardinalityMetricsCache holds the most recently rendered cardinality metrics.
cardinalityMetricsCache []byte
// cardinalityMetricsCacheTTL is how long a rendered response is reused; writeCardinalityMetrics re-renders on every call when it is 0.
cardinalityMetricsCacheTTL = flag.Duration("cardinalityMetrics.cacheTTL", time.Second*30, "Duration for caching cardinality metrics response")
// cardinalityMetricsExposeAt is the HTTP path at which cardinality metrics are exposed (see the flag description).
cardinalityMetricsExposeAt = flag.String(`cardinalityMetrics.exposeAt`, `/metrics`, "HTTP path for exposing cardinality metrics. "+
"If set to the default /metrics, cardinality metrics are merged with regular metrics and exposed together. "+
"If set to a different path, only cardinality metrics are exposed at that endpoint. "+
"If set to an empty value, cardinality metrics are not exposed via HTTP at all.")
)
// writeCardinalityMetrics writes the cardinality metrics of the given
// estimators to w.
//
// The rendered payload is cached: it is rebuilt only when the cache is older
// than -cardinalityMetrics.cacheTTL (or when the TTL is 0). Rendering happens
// under cardinalityCacheMu, while the write to w is done on a private copy
// after the lock is released. The error returned by w.Write is discarded.
func writeCardinalityMetrics(w io.Writer, es []*estimator) {
	begin := time.Now()

	cardinalityCacheMu.Lock()
	ttl := *cardinalityMetricsCacheTTL
	if time.Since(cardinalityMetricsCacheAt) >= ttl || ttl == 0 {
		// Render into the backing array of the previous payload.
		rendered := bytes.NewBuffer(cardinalityMetricsCache[:0])
		for i := range es {
			es[i].writeMetrics(rendered)
		}
		cardinalityMetricsCache = rendered.Bytes()
		cardinalityMetricsCacheAt = time.Now()
	}
	// The cache may be overwritten by another caller once the lock is
	// released, so hand a copy to the writer.
	payload := make([]byte, len(cardinalityMetricsCache))
	copy(payload, cardinalityMetricsCache)
	cardinalityCacheMu.Unlock()

	_, _ = w.Write(payload)

	cardinalityMetricsWrites.Inc()
	cardinalityMetricsWriteDuration.Add(time.Since(begin).Seconds())
	cardinalityMetricsWriteBytes.Add(len(payload))
}

View File

@@ -1,43 +0,0 @@
package main
import (
"fmt"
"os"
"sort"
"time"
"gopkg.in/yaml.v2"
)
// Config is the top-level structure of the YAML config file read by loadConfig.
type Config struct {
// Streams holds one estimator configuration per entry of the `streams` list.
Streams []EstimatorConfig `yaml:"streams"`
}
// EstimatorConfig describes a single cardinality estimator (one entry of
// Config.Streams). Zero values are replaced with defaults by newEstimator.
type EstimatorConfig struct {
// GroupBy lists the label names used to split series into groups.
// An empty list yields a single global estimate. loadConfig sorts it.
GroupBy []string `yaml:"group_by"`
// GroupLimit caps the number of tracked groups; values <= 0 default to 10000.
GroupLimit int `yaml:"group_limit"`
// Labels are extra labels added to every emitted cardinality_estimate metric.
Labels map[string]string `yaml:"labels"`
// Interval is emitted as the `interval` label and drives rotation, which runs
// every Interval/2. Defaults to 5 minutes when zero.
Interval time.Duration `yaml:"interval"`
// Buckets is the number of independently locked shards; values <= 0 default
// to min(64, 2*available CPUs).
Buckets int `yaml:"buckets"`
// HLLPrecision is the precision passed to hyperloglog.NewSketch.
// 0 selects the default of 14; loadConfig rejects other values outside [4, 18].
HLLPrecision uint8 `yaml:"hll_precision"`
// HLLSparse is the sparse flag passed to hyperloglog.NewSketch; nil defaults to true.
HLLSparse *bool `yaml:"hll_sparse"`
}
// loadConfig reads the YAML file at path, parses it strictly (unknown fields
// are an error) and validates every stream.
//
// For each stream the group_by list is sorted in place. A stream is rejected
// when:
//   - hll_precision is set and lies outside [4, 18];
//   - interval is set but shorter than 2ns: rotation runs every interval/2 and
//     time.NewTicker panics on a non-positive period.
//
// Validation errors name the offending stream index and the config file.
func loadConfig(path string) (*Config, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return nil, fmt.Errorf("cannot read config file %q: %w", path, err)
	}
	var cfg Config
	if err := yaml.UnmarshalStrict(data, &cfg); err != nil {
		return nil, fmt.Errorf("cannot parse config file %q: %w", path, err)
	}
	for i := range cfg.Streams {
		stream := &cfg.Streams[i]
		sort.Strings(stream.GroupBy)
		if p := stream.HLLPrecision; p != 0 && (p < 4 || p > 18) {
			return nil, fmt.Errorf("invalid hll_precision %d in stream #%d of config file %q: must be in range [4, 18]", p, i, path)
		}
		// Zero means "use the default"; anything else must leave a positive
		// rotation period of interval/2.
		if d := stream.Interval; d != 0 && d/2 <= 0 {
			return nil, fmt.Errorf("invalid interval %s in stream #%d of config file %q: must be at least 2ns", d, i, path)
		}
	}
	return &cfg, nil
}

View File

@@ -1,508 +0,0 @@
package main
import (
"fmt"
"io"
"sort"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/cgroup"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/metrics"
"github.com/axiomhq/hyperloglog"
"github.com/dgryski/go-metro"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmestimator/protoparser"
)
// estimator tracks series cardinality for one configured stream, either
// globally or per group of label values, sharded across buckets.
type estimator struct {
// groupBy holds the label names used for grouping; empty means a single global estimate.
groupBy []string
// groupByKeysLabel is groupBy joined with commas, or "__global__" when groupBy is empty.
groupByKeysLabel string
// groupLimit is the maximum number of groups tracked across all buckets.
groupLimit int64
// groupSize is the group counter shared with every bucket.
groupSize atomic.Int64
// groupRejectedMu guards both rejected-group sketches below.
groupRejectedMu sync.Mutex
// groupRejectedSketch estimates how many distinct groups were rejected because of groupLimit.
groupRejectedSketch *hyperloglog.Sketch
// groupRejectedSketchPrev is the rejected-group sketch of the previous rotation period.
groupRejectedSketchPrev *hyperloglog.Sketch
// buckets are the shards; a series is routed by fingerprint (no grouping) or by group key hash.
buckets []*estimatorBucket
// metricsSet holds the estimator's self-monitoring metrics.
metricsSet *metrics.Set
// insertTotal counts the series accepted for processing by insertMany.
insertTotal *metrics.Counter
// stopCh is closed by stop() to terminate the rotation goroutine.
stopCh chan struct{}
}
// newEstimator builds an estimator from cfg, registers its self-monitoring
// metrics and starts the background rotation goroutine. Call stop() to
// terminate it.
//
// Zero-valued settings are replaced with defaults: Interval=5m,
// GroupLimit=10000, Buckets=min(64, 2*available CPUs), HLLPrecision=14,
// HLLSparse=true.
//
// An error is returned, before any goroutine or metric is created, when:
//   - Interval is shorter than 2ns (including negative values): rotation runs
//     every Interval/2 and time.NewTicker panics on a non-positive period;
//   - HLLPrecision lies outside [4, 18], the same range loadConfig enforces;
//     otherwise the sketch constructor would panic later.
func newEstimator(cfg EstimatorConfig) (*estimator, error) {
	if cfg.Interval == 0 {
		cfg.Interval = time.Minute * 5
	}
	if cfg.Interval/2 <= 0 {
		return nil, fmt.Errorf("invalid interval %s: must be at least 2ns", cfg.Interval)
	}
	if cfg.GroupLimit <= 0 {
		cfg.GroupLimit = 10000
	}
	if cfg.Buckets <= 0 {
		cfg.Buckets = min(64, 2*cgroup.AvailableCPUs())
	}
	if cfg.HLLPrecision == 0 {
		cfg.HLLPrecision = 14
	}
	if cfg.HLLPrecision < 4 || cfg.HLLPrecision > 18 {
		return nil, fmt.Errorf("invalid precision %d: must be in range [4, 18]", cfg.HLLPrecision)
	}
	if cfg.HLLSparse == nil {
		cfg.HLLSparse = new(true)
	}

	// Common prefix of every emitted metric line: the metric name, the
	// interval label and the extra labels in sorted key order. The closing
	// brace is appended by the writers.
	metricPrefix := fmt.Sprintf("cardinality_estimate{interval=%q", cfg.Interval)
	if len(cfg.Labels) > 0 {
		keys := make([]string, 0, len(cfg.Labels))
		for k := range cfg.Labels {
			keys = append(keys, k)
		}
		sort.Strings(keys)
		for _, k := range keys {
			metricPrefix += fmt.Sprintf(",%s=%q", k, cfg.Labels[k])
		}
	}

	groupByKeysLabel := "__global__"
	if len(cfg.GroupBy) > 0 {
		groupByKeysLabel = strings.Join(cfg.GroupBy, `,`)
	}

	e := &estimator{
		groupBy:                 cfg.GroupBy,
		groupByKeysLabel:        groupByKeysLabel,
		groupLimit:              int64(cfg.GroupLimit),
		groupRejectedSketch:     mustNewGroupRejectSketch(),
		groupRejectedSketchPrev: mustNewGroupRejectSketch(),
		buckets:                 make([]*estimatorBucket, cfg.Buckets),
		metricsSet:              metrics.NewSet(),
		stopCh:                  make(chan struct{}),
	}
	e.insertTotal = e.metricsSet.NewCounter(
		fmt.Sprintf(`vmestimator_estimator_insert_total{group_by_keys=%q}`, e.groupByKeysLabel),
	)
	e.metricsSet.NewGauge(fmt.Sprintf(`vmestimator_estimator_group_rejected_size{group_by_keys=%q}`, e.groupByKeysLabel), func() float64 {
		e.groupRejectedMu.Lock()
		defer e.groupRejectedMu.Unlock()
		return float64(e.groupRejectedSketch.Estimate())
	})

	for i := 0; i < len(e.buckets); i++ {
		// NOTE(review): each bucket captures the current groupRejectedSketch
		// pointer once; code that reassigns e.groupRejectedSketch must keep
		// the buckets' copies in sync.
		eb := &estimatorBucket{
			groupBy:             cfg.GroupBy,
			extraLabels:         cfg.Labels,
			interval:            cfg.Interval,
			metricPrefix:        metricPrefix,
			groupByKeysLabel:    groupByKeysLabel,
			groupLimit:          int64(cfg.GroupLimit),
			groupSize:           &e.groupSize,
			groupRejectedMu:     &e.groupRejectedMu,
			groupRejectedSketch: e.groupRejectedSketch,
			precision:           cfg.HLLPrecision,
			sparse:              *cfg.HLLSparse,
		}
		if len(cfg.GroupBy) == 0 {
			// Global mode: one sketch per bucket, no group maps.
			eb.sketch = eb.newSketch()
		} else {
			eb.groups = make(map[string]groupSketch)
			eb.prevGroups = make(map[string]groupSketch)
			// groupSize and groupLimit are shared by all buckets, so these
			// per-bucket gauges report the same estimator-wide values.
			e.metricsSet.NewGauge(fmt.Sprintf(`vmestimator_estimator_group_size{group_by_keys=%q,bucket="%d"}`, eb.groupByKeysLabel, i), func() float64 {
				return float64(eb.groupSize.Load())
			})
			e.metricsSet.NewGauge(fmt.Sprintf(`vmestimator_estimator_group_limit{group_by_keys=%q,bucket="%d"}`, eb.groupByKeysLabel, i), func() float64 {
				return float64(eb.groupLimit)
			})
		}
		e.buckets[i] = eb
	}

	go e.runRotation(cfg.Interval)
	metrics.RegisterSet(e.metricsSet)
	return e, nil
}
// stop terminates the background rotation goroutine and removes the
// estimator's self-monitoring metrics from the global registry.
//
// It must be called at most once: closing stopCh twice panics.
func (e *estimator) stop() {
	close(e.stopCh)
	// newEstimator added the set to the global registry via
	// metrics.RegisterSet. Unregister the set itself, not only its metrics,
	// so stopped estimators do not pile up in the registry. destroySet=true
	// also unregisters every metric in the set.
	metrics.UnregisterSet(e.metricsSet, true)
}
// groupValuesPool recycles the scratch buffers used for building group keys.
var groupValuesPool = sync.Pool{}

// getGroupValuesKeySlice returns a scratch buffer from groupValuesPool, or a
// new empty buffer with 128 bytes of capacity when the pool has none.
//
// The returned buffer always has length 0. Return it with
// putGroupValuesSlice when done.
func getGroupValuesKeySlice() *[]byte {
	if v := groupValuesPool.Get(); v != nil {
		return v.(*[]byte)
	}
	// Length 0, capacity 128: callers append to the buffer, so it must not
	// start out holding 128 zero bytes.
	buf := make([]byte, 0, 128)
	return &buf
}
// putGroupValuesSlice truncates the buffer behind key to length zero (keeping
// its capacity) and returns it to groupValuesPool. A nil key is ignored.
func putGroupValuesSlice(key *[]byte) {
	if key != nil {
		buf := *key
		*key = buf[:0]
		groupValuesPool.Put(key)
	}
}
// insertMany feeds the given time series into the estimator.
//
// Without group_by every series is routed to a bucket by its fingerprint.
// With group_by a key is built from the values of the configured labels,
// joined by commas in groupBy order (a missing label contributes an empty
// value), and the series is routed by the hash of that key. Series carrying
// none of the group_by labels are skipped. insertTotal is increased by the
// number of series handed to a bucket.
func (e *estimator) insertMany(tss []protoparser.TimeSerie) {
	numBuckets := uint64(len(e.buckets))

	keyBufP := getGroupValuesKeySlice()
	keyBuf := *keyBufP
	defer func() {
		// Store the possibly re-grown buffer back before returning it to the pool.
		*keyBufP = keyBuf
		putGroupValuesSlice(keyBufP)
	}()

	values := make([]string, len(e.groupBy))
	inserted := 0
	for _, ts := range tss {
		if len(e.groupBy) == 0 {
			bucketIdx := int(ts.Fingerprint % numBuckets)
			e.buckets[bucketIdx].insert(ts, "", nil)
			inserted++
			continue
		}

		keyBuf = keyBuf[:0]
		clear(values)
		matched := false
		for pos, name := range e.groupBy {
			if pos > 0 {
				keyBuf = append(keyBuf, ',')
			}
			// Only the first label with a matching name is used.
			for j := range ts.GroupLabels {
				label := &ts.GroupLabels[j]
				if label.Name != name {
					continue
				}
				matched = true
				keyBuf = append(keyBuf, label.Value...)
				values[pos] = label.Value
				break
			}
		}
		if !matched {
			// The series carries none of the group_by labels.
			continue
		}

		bucketIdx := int(hash(keyBuf) % numBuckets)
		// The key string aliases keyBuf, which is reused on the next iteration.
		e.buckets[bucketIdx].insert(ts, bytesutil.ToUnsafeString(keyBuf), values)
		inserted++
	}
	e.insertTotal.Add(inserted)
}
// reset drops all collected state: the shared group counter, every bucket's
// current and previous data, and both rejected-group sketches.
//
// The previous rejected-group sketch is cleared too, because writeMetrics
// merges it into the reported group count; leaving it in place would let
// rejected groups survive a reset.
func (e *estimator) reset() {
	e.groupSize.Store(0)
	for _, b := range e.buckets {
		b.reset()
	}
	e.groupRejectedMu.Lock()
	e.groupRejectedSketch.Reset()
	e.groupRejectedSketchPrev.Reset()
	e.groupRejectedMu.Unlock()
}
// writeMetrics renders the estimator's cardinality metrics to w in text
// exposition format.
//
// Without group_by a single line with group_by_keys="__global__" is written,
// estimated from the current and previous sketches of all buckets merged
// together. With group_by one line per group is written by the buckets,
// followed by a group_by_keys="__group__" line carrying the number of groups.
// Once that number reaches 80% of the group limit, the estimated number of
// rejected groups (current and previous period) is added to it.
//
// Write errors are logged and do not abort rendering.
func (e *estimator) writeMetrics(w io.Writer) {
	first := e.buckets[0]
	merged := first.newSketch()

	if len(e.groupBy) == 0 {
		for _, eb := range e.buckets {
			eb.writeNoGroupMetric(merged)
		}
		line := make([]byte, 0, 1024)
		line = append(line, first.metricPrefix...)
		line = append(line, `,group_by_keys="__global__"} `...)
		line = strconv.AppendUint(line, merged.Estimate(), 10)
		line = append(line, '\n')
		if _, err := w.Write(line); err != nil {
			logger.Errorf("writing metrics failed: %s; written cardinality metrics might be incomplete or invalid", err)
		}
		return
	}

	// Shared line prefix up to and including `group_by_values=`; each bucket
	// appends the per-group remainder and hands the buffer back.
	buf := make([]byte, 0, 16384)
	buf = append(buf, first.metricPrefix...)
	buf = append(buf, `,group_by_keys="`...)
	buf = append(buf, first.groupByKeysLabel...)
	buf = append(buf, `",group_by_values=`...)
	prefixLen := len(buf)
	for _, eb := range e.buckets {
		buf = eb.writeGroupMetrics(w, merged, buf[:prefixLen])
	}

	total := e.groupSize.Load()
	threshold := int64(float64(e.groupLimit) * 0.8)
	if total >= threshold {
		rejected := mustNewGroupRejectSketch()
		e.groupRejectedMu.Lock()
		if err := rejected.Merge(e.groupRejectedSketch); err != nil {
			logger.Fatalf("BUG: groupRejectedSketch merge failed: %s", err)
		}
		if err := rejected.Merge(e.groupRejectedSketchPrev); err != nil {
			logger.Fatalf("BUG: groupRejectedSketchPrev merge failed: %s", err)
		}
		e.groupRejectedMu.Unlock()
		total += int64(rejected.Estimate())
	}

	buf = append(buf[:0], first.metricPrefix...)
	buf = append(buf, `,group_by_keys="__group__",group_by_values="`...)
	buf = append(buf, first.groupByKeysLabel...)
	buf = append(buf, `"} `...)
	buf = strconv.AppendInt(buf, total, 10)
	buf = append(buf, '\n')
	if _, err := w.Write(buf); err != nil {
		logger.Errorf("writing metrics failed: %s; written cardinality metrics might be incomplete or invalid", err)
	}
}
// runRotation calls rotate every interval/2 until stopCh is closed.
// It blocks, so it is meant to run in its own goroutine.
func (e *estimator) runRotation(interval time.Duration) {
	ticker := time.NewTicker(interval / 2)
	defer ticker.Stop()

	for {
		select {
		case <-e.stopCh:
			return
		case <-ticker.C:
			e.rotate()
		}
	}
}
// rotate starts a new estimation period: in every bucket the current data
// becomes the "previous" data and a fresh current generation is started.
//
// The shared group counter is zeroed first; each bucket then adds back the
// number of groups it carried over into its previous generation.
//
// The rejected-group sketches are rotated by copying instead of swapping
// pointers. Buckets captured the groupRejectedSketch pointer at construction
// time and keep inserting into that object, so it must remain the current
// sketch. Swapping would make buckets write into the "previous" sketch after
// every odd rotation, where the data is wiped half an interval early and is
// invisible to the group_rejected_size gauge.
func (e *estimator) rotate() {
	e.groupSize.Store(0)

	var wg sync.WaitGroup
	for i := range e.buckets {
		wg.Go(e.buckets[i].rotate)
	}
	wg.Wait()

	e.groupRejectedMu.Lock()
	defer e.groupRejectedMu.Unlock()
	// previous := copy of current; current := empty.
	e.groupRejectedSketchPrev.Reset()
	if err := e.groupRejectedSketchPrev.Merge(e.groupRejectedSketch); err != nil {
		logger.Fatalf("BUG: groupRejectedSketch merge failed: %s", err)
	}
	e.groupRejectedSketch.Reset()
}
// estimatorBucket is one shard of an estimator. It holds either a pair of
// global sketches (no group_by) or per-group sketches (group_by configured).
type estimatorBucket struct {
// mu is held by the bucket's methods while they access the sketches and group maps below.
mu sync.Mutex
// groupBy holds the label names used for grouping; empty selects global mode.
groupBy []string
// groupLimit is the estimator-wide cap on the number of groups.
groupLimit int64
// extraLabels are the configured extra labels; only used by String().
extraLabels map[string]string
// interval is the configured estimation interval; only used by String().
interval time.Duration
// metricPrefix is the beginning of every emitted metric line, without the closing brace.
metricPrefix string
// groupByKeysLabel is groupBy joined with commas, or "__global__".
groupByKeysLabel string
// precision and sparse are the arguments for creating new HLL sketches.
precision uint8
sparse bool
// sketch and prevSketch are the current and previous sketches in global mode.
// prevSketch is nil until the first rotate() call.
sketch *hyperloglog.Sketch
prevSketch *hyperloglog.Sketch
// groupSize points to the owning estimator's shared group counter.
groupSize *atomic.Int64
// groups and prevGroups map a group key to its sketch for the current and previous period.
groups map[string]groupSketch
prevGroups map[string]groupSketch
// groupRejectedMu and groupRejectedSketch are shared with the owning estimator
// and record the groups rejected because of groupLimit.
groupRejectedMu *sync.Mutex
groupRejectedSketch *hyperloglog.Sketch
}
// String returns a human-readable summary of the bucket's configuration:
// interval, group_by labels and extra labels.
func (eb *estimatorBucket) String() string {
	const format = "interval: %s; group_by: %v; extra_labels: %v"
	return fmt.Sprintf(format, eb.interval, eb.groupBy, eb.extraLabels)
}
// reset drops everything the bucket has collected, for both the current and
// the previous period.
func (eb *estimatorBucket) reset() {
	eb.mu.Lock()
	defer eb.mu.Unlock()

	if len(eb.groupBy) == 0 {
		// prevSketch is only assigned by rotate(), so it is still nil when
		// reset is called before the first rotation.
		if eb.prevSketch != nil {
			eb.prevSketch.Reset()
		}
		eb.sketch.Reset()
		return
	}
	eb.groups = make(map[string]groupSketch)
	eb.prevGroups = make(map[string]groupSketch)
}
// rotate makes the current data the "previous" data and starts a fresh
// current generation.
//
// In group mode the number of carried-over groups is added to the shared
// group counter, which the estimator zeroes before rotating its buckets.
// That number is captured while mu is still held, so the prevGroups field is
// never read without the lock that guards it.
func (eb *estimatorBucket) rotate() {
	eb.mu.Lock()
	if len(eb.groupBy) == 0 {
		eb.prevSketch = eb.sketch
		eb.sketch = eb.newSketch()
		eb.mu.Unlock()
		return
	}
	eb.prevGroups = eb.groups
	carried := len(eb.prevGroups)
	// Pre-size the new map for as many groups as the last period had.
	eb.groups = make(map[string]groupSketch, carried)
	eb.mu.Unlock()

	eb.groupSize.Add(int64(carried))
}
// insert records ts in this bucket.
//
// In global mode (no group_by) the fingerprint is inserted into the bucket's
// current sketch. In group mode it is inserted into the sketch of the group
// identified by groupValuesKey, which is created on first use in the current
// period. groupValues holds one value per group_by label, in groupBy order,
// and is only used for rendering the labels of a newly created group.
//
// A group present in neither the current nor the previous period counts
// against the shared group limit. When the limit would be exceeded the series
// is dropped and only the hash of its group key is recorded in the
// rejected-group sketch.
func (eb *estimatorBucket) insert(ts protoparser.TimeSerie, groupValuesKey string, groupValues []string) {
eb.mu.Lock()
defer eb.mu.Unlock()
if len(eb.groupBy) == 0 {
eb.sketch.InsertHash(ts.Fingerprint)
return
}
gsk, ok := eb.groups[groupValuesKey]
if !ok {
// Groups carried over from the previous period were already re-counted in
// groupSize by rotate(), so only brand-new groups are checked against the limit.
if _, ok := eb.prevGroups[groupValuesKey]; !ok {
// The load and the later Add are two separate operations on the counter
// shared by all buckets.
groupSize := eb.groupSize.Load()
if groupSize+1 > eb.groupLimit {
eb.groupRejectedMu.Lock()
eb.groupRejectedSketch.InsertHash(hash([]byte(groupValuesKey)))
eb.groupRejectedMu.Unlock()
return
}
eb.groupSize.Add(1)
}
// Pre-render the label part of the metric line for this group:
// "<key>",by_<label>="<value>",...} — the quoted key completes the
// `group_by_values=` prefix written by the caller of writeGroupMetrics.
formatBuf := make([]byte, 0, 1024)
formatBuf = strconv.AppendQuote(formatBuf, groupValuesKey)
for i := range groupValues {
formatBuf = append(formatBuf, ',')
if eb.groupBy[i] == `__name__` {
formatBuf = append(formatBuf, `by__name__`...)
} else {
formatBuf = append(formatBuf, `by_`...)
formatBuf = append(formatBuf, eb.groupBy[i]...)
}
formatBuf = append(formatBuf, '=')
formatBuf = strconv.AppendQuote(formatBuf, groupValues[i])
}
formatBuf = append(formatBuf, `} `...)
gsk = groupSketch{
// NOTE(review): presumably ToUnsafeString aliases formatBuf without copying,
// which would keep the whole 1024-byte allocation alive per group — confirm.
groupValueLabels: bytesutil.ToUnsafeString(formatBuf),
Sketch: eb.newSketch(),
}
// insertMany passes a key that aliases a reused scratch buffer, so the map
// key must be an independent copy.
eb.groups[strings.Clone(groupValuesKey)] = gsk
}
gsk.InsertHash(ts.Fingerprint)
}
// writeNoGroupMetric merges the bucket's current and previous global sketches
// into res while holding the bucket lock. Despite its name it writes nothing;
// the caller renders the metric line from res.
func (eb *estimatorBucket) writeNoGroupMetric(res *hyperloglog.Sketch) {
	eb.mu.Lock()
	defer eb.mu.Unlock()

	current, previous := eb.sketch, eb.prevSketch
	eb.mergeSketches(current, previous, res)
}
// writeGroupMetrics writes one metric line per group known to the bucket.
//
// formatBuf must contain the shared line prefix; the per-group labels and the
// estimate are appended to it for every line. res is scratch space: it is
// reset and refilled for each group. Groups of the current period are
// estimated from their current and previous sketches merged together; groups
// that exist only in the previous period are estimated from the previous
// sketch alone.
//
// Write errors are logged and do not stop the iteration. The (possibly
// re-grown) buffer truncated back to the prefix is returned for reuse.
func (eb *estimatorBucket) writeGroupMetrics(w io.Writer, res *hyperloglog.Sketch, formatBuf []byte) []byte {
	eb.mu.Lock()
	defer eb.mu.Unlock()

	prefixLen := len(formatBuf)
	emit := func(labels string, cur, prev *hyperloglog.Sketch) {
		res.Reset()
		formatBuf = append(formatBuf[:prefixLen], labels...)
		eb.mergeSketches(cur, prev, res)
		formatBuf = strconv.AppendUint(formatBuf, res.Estimate(), 10)
		formatBuf = append(formatBuf, '\n')
		if _, err := w.Write(formatBuf); err != nil {
			logger.Errorf("writing metrics failed: %s; written cardinality metrics might be incomplete or invalid", err)
		}
	}

	for key, cur := range eb.groups {
		// A missing previous entry yields a zero groupSketch with a nil Sketch.
		emit(cur.groupValueLabels, cur.Sketch, eb.prevGroups[key].Sketch)
	}
	for key, prev := range eb.prevGroups {
		if _, active := eb.groups[key]; active {
			// Already written by the loop above.
			continue
		}
		emit(prev.groupValueLabels, nil, prev.Sketch)
	}
	return formatBuf[:prefixLen]
}
// mergeSketches merges cur and, when it is non-nil, prev into res.
// It panics if a merge fails.
func (eb *estimatorBucket) mergeSketches(cur, prev, res *hyperloglog.Sketch) {
	err := res.Merge(cur)
	if err == nil && prev != nil {
		err = res.Merge(prev)
	}
	if err != nil {
		panic(err)
	}
}
// newSketch creates an empty HLL sketch with the bucket's configured
// precision and sparse setting.
func (eb *estimatorBucket) newSketch() *hyperloglog.Sketch {
	precision, sparse := eb.precision, eb.sparse
	return mustNewSketch(precision, sparse)
}
// groupSketch is the per-group state kept by an estimatorBucket.
type groupSketch struct {
// groupValueLabels is the pre-rendered label part of the group's metric line,
// built once in insert when the group is created.
groupValueLabels string
// Sketch estimates the number of distinct series fingerprints seen for the group.
*hyperloglog.Sketch
}
// mustNewGroupRejectSketch creates the sketch used for estimating the number
// of rejected groups: precision 10, sparse representation.
func mustNewGroupRejectSketch() *hyperloglog.Sketch {
	const (
		precision = 10
		sparse    = true
	)
	return mustNewSketch(precision, sparse)
}
// mustNewSketch creates an HLL sketch with the given precision and sparse
// setting. It panics when the sketch cannot be created.
func mustNewSketch(precision uint8, sparse bool) *hyperloglog.Sketch {
	sk, err := hyperloglog.NewSketch(precision, sparse)
	if err == nil {
		return sk
	}
	panic(fmt.Sprintf("cannot create HLL sketch with precision=%d and sparse=%v: %s", precision, sparse, err))
}
// hash returns the 64-bit metro hash of v computed with a fixed seed.
func hash(v []byte) uint64 {
	const seed = 1337
	return metro.Hash64(v, seed)
}

View File

@@ -1,274 +0,0 @@
package main
import (
"fmt"
"io"
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmestimator/protoparser"
)
func BenchmarkEstimator_WriteMetrics(b *testing.B) {
b.Run("NoGroup/NoPrev", func(b *testing.B) {
e, err := newEstimator(EstimatorConfig{Interval: time.Hour})
if err != nil {
b.Fatalf("newEstimator: %v", err)
}
defer e.stop()
insertSeriesIntoEstimator(e, 5_000, 0)
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
e.writeMetrics(io.Discard)
}
})
b.Run("NoGroup/WithPrev", func(b *testing.B) {
e, err := newEstimator(EstimatorConfig{Interval: time.Hour})
if err != nil {
b.Fatalf("newEstimator: %v", err)
}
defer e.stop()
insertSeriesIntoEstimator(e, 5_000, 0)
for _, eb := range e.buckets {
eb.rotate()
}
insertSeriesIntoEstimator(e, 5_000, 0)
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
e.writeMetrics(io.Discard)
}
})
b.Run("Group100/NoPrev", func(b *testing.B) {
e, err := newEstimator(EstimatorConfig{
GroupBy: []string{"groupLabel"},
Interval: time.Hour,
})
if err != nil {
b.Fatalf("newEstimator: %v", err)
}
defer e.stop()
insertSeriesIntoEstimator(e, 5_000, 100)
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
e.writeMetrics(io.Discard)
}
})
b.Run("Group100/WithPrev", func(b *testing.B) {
e, err := newEstimator(EstimatorConfig{
GroupBy: []string{"groupLabel"},
Interval: time.Hour,
})
if err != nil {
b.Fatalf("newEstimator: %v", err)
}
defer e.stop()
insertSeriesIntoEstimator(e, 5_000, 100)
for _, eb := range e.buckets {
eb.rotate()
}
insertSeriesIntoEstimator(e, 5_000, 100)
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
e.writeMetrics(io.Discard)
}
})
b.Run("Group10k/NoPrev", func(b *testing.B) {
e, err := newEstimator(EstimatorConfig{
GroupBy: []string{"groupLabel"},
Interval: time.Hour,
})
if err != nil {
b.Fatalf("newEstimator: %v", err)
}
defer e.stop()
insertSeriesIntoEstimator(e, 50_000, 10_000)
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
e.writeMetrics(io.Discard)
}
})
b.Run("Group10k/WithPrev", func(b *testing.B) {
e, err := newEstimator(EstimatorConfig{
GroupBy: []string{"groupLabel"},
Interval: time.Hour,
})
if err != nil {
b.Fatalf("newEstimator: %v", err)
}
defer e.stop()
insertSeriesIntoEstimator(e, 50_000, 10_000)
for _, eb := range e.buckets {
eb.rotate()
}
insertSeriesIntoEstimator(e, 50_000, 10_000)
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
e.writeMetrics(io.Discard)
}
})
}
func BenchmarkEstimator_InsertManyParallel(b *testing.B) {
b.Run("NoGroup", func(b *testing.B) {
e, err := newEstimator(EstimatorConfig{Interval: time.Hour})
if err != nil {
b.Fatalf("newEstimator: %v", err)
}
defer e.stop()
b.ResetTimer()
b.ReportAllocs()
b.RunParallel(func(pb *testing.PB) {
var i uint64
for pb.Next() {
e.insertMany([]protoparser.TimeSerie{{Fingerprint: i}})
i++
}
})
})
b.Run("Group100", func(b *testing.B) {
e, err := newEstimator(EstimatorConfig{
GroupBy: []string{"groupLabel"},
Interval: time.Hour,
})
if err != nil {
b.Fatalf("newEstimator: %v", err)
}
defer e.stop()
b.ResetTimer()
b.ReportAllocs()
b.RunParallel(func(pb *testing.PB) {
var i uint64
for pb.Next() {
e.insertMany([]protoparser.TimeSerie{{
GroupLabels: []protoparser.Label{{Name: "groupLabel", Value: fmt.Sprintf("%d", i%100)}},
Fingerprint: i,
}})
i++
}
})
})
b.Run("Group10k", func(b *testing.B) {
e, err := newEstimator(EstimatorConfig{
GroupBy: []string{"groupLabel"},
Interval: time.Hour,
})
if err != nil {
b.Fatalf("newEstimator: %v", err)
}
defer e.stop()
b.ResetTimer()
b.ReportAllocs()
b.RunParallel(func(pb *testing.PB) {
var i uint64
for pb.Next() {
e.insertMany([]protoparser.TimeSerie{{
GroupLabels: []protoparser.Label{{Name: "groupLabel", Value: fmt.Sprintf("%d", i%10_000)}},
Fingerprint: i,
}})
i++
}
})
})
b.Run("Group100k", func(b *testing.B) {
e, err := newEstimator(EstimatorConfig{
GroupBy: []string{"groupLabel"},
Interval: time.Hour,
})
if err != nil {
b.Fatalf("newEstimator: %v", err)
}
defer e.stop()
b.ResetTimer()
b.ReportAllocs()
b.RunParallel(func(pb *testing.PB) {
var i uint64
for pb.Next() {
e.insertMany([]protoparser.TimeSerie{{
GroupLabels: []protoparser.Label{{Name: "groupLabel", Value: fmt.Sprintf("%d", i%100_000)}},
Fingerprint: i,
}})
i++
}
})
})
}
// BenchmarkEstimator_InsertRotateCycle benchmarks repeated insert→rotate
// cycles for the global (no-group) estimator in two HLL regimes:
//   - SparseHLL: 1 000 series per cycle (sketch stays in sparse mode)
//   - NormalHLL: 30 000 series per cycle (sketch converts to dense mode)
func BenchmarkEstimator_InsertRotateCycle(b *testing.B) {
	regimes := []struct {
		name      string
		numSeries int
	}{
		{name: "SparseHLL", numSeries: 1_000},
		{name: "NormalHLL", numSeries: 30_000},
	}

	for _, regime := range regimes {
		b.Run(regime.name, func(b *testing.B) {
			e, err := newEstimator(EstimatorConfig{Interval: time.Hour})
			if err != nil {
				b.Fatalf("newEstimator: %v", err)
			}
			defer e.stop()

			b.ResetTimer()
			b.ReportAllocs()
			for i := 0; i < b.N; i++ {
				insertSeriesIntoEstimator(e, regime.numSeries, 0)
				e.rotate()
			}
		})
	}
}
// insertSeriesIntoEstimator feeds numSeries time series into e, one
// insertMany call per series. The fingerprint of series i is the hash of
// "foobarbaz<i>". When groupsNum > 0 every series also carries a "groupLabel"
// label whose value cycles through groupsNum distinct values.
func insertSeriesIntoEstimator(e *estimator, numSeries, groupsNum int) {
	for i := 0; i < numSeries; i++ {
		ts := protoparser.TimeSerie{
			Fingerprint: hash(fmt.Appendf(nil, "foobarbaz%d", i)),
		}
		if groupsNum > 0 {
			ts.GroupLabels = []protoparser.Label{{
				Name:  "groupLabel",
				Value: fmt.Sprintf("%d", i%groupsNum),
			}}
		}
		e.insertMany([]protoparser.TimeSerie{ts})
	}
}

View File

@@ -1,595 +0,0 @@
package main
import (
"bytes"
"encoding/binary"
"fmt"
"sort"
"strings"
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmestimator/protoparser"
)
// TestGlobalEstimate checks the single "__global__" estimate emitted by an
// estimator without group_by, for several cardinalities and insert/rotate
// sequences. The expected values are the exact numbers produced for the
// generated fingerprints, so larger cardinalities deviate slightly from the
// true count.
func TestGlobalEstimate(t *testing.T) {
// genCard returns a generator that inserts `cardinality` series whose
// fingerprints are derived from the series index and seed.
genCard := func(cardinality int, seed string) func(e *estimator) {
return func(e *estimator) {
var tss []protoparser.TimeSerie
fpBuf := make([]byte, 8, 8+len(seed))
for i := 0; i < cardinality; i++ {
binary.LittleEndian.PutUint64(fpBuf[:8], uint64(i))
// NOTE(review): fpBuf is never truncated back to 8 bytes, so with a non-empty
// seed it grows by len(seed) on every iteration and the hashed input includes
// all previously appended copies. The expected values below depend on the
// resulting fingerprints — confirm before changing this.
fpBuf = append(fpBuf, seed...)
tss = append(tss, protoparser.TimeSerie{
Fingerprint: hash(fpBuf[:]),
})
// Flush the batch whenever the index is a multiple of 10 (including 0).
if i%10 == 0 {
e.insertMany(tss)
tss = tss[:0]
}
}
if len(tss) > 0 {
e.insertMany(tss)
}
}
}
// f creates a 5-bucket estimator with a 10-minute interval, runs gen against
// it, verifies the buckets are in global mode and compares the rendered
// output with expMetric.
f := func(gen func(e *estimator), expMetric string) {
t.Helper()
cfg := EstimatorConfig{
Interval: time.Minute * 10,
Buckets: 5,
}
e, err := newEstimator(cfg)
if err != nil {
t.Fatalf("failed to create new estimator: %v", err)
}
defer e.stop()
gen(e)
if len(e.buckets) != cfg.Buckets {
t.Fatalf("expected buckets length to be %d but got %d", cfg.Buckets, len(e.buckets))
}
for i, eb := range e.buckets {
if len(eb.groupBy) > 0 {
t.Fatalf("expected bucket %d groupBy length to be 0 but got %d", i, len(eb.groupBy))
}
if eb.groups != nil {
t.Fatalf("expected bucket %d groups length to be 0 but got %d", i, len(eb.groups))
}
if eb.groupSize.Load() != 0 {
t.Fatalf("expected bucket %d groupSize to be 0 but got %d", i, eb.groupSize.Load())
}
}
buf := bytes.NewBuffer(nil)
e.writeMetrics(buf)
if strings.TrimSpace(buf.String()) != expMetric {
t.Fatalf("\nexpected:\n%s\n\ngot:\n%s", expMetric, buf.String())
}
}
// no rotation: everything is in the current generation
f(genCard(0, ""), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 0`)
f(genCard(1, ""), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 1`)
f(genCard(10, ""), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 10`)
f(genCard(100, ""), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 100`)
f(genCard(1000, ""), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 1000`)
f(genCard(5000, ""), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 4998`)
f(genCard(10000, ""), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 9920`)
f(genCard(100000, ""), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 99658`)
f(genCard(500000, ""), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 496552`)
// insert, then rotate once: the data lives only in the previous generation
genRotateOnce := func(cardinality int) func(e *estimator) {
return func(e *estimator) {
genCard(cardinality, "")(e)
e.rotate()
}
}
f(genRotateOnce(0), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 0`)
f(genRotateOnce(1), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 1`)
f(genRotateOnce(10), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 10`)
f(genRotateOnce(100), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 100`)
f(genRotateOnce(1000), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 1000`)
f(genRotateOnce(5000), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 4998`)
f(genRotateOnce(10000), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 9920`)
f(genRotateOnce(100000), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 99658`)
f(genRotateOnce(500000), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 496552`)
// insert, rotate, insert the same series again: both generations hold the
// same cardinality/2 series, so they must not be counted twice
genInsertRotateInsertSameOnce := func(cardinality int) func(e *estimator) {
return func(e *estimator) {
genCard(cardinality/2, "")(e)
e.rotate()
genCard(cardinality/2, "")(e)
}
}
f(genInsertRotateInsertSameOnce(0), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 0`)
f(genInsertRotateInsertSameOnce(1), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 0`)
f(genInsertRotateInsertSameOnce(10), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 5`)
f(genInsertRotateInsertSameOnce(100), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 50`)
f(genInsertRotateInsertSameOnce(1000), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 500`)
f(genInsertRotateInsertSameOnce(5000), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 2499`)
f(genInsertRotateInsertSameOnce(10000), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 4998`)
f(genInsertRotateInsertSameOnce(100000), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 49529`)
f(genInsertRotateInsertSameOnce(200000), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 99658`)
// insert, rotate, insert different series: the two generations hold disjoint
// halves (seeds "one" and "two"), so the estimate covers both
genInsertRotateInsertOnce := func(cardinality int) func(e *estimator) {
return func(e *estimator) {
genCard(cardinality/2, "one")(e)
e.rotate()
genCard(cardinality/2, "two")(e)
}
}
f(genInsertRotateInsertOnce(0), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 0`)
f(genInsertRotateInsertOnce(1), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 0`)
f(genInsertRotateInsertOnce(10), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 10`)
f(genInsertRotateInsertOnce(100), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 100`)
f(genInsertRotateInsertOnce(1000), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 1000`)
f(genInsertRotateInsertOnce(5000), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 5000`)
f(genInsertRotateInsertOnce(10000), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 10058`)
f(genInsertRotateInsertOnce(100000), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 99543`)
f(genInsertRotateInsertOnce(200000), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 198814`)
// insert, then rotate twice: the data has aged out of both generations
genRotateTwoTimes := func(cardinality int) func(e *estimator) {
return func(e *estimator) {
genCard(cardinality, "")(e)
e.rotate()
e.rotate()
}
}
f(genRotateTwoTimes(0), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 0`)
f(genRotateTwoTimes(1), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 0`)
f(genRotateTwoTimes(10), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 0`)
f(genRotateTwoTimes(100), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 0`)
f(genRotateTwoTimes(1000), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 0`)
f(genRotateTwoTimes(5000), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 0`)
f(genRotateTwoTimes(10000), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 0`)
f(genRotateTwoTimes(100000), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 0`)
f(genRotateTwoTimes(500000), `cardinality_estimate{interval="10m0s",group_by_keys="__global__"} 0`)
}
func TestGroupEstimate(t *testing.T) {
genCard := func(fooCard, barCard, bazCard int, seed string) func(e *estimator) {
return func(e *estimator) {
var tss []protoparser.TimeSerie
for fooI := 0; fooI < max(1, fooCard); fooI++ {
for barI := 0; barI < max(1, barCard); barI++ {
for bazI := 0; bazI < max(1, bazCard); bazI++ {
ts := protoparser.TimeSerie{}
ts.GroupLabels = append(ts.GroupLabels, protoparser.Label{Name: "__name__", Value: "the_metric_name"})
if fooCard > 0 {
ts.GroupLabels = append(ts.GroupLabels, protoparser.Label{Name: "foo", Value: fmt.Sprintf("%s%d", seed, fooI)})
}
if barCard > 0 {
ts.GroupLabels = append(ts.GroupLabels, protoparser.Label{Name: "bar", Value: fmt.Sprintf("%s%d", seed, barI)})
}
if bazCard > 0 {
ts.GroupLabels = append(ts.GroupLabels, protoparser.Label{Name: "baz", Value: fmt.Sprintf("%s%d", seed, bazI)})
}
var fpBuf []byte
for _, l := range ts.GroupLabels {
fpBuf = append(fpBuf, l.Name...)
fpBuf = append(fpBuf, '=')
fpBuf = append(fpBuf, l.Value...)
fpBuf = append(fpBuf, ',')
}
fpBuf = append(fpBuf, seed...)
ts.Fingerprint = hash(fpBuf)
tss = append(tss, ts)
}
}
}
e.insertMany(tss)
}
}
f := func(groupBy []string, gen func(e *estimator), expMetrics string) {
t.Helper()
cfg := EstimatorConfig{
Interval: time.Minute * 10,
GroupBy: groupBy,
Buckets: 5,
}
e, err := newEstimator(cfg)
if err != nil {
t.Fatalf("failed to create new estimator: %v", err)
}
defer e.stop()
gen(e)
if len(e.buckets) != cfg.Buckets {
t.Fatalf("expected buckets length to be %d but got %d", cfg.Buckets, len(e.buckets))
}
for i, eb := range e.buckets {
if eb.sketch != nil {
t.Fatalf("expected bucket %d sketch to be nil", i)
}
if eb.prevSketch != nil {
t.Fatalf("expected bucket %d prevSketch to be nil", i)
}
}
buf := bytes.NewBuffer(nil)
e.writeMetrics(buf)
lines := strings.Split(strings.TrimSpace(buf.String()), "\n")
sort.Strings(lines)
actMetrics := "\n" + strings.Join(lines, "\n")
if expMetrics != actMetrics {
t.Fatalf("\nexpected:\n%s\n\ngot:\n%s", expMetrics, actMetrics)
}
}
// group by metric name
f([]string{"__name__"}, genCard(10, 10, 10, ""), `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="__name__"} 1
cardinality_estimate{interval="10m0s",group_by_keys="__name__",group_by_values="the_metric_name",by__name__="the_metric_name"} 1000`,
)
// time series does not contribute to a group
f([]string{"foo"}, genCard(0, 10, 10, ""), `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo"} 0`,
)
f([]string{"foo", "bar"}, genCard(0, 0, 10, ""), `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo,bar"} 0`,
)
// group by one label
f([]string{"foo"}, genCard(1, 1, 0, ""), `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo"} 1
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="0",by_foo="0"} 1`,
)
f([]string{"foo"}, genCard(1, 2, 0, ""), `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo"} 1
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="0",by_foo="0"} 2`,
)
f([]string{"foo"}, genCard(1, 10, 0, ""), `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo"} 1
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="0",by_foo="0"} 10`,
)
f([]string{"foo"}, genCard(1, 100, 0, ""), `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo"} 1
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="0",by_foo="0"} 100`,
)
f([]string{"foo"}, genCard(1, 1000, 0, ""), `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo"} 1
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="0",by_foo="0"} 1000`,
)
f([]string{"foo"}, genCard(1, 10000, 0, ""), `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo"} 1
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="0",by_foo="0"} 9957`,
)
f([]string{"foo"}, genCard(1, 50000, 0, ""), `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo"} 1
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="0",by_foo="0"} 50387`,
)
f([]string{"foo"}, genCard(1, 1, 1, ""), `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo"} 1
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="0",by_foo="0"} 1`,
)
f([]string{"foo"}, genCard(1, 2, 2, ""), `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo"} 1
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="0",by_foo="0"} 4`,
)
f([]string{"foo"}, genCard(1, 10, 10, ""), `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo"} 1
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="0",by_foo="0"} 100`,
)
f([]string{"foo"}, genCard(1, 100, 100, ""), `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo"} 1
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="0",by_foo="0"} 9954`,
)
f([]string{"foo"}, genCard(1, 1000, 1000, ""), `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo"} 1
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="0",by_foo="0"} 1013124`,
)
// group by one label, rotate
genCardRotate := func(fooCard, barCard, bazCard int, seed string) func(e *estimator) {
return func(e *estimator) {
genCard(fooCard, barCard, bazCard, seed)(e)
e.rotate()
}
}
f([]string{"foo"}, genCardRotate(1, 10, 10, ""), `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo"} 1
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="0",by_foo="0"} 100`,
)
f([]string{"foo"}, genCardRotate(1, 1000, 1000, ""), `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo"} 1
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="0",by_foo="0"} 1013124`,
)
// group by one label, rotate, insert same
genCardRotateInsertSame := func(barCard, bazCard int) func(e *estimator) {
return func(e *estimator) {
genCard(1, barCard, bazCard, "")(e)
e.rotate()
genCard(1, barCard, bazCard, "")(e)
}
}
f([]string{"foo"}, genCardRotateInsertSame(10, 10), `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo"} 1
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="0",by_foo="0"} 100`,
)
f([]string{"foo"}, genCardRotateInsertSame(1000, 1000), `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo"} 1
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="0",by_foo="0"} 1013124`,
)
// group by one label, rotate, insert diff
genCardRotateInsertDiff := func(barCard, bazCard int) func(e *estimator) {
return func(e *estimator) {
genCard(1, barCard, bazCard, "one")(e)
e.rotate()
genCard(1, barCard, bazCard, "two")(e)
}
}
f([]string{"foo"}, genCardRotateInsertDiff(10, 10), `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo"} 2
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="one0",by_foo="one0"} 100
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="two0",by_foo="two0"} 100`,
)
f([]string{"foo"}, genCardRotateInsertDiff(1000, 1000), `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo"} 2
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="one0",by_foo="one0"} 995153
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="two0",by_foo="two0"} 992158`,
)
// group by one label, rotate twice
genCardRotateTwice := func(barCard, bazCard int) func(e *estimator) {
return func(e *estimator) {
genCard(1, barCard, bazCard, "one")(e)
e.rotate()
e.rotate()
}
}
f([]string{"foo"}, genCardRotateTwice(10, 10), `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo"} 0`,
)
f([]string{"foo"}, genCardRotateTwice(1000, 1000), `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo"} 0`,
)
// group by two labels
f([]string{"foo", "bar"}, genCard(1, 1, 1000, ""), `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo,bar"} 1
cardinality_estimate{interval="10m0s",group_by_keys="foo,bar",group_by_values="0,0",by_foo="0",by_bar="0"} 1000`,
)
f([]string{"foo", "bar"}, genCard(2, 1, 1000, ""), `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo,bar"} 2
cardinality_estimate{interval="10m0s",group_by_keys="foo,bar",group_by_values="0,0",by_foo="0",by_bar="0"} 1000
cardinality_estimate{interval="10m0s",group_by_keys="foo,bar",group_by_values="1,0",by_foo="1",by_bar="0"} 1000`,
)
f([]string{"foo", "bar"}, genCard(2, 2, 1000, ""), `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo,bar"} 4
cardinality_estimate{interval="10m0s",group_by_keys="foo,bar",group_by_values="0,0",by_foo="0",by_bar="0"} 1000
cardinality_estimate{interval="10m0s",group_by_keys="foo,bar",group_by_values="0,1",by_foo="0",by_bar="1"} 1000
cardinality_estimate{interval="10m0s",group_by_keys="foo,bar",group_by_values="1,0",by_foo="1",by_bar="0"} 1000
cardinality_estimate{interval="10m0s",group_by_keys="foo,bar",group_by_values="1,1",by_foo="1",by_bar="1"} 1000`,
)
// group by two labels, rotate
genCardTwoLabelsRotate := func() func(e *estimator) {
return func(e *estimator) {
genCard(2, 2, 1000, "")(e)
e.rotate()
}
}
f([]string{"foo", "bar"}, genCardTwoLabelsRotate(), `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo,bar"} 4
cardinality_estimate{interval="10m0s",group_by_keys="foo,bar",group_by_values="0,0",by_foo="0",by_bar="0"} 1000
cardinality_estimate{interval="10m0s",group_by_keys="foo,bar",group_by_values="0,1",by_foo="0",by_bar="1"} 1000
cardinality_estimate{interval="10m0s",group_by_keys="foo,bar",group_by_values="1,0",by_foo="1",by_bar="0"} 1000
cardinality_estimate{interval="10m0s",group_by_keys="foo,bar",group_by_values="1,1",by_foo="1",by_bar="1"} 1000`,
)
// group by two labels, rotate, insert same
genCardTwoLabelsRotateInsertSame := func() func(e *estimator) {
return func(e *estimator) {
genCard(2, 2, 1000, "")(e)
e.rotate()
genCard(2, 2, 1000, "")(e)
}
}
f([]string{"foo", "bar"}, genCardTwoLabelsRotateInsertSame(), `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo,bar"} 4
cardinality_estimate{interval="10m0s",group_by_keys="foo,bar",group_by_values="0,0",by_foo="0",by_bar="0"} 1000
cardinality_estimate{interval="10m0s",group_by_keys="foo,bar",group_by_values="0,1",by_foo="0",by_bar="1"} 1000
cardinality_estimate{interval="10m0s",group_by_keys="foo,bar",group_by_values="1,0",by_foo="1",by_bar="0"} 1000
cardinality_estimate{interval="10m0s",group_by_keys="foo,bar",group_by_values="1,1",by_foo="1",by_bar="1"} 1000`,
)
// group by two labels, rotate, insert diff
genCardTwoLabelsRotateInsertDiff := func() func(e *estimator) {
return func(e *estimator) {
genCard(2, 2, 1000, "one")(e)
e.rotate()
genCard(2, 2, 1000, "two")(e)
}
}
f([]string{"foo", "bar"}, genCardTwoLabelsRotateInsertDiff(), `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo,bar"} 8
cardinality_estimate{interval="10m0s",group_by_keys="foo,bar",group_by_values="one0,one0",by_foo="one0",by_bar="one0"} 1000
cardinality_estimate{interval="10m0s",group_by_keys="foo,bar",group_by_values="one0,one1",by_foo="one0",by_bar="one1"} 1000
cardinality_estimate{interval="10m0s",group_by_keys="foo,bar",group_by_values="one1,one0",by_foo="one1",by_bar="one0"} 1000
cardinality_estimate{interval="10m0s",group_by_keys="foo,bar",group_by_values="one1,one1",by_foo="one1",by_bar="one1"} 1000
cardinality_estimate{interval="10m0s",group_by_keys="foo,bar",group_by_values="two0,two0",by_foo="two0",by_bar="two0"} 1000
cardinality_estimate{interval="10m0s",group_by_keys="foo,bar",group_by_values="two0,two1",by_foo="two0",by_bar="two1"} 1000
cardinality_estimate{interval="10m0s",group_by_keys="foo,bar",group_by_values="two1,two0",by_foo="two1",by_bar="two0"} 1000
cardinality_estimate{interval="10m0s",group_by_keys="foo,bar",group_by_values="two1,two1",by_foo="two1",by_bar="two1"} 1000`,
)
// group by two labels, rotate twice
genCardTwoLabelsRotateTwice := func() func(e *estimator) {
return func(e *estimator) {
genCard(2, 2, 1000, "one")(e)
e.rotate()
e.rotate()
}
}
f([]string{"foo", "bar"}, genCardTwoLabelsRotateTwice(), `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo,bar"} 0`,
)
// quote values: label values with special characters must be properly escaped
genSpecialCard := func(fooVal string) func(e *estimator) {
return func(e *estimator) {
e.insertMany([]protoparser.TimeSerie{
{
GroupLabels: []protoparser.Label{{Name: "foo", Value: fooVal}},
Fingerprint: hash([]byte("foo=" + fooVal + ",")),
},
})
}
}
// double quote in value
f([]string{"foo"}, genSpecialCard(`a"b`), `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo"} 1
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="a\"b",by_foo="a\"b"} 1`,
)
f([]string{"foo"}, genSpecialCard(`a\b`), `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo"} 1
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="a\\b",by_foo="a\\b"} 1`,
)
f([]string{"foo"}, genSpecialCard("a\nb"), `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo"} 1
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="a\nb",by_foo="a\nb"} 1`,
)
f([]string{"foo"}, genSpecialCard("a\tb"), `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo"} 1
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="a\tb",by_foo="a\tb"} 1`,
)
}
// TestGroupEstimateGroupLimit checks how an estimator grouped by "foo" behaves when
// GroupLimit is set: which groups show up in the written metrics and what the
// rejected-groups sketch of the first bucket reports.
func TestGroupEstimateGroupLimit(t *testing.T) {
// makeTS builds a series with the single group label foo=fooVal and a fingerprint
// derived from the "foo=<fooVal>," string.
makeTS := func(fooVal string) protoparser.TimeSerie {
return protoparser.TimeSerie{
GroupLabels: []protoparser.Label{{Name: "foo", Value: fooVal}},
Fingerprint: hash([]byte("foo=" + fooVal + ",")),
}
}
// f creates an estimator with the given groupLimit, feeds it via gen and then verifies
// both the written metrics (expMetrics) and the rejected-groups estimate (expRejected).
f := func(groupLimit int, gen func(e *estimator), expRejected int, expMetrics string) {
t.Helper()
cfg := EstimatorConfig{
Interval: time.Minute * 10,
GroupBy: []string{"foo"},
GroupLimit: groupLimit,
Buckets: 3,
}
e, err := newEstimator(cfg)
if err != nil {
t.Fatalf("failed to create new estimator: %v", err)
}
defer e.stop()
gen(e)
buf := bytes.NewBuffer(nil)
e.writeMetrics(buf)
// Sort the written lines so the comparison doesn't depend on the output order.
lines := strings.Split(strings.TrimSpace(buf.String()), "\n")
sort.Strings(lines)
// expMetrics literals start with a newline, so prepend one to the actual output too.
actMetrics := "\n" + strings.Join(lines, "\n")
if expMetrics != actMetrics {
t.Fatalf("\nexpected:\n%s\n\ngot:\n%s", expMetrics, actMetrics)
}
// A nil rejected sketch is treated as zero rejected groups.
var actRejected int
if e.buckets[0].groupRejectedSketch != nil {
actRejected = int(e.buckets[0].groupRejectedSketch.Estimate())
}
if expRejected != actRejected {
t.Fatalf("rejected expected: %d; got: %d", expRejected, actRejected)
}
}
// all groups accepted
f(3, func(e *estimator) {
e.insertMany([]protoparser.TimeSerie{makeTS("a"), makeTS("b"), makeTS("c")})
}, 0, `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo"} 3
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="a",by_foo="a"} 1
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="b",by_foo="b"} 1
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="c",by_foo="c"} 1`,
)
// 2 groups only accepted
f(2, func(e *estimator) {
e.insertMany([]protoparser.TimeSerie{makeTS("a"), makeTS("b"), makeTS("c")})
}, 1, `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo"} 3
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="a",by_foo="a"} 1
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="b",by_foo="b"} 1`,
)
// one group only accepted
f(1, func(e *estimator) {
e.insertMany([]protoparser.TimeSerie{makeTS("a"), makeTS("b"), makeTS("c")})
}, 2, `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo"} 3
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="a",by_foo="a"} 1`,
)
// after rotate: groups in prevGroups bypass the limit; new groups are still checked
f(2, func(e *estimator) {
// fills limit
e.insertMany([]protoparser.TimeSerie{makeTS("a"), makeTS("b")})
e.rotate()
// "a" bypasses, "c" rejected
e.insertMany([]protoparser.TimeSerie{makeTS("a"), makeTS("c")})
}, 1, `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo"} 3
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="a",by_foo="a"} 1
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="b",by_foo="b"} 1`,
)
// after rotate: new group accepted when remaining capacity allows
f(3, func(e *estimator) {
// 2 groups, limit=3
e.insertMany([]protoparser.TimeSerie{makeTS("a"), makeTS("b")})
e.rotate()
// "a" bypasses, "c" accepted (2+1=3 <= 3)
e.insertMany([]protoparser.TimeSerie{makeTS("a"), makeTS("c")})
}, 0, `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo"} 3
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="a",by_foo="a"} 1
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="b",by_foo="b"} 1
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="c",by_foo="c"} 1`,
)
// reject 100
f(3, func(e *estimator) {
var tss []protoparser.TimeSerie
for i := 0; i < 103; i++ {
tss = append(tss, makeTS(fmt.Sprintf("a%d", i)))
}
e.insertMany(tss)
}, 100, `
cardinality_estimate{interval="10m0s",group_by_keys="__group__",group_by_values="foo"} 103
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="a0",by_foo="a0"} 1
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="a1",by_foo="a1"} 1
cardinality_estimate{interval="10m0s",group_by_keys="foo",group_by_values="a2",by_foo="a2"} 1`,
)
}

View File

@@ -1,123 +0,0 @@
package main
import (
"flag"
"io"
"net/http"
"os"
"strings"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmestimator/protoparser"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/pushmetrics"
"github.com/VictoriaMetrics/metrics"
)
var (
// httpListenAddrs holds the -httpListenAddr values; main falls back to ":8490" when none are set.
httpListenAddrs = flagutil.NewArrayString("httpListenAddr", "TCP address to listen for incoming HTTP requests")
// configPath is the -config flag pointing to the YAML file loaded by main via loadConfig.
configPath = flag.String("config", "config.yaml", "Path to YAML configuration file")
// prometheusWriteRequests is incremented by main's HTTP handler on every /api/v1/write request.
prometheusWriteRequests = metrics.NewCounter(`vmestimator_http_requests_total{path="/api/v1/write", protocol="promremotewrite"}`)
)
// main starts vmestimator: it loads the streams config, creates one estimator per
// stream, serves the remote-write ingestion, reset and (optionally) cardinality
// metrics endpoints over HTTP, and shuts everything down after a termination signal.
func main() {
	// Send flag usage/help output to stdout.
	flag.CommandLine.SetOutput(os.Stdout)
	envflag.Parse()
	buildinfo.Init()
	logger.Init()

	cfg, err := loadConfig(*configPath)
	if err != nil {
		logger.Fatalf("cannot load config: %v", err)
	}

	estimators := make([]*estimator, 0, len(cfg.Streams))
	for _, streamCfg := range cfg.Streams {
		est, err := newEstimator(streamCfg)
		if err != nil {
			logger.Fatalf("cannot create estimator: %v", err)
		}
		estimators = append(estimators, est)
	}

	// With the default path the cardinality metrics are written together with
	// the regular metrics via the registered metrics writer.
	if *cardinalityMetricsExposeAt == `/metrics` {
		metrics.RegisterMetricsWriter(func(w io.Writer) {
			writeCardinalityMetrics(w, estimators)
		})
	}

	// Build the de-duplicated union of group_by labels across all estimators;
	// the parser keeps only these labels on every parsed series.
	uniqueLabels := make(map[string]struct{})
	for _, est := range estimators {
		for _, label := range est.groupBy {
			uniqueLabels[label] = struct{}{}
		}
	}
	groupLabels := make([]string, 0, len(uniqueLabels))
	for label := range uniqueLabels {
		groupLabels = append(groupLabels, label)
	}

	listenAddrs := *httpListenAddrs
	if len(listenAddrs) == 0 {
		listenAddrs = []string{":8490"}
	}
	logger.Infof("starting vmestimator at %q", listenAddrs)
	startTime := time.Now()

	go httpserver.Serve(listenAddrs, func(w http.ResponseWriter, r *http.Request) bool {
		// A dedicated, non-default path exposes only the cardinality metrics.
		exposeAt := *cardinalityMetricsExposeAt
		if exposeAt != "" && exposeAt != "/metrics" && r.URL.Path == exposeAt {
			w.WriteHeader(http.StatusOK)
			writeCardinalityMetrics(w, estimators)
			return true
		}

		// Routes are served both with and without the /cardinality prefix.
		route := strings.TrimPrefix(r.URL.Path, `/cardinality`)
		if route == "/api/v1/write" {
			prometheusWriteRequests.Inc()
			feedEstimators := func(tss []protoparser.TimeSerie) {
				for _, est := range estimators {
					est.insertMany(tss)
				}
			}
			if err := protoparser.Parse(r.Body, groupLabels, feedEstimators); err != nil {
				httpserver.Errorf(w, r, "error parsing remote write request: %s", err)
				return true
			}
			w.WriteHeader(http.StatusNoContent)
			return true
		}
		if route == "/reset" {
			for _, est := range estimators {
				est.reset()
			}
			w.WriteHeader(http.StatusOK)
			return true
		}
		return false
	}, httpserver.ServeOptions{})
	logger.Infof("started vmestimator in %.3f seconds", time.Since(startTime).Seconds())

	pushmetrics.Init()
	sig := procutil.WaitForSigterm()
	logger.Infof("received signal %s", sig)
	pushmetrics.Stop()

	logger.Infof("gracefully shutting down webservice at %q", listenAddrs)
	if err := httpserver.Stop(listenAddrs); err != nil {
		logger.Errorf("cannot stop http server: %s", err)
	}
	// Stop the estimators only after the HTTP server has been asked to stop.
	for _, est := range estimators {
		est.stop()
	}
	logger.Infof("shutting down vmestimator")
}

View File

@@ -1,78 +0,0 @@
package protoparser
import (
"fmt"
"io"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding/snappy"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/protoparserutil"
"github.com/VictoriaMetrics/metrics"
)
// maxInsertRequestSize is the -maxInsertRequestSize flag. It is passed to
// ReadUncompressedData in Parse and bounds the decompressed request size in parseRequestBody.
var maxInsertRequestSize = flagutil.NewBytes("maxInsertRequestSize", 32*1024*1024, "The maximum size in bytes of a single Prometheus remote_write API request")
// Parse reads a Prometheus remote_write message from r and calls callback
// for the parsed timeseries.
//
// Only labels listed in groupLabels are kept on the parsed series.
// callback shouldn't hold tss after returning.
func Parse(r io.Reader, groupLabels []string, callback func(tss []TimeSerie)) error {
	startTime := fasttime.UnixTimestamp()
	readCalls.Inc()

	handleBody := func(data []byte) error {
		return parseRequestBody(data, groupLabels, callback)
	}
	if err := protoparserutil.ReadUncompressedData(r, "", maxInsertRequestSize, handleBody); err != nil {
		readErrors.Inc()
		elapsedSeconds := fasttime.UnixTimestamp() - startTime
		return fmt.Errorf("cannot read prometheus remote_write data from client in %d seconds: %w", elapsedSeconds, err)
	}
	return nil
}
// parseRequestBody decompresses a single remote_write request body (zstd when
// detected, snappy otherwise), unmarshals it and passes the parsed series to callback.
//
// It returns an error when decompression fails, when the unpacked body exceeds
// -maxInsertRequestSize, or when the protobuf message cannot be unmarshaled.
func parseRequestBody(data []byte, groupLabels []string, callback func(tss []TimeSerie)) error {
	// Synchronously process the request in order to properly return errors to Parse caller,
	// so it could properly return HTTP 503 status code in response.
	// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/896
	bb := bodyBufferPool.Get()
	defer bodyBufferPool.Put(bb)

	var err error
	switch {
	case encoding.IsZstd(data):
		bb.B, err = encoding.DecompressZSTDLimited(bb.B[:0], data, maxInsertRequestSize.IntN())
		if err != nil {
			return fmt.Errorf("cannot decompress zstd-encoded request with length %d: %w", len(data), err)
		}
	default:
		// NOTE(review): the zstd branch passes bb.B[:0] while this branch passes bb.B as dst;
		// confirm this matches how snappy.Decode uses its dst argument.
		bb.B, err = snappy.Decode(bb.B, data, maxInsertRequestSize.IntN())
		if err != nil {
			return fmt.Errorf("cannot decompress snappy-encoded request with length %d: %w", len(data), err)
		}
	}

	if unpackedSize := int64(len(bb.B)); unpackedSize > maxInsertRequestSize.N {
		return fmt.Errorf("too big unpacked request; mustn't exceed `-maxInsertRequestSize=%d` bytes; got %d bytes", maxInsertRequestSize.N, len(bb.B))
	}

	wru := getWriteRequestUnmarshaler()
	defer putWriteRequestUnmarshaler(wru)

	// Count the parsed rows before handing every batch over to the caller.
	onBatch := func(tss []TimeSerie) {
		rowsRead.Add(len(tss))
		callback(tss)
	}
	err = wru.UnmarshalProtobuf(bb.B, groupLabels, onBatch)
	if err != nil {
		unmarshalErrors.Inc()
		return fmt.Errorf("cannot unmarshal prompb.WriteRequest with size %d bytes: %w", len(bb.B), err)
	}
	return nil
}
// bodyBufferPool provides the buffers parseRequestBody decompresses request bodies into.
var bodyBufferPool bytesutil.ByteBufferPool
var (
// readCalls is incremented on every Parse call.
readCalls = metrics.NewCounter(`vm_protoparser_read_calls_total{type="promremotewrite"}`)
// readErrors is incremented when Parse fails to read or process the request.
readErrors = metrics.NewCounter(`vm_protoparser_read_errors_total{type="promremotewrite"}`)
// rowsRead is increased by the number of series in every batch passed to the callback.
rowsRead = metrics.NewCounter(`vm_protoparser_rows_read_total{type="promremotewrite"}`)
// unmarshalErrors is incremented when the decompressed body cannot be unmarshaled.
unmarshalErrors = metrics.NewCounter(`vm_protoparser_unmarshal_errors_total{type="promremotewrite"}`)
)

View File

@@ -1,67 +0,0 @@
package protoparser
import (
"bytes"
"fmt"
"strings"
"testing"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompb"
"github.com/golang/snappy"
)
// BenchmarkParse measures Parse on a snappy-encoded WriteRequest
// with 5000 series, 20 labels per series and 3 distinct groupLabel values.
func BenchmarkParse(b *testing.B) {
	groupLabels := []string{"foo", "bar", "baz", "__name__", "job", "groupLabel"}
	payload := buildSnappyEncodedWriteRequest(5000, 20, 20, 3)

	// seriesTotal accumulates the number of parsed series across all iterations.
	var seriesTotal int

	b.ResetTimer()
	b.ReportAllocs()
	b.SetBytes(int64(len(payload)))
	for b.Loop() {
		if err := Parse(bytes.NewReader(payload), groupLabels, func(tss []TimeSerie) {
			seriesTotal += len(tss)
		}); err != nil {
			b.Fatalf("stream.Parse: %v", err)
		}
	}
}
// buildSnappyEncodedWriteRequest returns a snappy-encoded protobuf WriteRequest
// containing numSeries time series. Every series carries numLabels generated labels
// whose values end with labelSize 'x' bytes, one extra "groupLabel" label taking
// groupsNum distinct values, and a single sample.
func buildSnappyEncodedWriteRequest(numSeries, numLabels, labelSize, groupsNum int) []byte {
	valueSuffix := strings.Repeat("x", labelSize)
	series := make([]prompb.TimeSeries, numSeries)
	for i := 0; i < numSeries; i++ {
		lbls := make([]prompb.Label, numLabels)
		for j := 0; j < numLabels; j++ {
			lbls[j].Name = fmt.Sprintf("label%02d", j)
			lbls[j].Value = fmt.Sprintf("val%05d_%s", i, valueSuffix)
		}
		groupLabel := prompb.Label{
			Name:  "groupLabel",
			Value: fmt.Sprintf("%d", i%groupsNum),
		}
		series[i].Labels = append(lbls, groupLabel)
		series[i].Samples = []prompb.Sample{{Value: 1, Timestamp: 1000}}
	}
	req := prompb.WriteRequest{Timeseries: series}
	return snappy.Encode(nil, req.MarshalProtobuf(nil))
}

View File

@@ -1,170 +0,0 @@
package protoparser
import (
"fmt"
"slices"
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/easyproto"
"github.com/cespare/xxhash/v2"
)
// TimeSerie is the reduced form of a remote-write time series produced by the parser.
type TimeSerie struct {
// GroupLabels holds the series labels whose names were listed in the groupLabels passed to the parser.
GroupLabels []Label
// Fingerprint is the xxhash sum over the raw bytes of all label messages of the series.
Fingerprint uint64
}
// Label is a single name/value label pair of a time series.
type Label struct {
Name string
Value string
}
// getWriteRequestUnmarshaler returns an unmarshaler from wruPool, or a freshly
// allocated one with preallocated series and label buffers when the pool is empty.
//
// Return it with putWriteRequestUnmarshaler once it is no longer needed.
func getWriteRequestUnmarshaler() *writeRequestUnmarshaler {
	if pooled := wruPool.Get(); pooled != nil {
		return pooled.(*writeRequestUnmarshaler)
	}
	wru := new(writeRequestUnmarshaler)
	wru.tss = make([]TimeSerie, 0, 1024)
	wru.labelsPool = make([]Label, 0, 4096)
	wru.d = xxhash.New()
	return wru
}
// putWriteRequestUnmarshaler resets wru and puts it back into wruPool for reuse.
func putWriteRequestUnmarshaler(wru *writeRequestUnmarshaler) {
	wru.Reset()
	wruPool.Put(wru)
}
// wruPool holds *writeRequestUnmarshaler objects for reuse across requests.
var wruPool sync.Pool
// writeRequestUnmarshaler is reusable unmarshaler for WriteRequest protobuf messages.
//
// It keeps internal buffers for parsed series and their labels to reduce memory allocations.
// See UnmarshalProtobuf for details on how to use it.
type writeRequestUnmarshaler struct {
// tss is the batch buffer for parsed series; it is flushed to the callback when full.
tss []TimeSerie
// labelsPool backs the GroupLabels slices of the series stored in tss.
labelsPool []Label
// d is the hash state used for computing series fingerprints; it is reset before every series.
d *xxhash.Digest
}
// Reset truncates the series and label buffers and resets the hash state,
// so wru could be re-used. The allocated capacity of the buffers is kept.
func (wru *writeRequestUnmarshaler) Reset() {
	wru.d.Reset()
	wru.labelsPool = wru.labelsPool[:0]
	wru.tss = wru.tss[:0]
}
// UnmarshalProtobuf parses the protobuf-encoded WriteRequest from src and passes
// the parsed series to callback in batches.
//
// Only labels whose names are listed in groupLabels are kept in TimeSerie.GroupLabels.
// callback is called every time the internal series buffer becomes full and once more
// for the remaining series. The tss slice and its labels are reused after callback returns.
//
// An error is returned if src cannot be decoded; series parsed since the last
// flush are not passed to callback in this case.
func (wru *writeRequestUnmarshaler) UnmarshalProtobuf(src []byte, groupLabels []string, callback func(tss []TimeSerie)) error {
wru.Reset()
var err error
tss := wru.tss
// message WriteRequest {
// repeated TimeSeries timeseries = 1;
// reserved 2;
// repeated Metadata metadata = 3;
// }
labelsPool := wru.labelsPool
var fc easyproto.FieldContext
for len(src) > 0 {
// Flush the batch once tss has reached its capacity, so the reslice below
// always stays within the already allocated buffer.
if len(tss) >= cap(tss) {
callback(tss)
tss = tss[:0]
labelsPool = labelsPool[:0]
}
src, err = fc.NextField(src)
if err != nil {
return fmt.Errorf("cannot read the next field: %w", err)
}
// Only the timeseries field is handled; all the other fields are skipped.
switch fc.FieldNum {
case 1:
data, ok := fc.MessageData()
if !ok {
return fmt.Errorf("cannot read timeseries data")
}
// Extend tss by one element in place and fill this element.
tss = tss[:len(tss)+1]
ts := &tss[len(tss)-1]
// The fingerprint is calculated per series, so start from a clean hash state.
d := wru.d
d.Reset()
labelsPool, err = ts.unmarshalProtobuf(data, groupLabels, labelsPool, d)
if err != nil {
return fmt.Errorf("cannot unmarshal timeseries: %w", err)
}
}
}
// Pass the remaining series to the callback.
if len(tss) > 0 {
callback(tss)
tss = tss[:0]
labelsPool = labelsPool[:0]
}
// Keep the (possibly grown) buffers for subsequent calls.
wru.tss = tss[:0]
wru.labelsPool = labelsPool
wru.d.Reset()
return nil
}
// unmarshalProtobuf parses the protobuf-encoded TimeSeries message from src into ts.
//
// The raw bytes of every label message are written into d in the order they appear in src,
// and the resulting sum is stored in ts.Fingerprint; d isn't reset here, so the caller
// is responsible for resetting it before the call. Labels whose names are listed in
// groupLabels are appended to labelsPool, and ts.GroupLabels is set to the appended part.
// Samples and any other fields are skipped.
//
// It returns the updated labelsPool, which must be passed to the next call.
func (ts *TimeSerie) unmarshalProtobuf(src []byte, groupLabels []string, labelsPool []Label, d *xxhash.Digest) ([]Label, error) {
// message TimeSeries {
// repeated Label labels = 1;
// repeated Sample samples = 2;
// }
// Remember where the labels of this series start in labelsPool.
labelsPoolLen := len(labelsPool)
var fc easyproto.FieldContext
var lfc easyproto.FieldContext
for len(src) > 0 {
var err error
src, err = fc.NextField(src)
if err != nil {
return labelsPool, fmt.Errorf("cannot read the next field: %w", err)
}
switch fc.FieldNum {
case 1:
data, ok := fc.MessageData()
if !ok {
return labelsPool, fmt.Errorf("cannot read label data")
}
// Parse the nested Label message: field 1 is the name and field 2 is the value.
var nameBytes, valueBytes []byte
ldata := data
for len(ldata) > 0 {
ldata, err = lfc.NextField(ldata)
if err != nil {
return labelsPool, fmt.Errorf("cannot read label field: %w", err)
}
switch lfc.FieldNum {
case 1:
nameBytes, ok = lfc.Bytes()
if !ok {
return labelsPool, fmt.Errorf("cannot read label name")
}
case 2:
valueBytes, ok = lfc.Bytes()
if !ok {
return labelsPool, fmt.Errorf("cannot read label value")
}
}
}
// Every label contributes to the fingerprint, including labels not listed in groupLabels.
_, _ = d.Write(data)
// NOTE(review): ToUnsafeString presumably returns a string sharing memory with src
// instead of copying it, so the labels are valid only while src is unchanged - confirm.
name := bytesutil.ToUnsafeString(nameBytes)
if slices.Contains(groupLabels, name) {
labelsPool = append(labelsPool, Label{
Name: name,
Value: bytesutil.ToUnsafeString(valueBytes),
})
}
}
}
ts.GroupLabels = labelsPool[labelsPoolLen:]
ts.Fingerprint = d.Sum64()
return labelsPool, nil
}

View File

@@ -1,86 +0,0 @@
package protoparser
import (
"fmt"
"strings"
"testing"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompb"
)
// BenchmarkWriteRequest_UnmarshalProtobuf measures writeRequestUnmarshaler.UnmarshalProtobuf
// on uncompressed protobuf WriteRequest messages of various shapes.
func BenchmarkWriteRequest_UnmarshalProtobuf(b *testing.B) {
	// scratch is the shared destination buffer reused for building the payload of every sub-benchmark.
	scratch := make([]byte, 0, 21_000_000)

	runCase := func(rows, labels, labelSize, groupBy int) {
		caseName := fmt.Sprintf("Rows=%d/Labels=%d/LabelSize=%d/GroupBy=%d", rows, labels, labelSize, groupBy)
		b.Run(caseName, func(b *testing.B) {
			payload := buildEncodedWriteRequest(scratch, rows, labels, labelSize, groupBy)
			groupLabels := []string{"foo", "bar", "baz", "__name__", "job", "groupLabel"}
			wru := getWriteRequestUnmarshaler()
			seriesTotal := 0

			b.ResetTimer()
			b.ReportAllocs()
			b.SetBytes(int64(len(payload)))
			for b.Loop() {
				wru.Reset()
				err := wru.UnmarshalProtobuf(payload, groupLabels, func(tss []TimeSerie) {
					seriesTotal += len(tss)
				})
				if err != nil {
					b.Fatalf("unexpected error: %s", err)
				}
			}
		})
	}

	runCase(5_000, 0, 0, 3)
	runCase(5_000, 1, 20, 3)
	runCase(1_000, 20, 20, 3)
	runCase(5_000, 20, 20, 3)
	runCase(10_000, 20, 20, 3)
	runCase(20_000, 20, 20, 3)
	// long label values
	runCase(1_000, 20, 2000, 3)
	// many labels
	runCase(1_000, 2000, 100, 3)
}
// buildEncodedWriteRequest marshals a protobuf WriteRequest into dst[:0] and returns the result.
//
// The request contains numSeries time series. Every series carries numLabels generated
// labels whose values end with labelSize 'x' bytes, one extra "groupLabel" label taking
// groupsNum distinct values, and a single sample.
func buildEncodedWriteRequest(dst []byte, numSeries, numLabels, labelSize, groupsNum int) []byte {
	valueSuffix := strings.Repeat("x", labelSize)
	series := make([]prompb.TimeSeries, numSeries)
	for i := 0; i < numSeries; i++ {
		lbls := make([]prompb.Label, numLabels)
		for j := 0; j < numLabels; j++ {
			lbls[j].Name = fmt.Sprintf("label%02d", j)
			lbls[j].Value = fmt.Sprintf("val%05d_%s", i, valueSuffix)
		}
		groupLabel := prompb.Label{
			Name:  "groupLabel",
			Value: fmt.Sprintf("%d", i%groupsNum),
		}
		series[i].Labels = append(lbls, groupLabel)
		series[i].Samples = []prompb.Sample{{Value: 1, Timestamp: 1000}}
	}
	req := prompb.WriteRequest{Timeseries: series}
	return req.MarshalProtobuf(dst[:0])
}

View File

@@ -59,7 +59,7 @@ services:
- '--external.alert.source=explore?orgId=1&left=["now-1h","now","VictoriaMetrics",{"expr": },{"mode":"Metrics"},{"ui":[true,true,true,"none"]}]'
restart: always
vmanomaly:
image: victoriametrics/vmanomaly:v1.29.5
image: victoriametrics/vmanomaly:v1.29.6
depends_on:
- "victoriametrics"
ports:

View File

@@ -14,6 +14,13 @@ aliases:
---
Please find the changelog for VictoriaMetrics Anomaly Detection below.
## v1.29.6
Released: 2026-06-17
- BUGFIX: Fixed `VLogsReader` startup and query execution when `tenant_id` is omitted or provided in short account-only form such as `"0"`. Omitted or empty tenant IDs are treated as single-node/no-tenant mode, and account-only tenant IDs are expanded to `accountID:0` before adding VictoriaLogs `AccountID`/`ProjectID` params or VM tenant labels.
- BUGFIX: Hardened [`OnlineMADModel`](https://docs.victoriametrics.com/anomaly-detection/components/models/#online-mad) anomaly scoring for perfectly constant time series (all values identical). The model now keeps a small deterministic prediction interval when the learned MAD is zero, so values deviating from an unknown constant baseline can produce `anomaly_score > 1` (previously, all anomaly scores were `0`).
## v1.29.5
Released: 2026-06-11

View File

@@ -423,7 +423,7 @@ services:
# ...
vmanomaly:
container_name: vmanomaly
image: victoriametrics/vmanomaly:v1.29.5
image: victoriametrics/vmanomaly:v1.29.6
# ...
restart: always
volumes:
@@ -641,7 +641,7 @@ options:
Here's an example of using the config splitter to divide configurations based on the `extra_filters` argument from the reader section:
```sh
docker pull victoriametrics/vmanomaly:v1.29.5 && docker image tag victoriametrics/vmanomaly:v1.29.5 vmanomaly
docker pull victoriametrics/vmanomaly:v1.29.6 && docker image tag victoriametrics/vmanomaly:v1.29.6 vmanomaly
```
```sh

View File

@@ -45,7 +45,7 @@ There are 2 types of compatibility to consider when migrating in stateful mode:
| Group start | Group end | Compatibility | Notes |
|---------|--------- |------------|-------|
| [v1.29.1](https://docs.victoriametrics.com/anomaly-detection/changelog/#v1291) | [v1.29.5](https://docs.victoriametrics.com/anomaly-detection/changelog/#v1295) | Fully Compatible | - |
| [v1.29.1](https://docs.victoriametrics.com/anomaly-detection/changelog/#v1291) | [v1.29.6](https://docs.victoriametrics.com/anomaly-detection/changelog/#v1296) | Fully Compatible | - |
| [v1.28.7](https://docs.victoriametrics.com/anomaly-detection/changelog/#v1287) | [v1.29.0](https://docs.victoriametrics.com/anomaly-detection/changelog/#v1290) | Partially compatible* | Dumped models of class [prophet](https://docs.victoriametrics.com/anomaly-detection/components/models/#prophet) and [seasonal quantile](https://docs.victoriametrics.com/anomaly-detection/components/models/#online-seasonal-quantile) have problems with loading to [v1.29.0](https://docs.victoriametrics.com/anomaly-detection/changelog/#v1290) due to dropped `pytz` library. **Upgrading directly from v1.28.7 to [v1.29.1](https://docs.victoriametrics.com/anomaly-detection/changelog/#v1291) with a fix is suggested** |
| [v1.26.0](https://docs.victoriametrics.com/anomaly-detection/changelog/#v1262) | [v1.28.7](https://docs.victoriametrics.com/anomaly-detection/changelog/#v1287) | Fully Compatible | [v1.28.0](https://docs.victoriametrics.com/anomaly-detection/changelog/#v1280) introduced [rolling](https://docs.victoriametrics.com/anomaly-detection/components/models/#rolling-models) model class drop in favor of [online](https://docs.victoriametrics.com/anomaly-detection/components/models/#online-models) models (`rolling_quantile` and `std` models), however, it does not impact compatibility, as artifacts were not produced by default for rolling models. Also, offline `mad` and `zscore` models are redirecting to their respective online counterparts since [v1.28.4](https://docs.victoriametrics.com/anomaly-detection/changelog/#v1284). |
| [v1.25.3](https://docs.victoriametrics.com/anomaly-detection/changelog/#v1253) | [v1.26.0](https://docs.victoriametrics.com/anomaly-detection/changelog/#v1270) | Partially Compatible* | [v1.25.3](https://docs.victoriametrics.com/anomaly-detection/changelog/#v1253) introduced `forecast_at` argument for base [univariate](https://docs.victoriametrics.com/anomaly-detection/components/models/#univariate-models) and `Prophet` [models](https://docs.victoriametrics.com/anomaly-detection/components/models/#prophet), however, itself remains backward-reversible from newer states like [v1.26.2](https://docs.victoriametrics.com/anomaly-detection/changelog/#v1262), [v1.27.0](https://docs.victoriametrics.com/anomaly-detection/changelog/#v1270). (All models except `isolation_forest_multivariate` class will be dropped) |

View File

@@ -132,7 +132,7 @@ Below are the steps to get `vmanomaly` up and running inside a Docker container:
1. Pull Docker image:
```sh
docker pull victoriametrics/vmanomaly:v1.29.5
docker pull victoriametrics/vmanomaly:v1.29.6
```
2. Create the license file with your license key.
@@ -152,7 +152,7 @@ docker run -it \
-v ./license:/license \
-v ./config.yaml:/config.yaml \
-p 8490:8490 \
victoriametrics/vmanomaly:v1.29.5 \
victoriametrics/vmanomaly:v1.29.6 \
/config.yaml \
--licenseFile=/license \
--loggerLevel=INFO \
@@ -169,7 +169,7 @@ docker run -it \
-e VMANOMALY_DATA_DUMPS_DIR=/tmp/vmanomaly/data \
-e VMANOMALY_MODEL_DUMPS_DIR=/tmp/vmanomaly/models \
-p 8490:8490 \
victoriametrics/vmanomaly:v1.29.5 \
victoriametrics/vmanomaly:v1.29.6 \
/config.yaml \
--licenseFile=/license \
--loggerLevel=INFO \
@@ -182,7 +182,7 @@ services:
# ...
vmanomaly:
container_name: vmanomaly
image: victoriametrics/vmanomaly:v1.29.5
image: victoriametrics/vmanomaly:v1.29.6
# ...
restart: always
volumes:

View File

@@ -315,7 +315,7 @@ docker run -it --rm \
-e VMANOMALY_MCP_SERVER_URL=http://mcp-vmanomaly:8081/mcp \
-p 8080:8080 \
-p 8490:8490 \
victoriametrics/vmanomaly:v1.29.5 \
victoriametrics/vmanomaly:v1.29.6 \
vmanomaly_config.yaml
```

View File

@@ -1265,7 +1265,7 @@ monitoring:
Let's pull the docker image for `vmanomaly`:
```sh
docker pull victoriametrics/vmanomaly:v1.29.5
docker pull victoriametrics/vmanomaly:v1.29.6
```
Now we can run the docker container putting as volumes both config and model file:
@@ -1279,7 +1279,7 @@ docker run -it \
-v $(PWD)/license:/license \
-v $(PWD)/custom_model.py:/vmanomaly/model/custom.py \
-v $(PWD)/custom.yaml:/config.yaml \
victoriametrics/vmanomaly:v1.29.5 /config.yaml \
victoriametrics/vmanomaly:v1.29.6 /config.yaml \
--licenseFile=/license \
--watch
```

View File

@@ -395,7 +395,7 @@ services:
restart: always
vmanomaly:
container_name: vmanomaly
image: victoriametrics/vmanomaly:v1.29.5
image: victoriametrics/vmanomaly:v1.29.6
depends_on:
- "victoriametrics"
ports:

View File

@@ -1,100 +0,0 @@
---
weight: 3
menu:
docs:
parent: victoriametrics
weight: 13
title: vmestimator
tags:
- metrics
- cardinality
aliases:
- /vmestimator.html
- /vmestimator/index.html
- /vmestimator/
---
`vmestimator` is a cardinality estimator that receives Prometheus remote write streams
and exposes approximate time series cardinality as metrics (TODO: support remote write).
It is useful for tracking how many unique time series are flowing through the system: in total, per metric name, or broken down by specific labels.
## How it works
Running:
```
go run ./app/vmestimator/... -config=streams.yaml -httpListenAddr=:8490
```
Configuration:
```yaml
streams:
# Track total cardinality with no grouping.
- interval: '1h'
# Track cardinality grouped by metric name.
- interval: '1h'
group_by: ["__name__"]
# Track cardinality grouped by job label.
- interval: '1m'
group_by: ["job"]
# Track cardinality grouped by tenant info
- group_by: ["vm_account_id", "vm_project_id"]
# Track cardinality of jobs, with extra labels on the output metrics.
- group_by: ["job"]
labels:
region: 'eu-central-1'
env: 'production'
```
Fields:
- `group_by` (optional): list of label names to split cardinality by; each distinct combination gets its own estimate
- `group_limit` (optional): maximum number of distinct groups to track; excess groups are counted in a rejected sketch but not individually; defaults to `10000`
- `buckets` (optional): number of internal shards for parallel ingestion; defaults to `min(64, 2*availableCPUs)`
- `labels` (optional): extra labels attached to all output metrics for this estimator
- `interval` (optional): how often to rotate (reset) counters; defaults to `5m`
- `hll_precision` (optional): HyperLogLog precision, must be in range `[4, 18]`; higher values yield more accurate estimates at the cost of more memory; defaults to `14`
- `hll_sparse` (optional): whether to use sparse HyperLogLog representation, which reduces memory for low-cardinality groups; defaults to `true`
## Metrics
By default, cardinality estimates are merged with regular metrics and exposed at `/metrics`.
This behavior is controlled by the following flags:
- `-cardinalityMetrics.cacheTTL` (default `30s`): how long to cache the cardinality metrics response before recomputing it
The HTTP endpoint is controlled by the `-cardinalityMetrics.exposeAt` flag:
- `-cardinalityMetrics.exposeAt=/metrics` (default): cardinality metrics merged with regular metrics at `/metrics`
- `-cardinalityMetrics.exposeAt=/cardinality/metrics`: only cardinality metrics exposed at that path
- `-cardinalityMetrics.exposeAt=`: cardinality metrics not exposed via HTTP
All metrics include the `interval` and `group_by_keys` labels; metrics produced with grouping also include `group_by_values`. Extra labels from the `labels` config field are inserted between `interval` and `group_by_keys` (sorted alphabetically).
**Without grouping** (`group_by_keys` is `__global__` and `group_by_values` is not set):
```
cardinality_estimate{interval="1h0m0s",group_by_keys="__global__"} 142300
```
**With grouping** — one summary line (total distinct group count) plus one line per distinct label value combination. Each per-group line also includes individual `by_{key}="{val}"` labels for each group key:
```
cardinality_estimate{interval="5m0s",group_by_keys="__group__",group_by_values="instance,job"} 2
cardinality_estimate{interval="5m0s",group_by_keys="instance,job",group_by_values="host1:9090,prometheus",by_instance="host1:9090",by_job="prometheus"} 312
cardinality_estimate{interval="5m0s",group_by_keys="instance,job",group_by_values="host2:9100,node",by_instance="host2:9100",by_job="node"} 87
```
**With extra labels:**
```
cardinality_estimate{interval="5m0s",env="production",region="eu-central-1",group_by_keys="job",group_by_values="prometheus",by_job="prometheus"} 312
```
## Operational metrics
When grouping is enabled, vmestimator exposes per-bucket operational metrics at `/metrics`:
- `vmestimator_estimator_group_size{group_by_keys, bucket}` — number of active groups in this bucket after the last rotation
- `vmestimator_estimator_group_rejected_size{group_by_keys}` — estimated number of distinct group values rejected since the last rotation because `group_limit` was reached
- `vmestimator_estimator_group_limit{group_by_keys, bucket}` — configured `group_limit` for this bucket

5
go.mod
View File

@@ -2,8 +2,6 @@ module github.com/VictoriaMetrics/VictoriaMetrics
go 1.26.4
replace github.com/axiomhq/hyperloglog => github.com/makasim/hyperloglog v0.0.10-reuse-memory
require (
cloud.google.com/go/storage v1.62.3
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.22.0
@@ -18,11 +16,9 @@ require (
github.com/aws/aws-sdk-go-v2/config v1.32.25
github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.22.27
github.com/aws/aws-sdk-go-v2/service/s3 v1.103.3
github.com/axiomhq/hyperloglog v0.0.0-00010101000000-000000000000
github.com/bmatcuk/doublestar/v4 v4.10.0
github.com/cespare/xxhash/v2 v2.3.0
github.com/cheggaaa/pb/v3 v3.1.7
github.com/dgryski/go-metro v0.0.0-20250106013310-edb8663e5e33
github.com/gogo/protobuf v1.3.2
github.com/golang/snappy v1.0.0
github.com/google/go-cmp v0.7.0
@@ -101,7 +97,6 @@ require (
github.com/hashicorp/go-version v1.9.0 // indirect
github.com/jpillora/backoff v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/kamstrup/intmap v0.5.2 // indirect
github.com/knadh/koanf/maps v0.1.2 // indirect
github.com/knadh/koanf/providers/confmap v1.0.0 // indirect
github.com/knadh/koanf/v2 v2.3.5 // indirect

8
go.sum
View File

@@ -52,6 +52,8 @@ github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/resourcemapp
github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/resourcemapping v0.57.0/go.mod h1:YqwkQPrWSC7+byyc1VlKbWLBF5JsW5IoL6xUkemYSXk=
github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY=
github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU=
github.com/VictoriaMetrics/VictoriaLogs v1.50.1-0.20260415124154-6b7a6357aec0 h1:2x1Tszv41PnCdSMumEtejz/On1RQ45kHQ+hhKT53sOk=
github.com/VictoriaMetrics/VictoriaLogs v1.50.1-0.20260415124154-6b7a6357aec0/go.mod h1:fQtmzaSUL+HJmHozeAKmnTJTOMBT+vBccv/VWQEwhUQ=
github.com/VictoriaMetrics/VictoriaLogs v1.121.1-0.20260616132739-c901a1e31cb3 h1:3eP8RRZitbga5EYiQ3IANrMPxpBwMAX4VA6akDaXwpU=
github.com/VictoriaMetrics/VictoriaLogs v1.121.1-0.20260616132739-c901a1e31cb3/go.mod h1:H4sDxcvk6OmC6zOt++IlDyrwfbn4F1eSLwMpR+kpRt8=
github.com/VictoriaMetrics/easyproto v1.2.0 h1:FJT9uNXA2isppFuJErbLqD306KoFlehl7Wn2dg/6oIE=
@@ -150,8 +152,6 @@ github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dennwc/varint v1.0.0 h1:kGNFFSSw8ToIy3obO/kKr8U9GZYUAxQEVuix4zfDWzE=
github.com/dennwc/varint v1.0.0/go.mod h1:hnItb35rvZvJrbTALZtY/iQfDs48JKRG1RPpgziApxA=
github.com/dgryski/go-metro v0.0.0-20250106013310-edb8663e5e33 h1:ucRHb6/lvW/+mTEIGbvhcYU3S8+uSNkuMjx/qZFfhtM=
github.com/dgryski/go-metro v0.0.0-20250106013310-edb8663e5e33/go.mod h1:c9O8+fpSOX1DM8cPNSkX/qsBWdkD4yd2dpciOWQjpBw=
github.com/digitalocean/godo v1.193.0 h1:CSbbUl5LufT75KPNvex3vDnBYjY2RfJWs7T3Ac7dHpA=
github.com/digitalocean/godo v1.193.0/go.mod h1:xQsWpVCCbkDrWisHA72hPzPlnC+4W5w/McZY5ij9uvU=
github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk=
@@ -294,8 +294,6 @@ github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2E
github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/kamstrup/intmap v0.5.2 h1:qnwBm1mh4XAnW9W9Ue9tZtTff8pS6+s6iKF6JRIV2Dk=
github.com/kamstrup/intmap v0.5.2/go.mod h1:gWUVWHKzWj8xpJVFf5GC0O26bWmv3GqdnIX/LMT6Aq4=
github.com/keybase/go-keychain v0.0.1 h1:way+bWYa6lDppZoZcgMbYsvC7GxljxrskdNInRtuthU=
github.com/keybase/go-keychain v0.0.1/go.mod h1:PdEILRW3i9D8JcdM+FmY6RwkHGnhHxXwkPPMeUgOK1k=
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
@@ -318,8 +316,6 @@ github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
github.com/linode/linodego v1.69.1 h1:f45N2MHR/oece2/ktTTCYmrlfse4//k3NgwcF5zbGZ0=
github.com/linode/linodego v1.69.1/go.mod h1:Fha0NYsQSx5VZK1HQNJY/z/dIxxkFp+vb5veawbmAUw=
github.com/makasim/hyperloglog v0.0.10-reuse-memory h1:tqMXSDlkVujI/aGYUm6uwt4lRUQcne22MOLcJBgLAGc=
github.com/makasim/hyperloglog v0.0.10-reuse-memory/go.mod h1:YjX/dQqCR/7QYX0g8mu8UZAjpIenz1FKM71UEsjFoTo=
github.com/mattn/go-colorable v0.1.15 h1:+u9SLTRGnXv73cEsnsmoZBom+dMU88B2M0aDcWy0/jY=
github.com/mattn/go-colorable v0.1.15/go.mod h1:6LmQG8QLFO4G5z1gPvYEzlUgJ2wF+stgPZH1UqBm1s8=
github.com/mattn/go-isatty v0.0.22 h1:j8l17JJ9i6VGPUFUYoTUKPSgKe/83EYU2zBC7YNKMw4=

View File

@@ -1,16 +0,0 @@
# Binaries for programs and plugins
*.exe
*.dll
*.so
*.dylib
# Test binary, build with `go test -c`
*.test
# Output of the go coverage tool, specifically when used with LiteIDE
*.out
# Project-local glide cache, RE: https://github.com/Masterminds/glide/issues/736
.glide/
/vendor

View File

@@ -1,41 +0,0 @@
## How to Contribute
👍🎉 First of all, thank you for your interest in Axiom-node! We'd love to accept your patches and contributions! 🎉👍
This project accepts contributions. In order to contribute, you should pay attention to a few guidelines:
## Reporting Issues
Bugs, feature requests, and development-related questions should be directed to our [GitHub issue tracker](https://github.com/axiomhq/hyperloglog/issues).
When reporting a bug, please try and provide as much context as possible such as your operating system, Go version and anything else that might be relevant to the bug. For feature requests, please explain what you're trying to do and how the requested feature would help you do that.
## Setup
[Fork](https://github.com/axiomhq/hyperloglog.git), then clone this repository:
```
git clone https://github.com/axiomhq/hyperloglog.git
cd hyperloglog
cd demo
go run hyperloglog_demo.go
```
## Submitting Modifications
1. It's generally best to start by opening a new issue describing the bug or feature you're intending to fix. Even if you think it's relatively minor, it's helpful to know what people are working on. Mention in the initial issue that you are planning to work on that bug or feature so that it can be assigned to you.
2. Follow the normal process of [forking](https://docs.github.com/en/free-pro-team@latest/github/getting-started-with-github/fork-a-repo) the project, and setup a new branch to work in. It's important that each group of changes be done in separate branches in order to ensure that a pull request only includes the commits related to that bug or feature.
3. Go makes it very simple to ensure properly formatted code, so always run `go fmt` on your code before committing it.
4. Do your best to have [well-formatted commit messages](https://tbaggery.com/2008/04/19/a-note-about-git-commit-messages.html)
for each change. This provides consistency throughout the project and ensures that commit messages are able to be formatted properly by various git tools.
5. Finally, push the commits to your fork and submit a [pull request](https://docs.github.com/en/free-pro-team@latest/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request)
### Once you've filed the PR:
- One or more maintainers will use GitHub's review feature to review your PR.
- If the maintainer asks for any changes, edit your changes, push, and ask for another review.
- If the maintainer decides to suggest some improvements or alternatives, modify and make improvements. Once your changes are approved, one of the project maintainers will merge them.

View File

@@ -1,19 +0,0 @@
Copyright (c) 2021, Axiom, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -1,51 +0,0 @@
# HyperLogLog - an algorithm for approximating the number of distinct elements
[![GoDoc](https://godoc.org/github.com/axiomhq/hyperloglog?status.svg)](https://godoc.org/github.com/axiomhq/hyperloglog) [![Go Report Card](https://goreportcard.com/badge/github.com/axiomhq/hyperloglog)](https://goreportcard.com/report/github.com/axiomhq/hyperloglog) [![CircleCI](https://circleci.com/gh/axiomhq/hyperloglog/tree/master.svg?style=svg)](https://circleci.com/gh/axiomhq/hyperloglog/tree/master)
An improved version of [HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog) for the count-distinct problem, approximating the number of distinct elements in a multiset. This implementation offers enhanced performance, flexibility, and simplicity while maintaining accuracy.
## Note on Implementation History
The initial version of this work (tagged as v0.1.0) was based on ["Better with fewer bits: Improving the performance of cardinality estimation of large data streams - Qingjun Xiao, You Zhou, Shigang Chen"](https://www.cise.ufl.edu/~sgchen/Publications/XZC17.pdf). However, the current implementation has evolved significantly from this original basis, notably moving away from the tailcut method.
## Current Implementation
The current implementation is based on the LogLog-Beta algorithm, as described in:
["LogLog-Beta and More: A New Algorithm for Cardinality Estimation Based on LogLog Counting"](https://arxiv.org/pdf/1612.02284) by Jason Qin, Denys Kim, and Yumei Tung (2016).
Key features of the current implementation:
* **Metro hash** used instead of xxhash
* **Sparse representation** for lower cardinalities (like HyperLogLog++)
* **LogLog-Beta** for dynamic bias correction across all cardinalities
* **8-bit registers** for convenience and simplified implementation
* **Order-independent insertions and merging** for consistent results regardless of data input order
* **Removal of tailcut method** for a more straightforward approach
* **Flexible precision** allowing for 2^4 to 2^18 registers
This implementation is now more straightforward, efficient, and flexible, while remaining backwards compatible with previous versions. It provides a balance between precision, memory usage, speed, and ease of use.
## Precision and Memory Usage
This implementation allows for creating HyperLogLog sketches with arbitrary precision between 2^4 and 2^18 registers. The memory usage scales with the number of registers:
* Minimum (2^4 registers): 16 bytes
* Default (2^14 registers): 16 KB
* Maximum (2^18 registers): 256 KB
Users can choose the precision that best fits their use case, balancing memory usage against estimation accuracy.
## Note
A big thank you to Prof. Shigang Chen and his team at the University of Florida who are actively conducting research around "Big Network Data".
## Contributing
Kindly check our [contributing guide](https://github.com/axiomhq/hyperloglog/blob/main/Contributing.md) to learn how to propose bugfixes and improvements and how to submit pull requests to the project.
## License
&copy; Axiom, Inc., 2024
Distributed under MIT License (`The MIT License`).
See [LICENSE](LICENSE) for more information.

View File

@@ -1,273 +0,0 @@
package hyperloglog
import (
"fmt"
"math"
)
// betaMap maps a sketch precision p (4 through 18) to the function that
// computes the bias-correction term for that precision. It is consulted by
// beta, which panics for any precision that has no entry here.
var betaMap = map[uint8]func(float64) float64{
4: beta4,
5: beta5,
6: beta6,
7: beta7,
8: beta8,
9: beta9,
10: beta10,
11: beta11,
12: beta12,
13: beta13,
14: beta14,
15: beta15,
16: beta16,
17: beta17,
18: beta18,
}
// beta returns the bias-correction term for precision p, evaluated at ez.
// It dispatches through betaMap and panics when p has no registered
// polynomial (betaMap covers precisions 4 through 18).
func beta(p uint8, ez float64) float64 {
	if correction, found := betaMap[p]; found {
		return correction(ez)
	}
	panic(fmt.Sprintf("invalid precision %d", p))
}
/*
beta4 returns the bias-correction term for precision p=4.

It evaluates c0*ez + c1*l + c2*l^2 + ... + c7*l^7 with l = ln(ez + 1) and the
coefficients c0..c7:
[-0.582581413904517,-1.935300357560050,11.079323758035073,-22.131357446444323,22.505391846630037,-12.000723834917984,3.220579408194167,-0.342225302271235]
*/
func beta4(ez float64) float64 {
	lnz := math.Log(ez + 1)
	return -0.582581413904517*ez +
		-1.935300357560050*lnz +
		11.079323758035073*math.Pow(lnz, 2) +
		-22.131357446444323*math.Pow(lnz, 3) +
		22.505391846630037*math.Pow(lnz, 4) +
		-12.000723834917984*math.Pow(lnz, 5) +
		3.220579408194167*math.Pow(lnz, 6) +
		-0.342225302271235*math.Pow(lnz, 7)
}
/*
beta5 returns the bias-correction term for precision p=5.

It evaluates c0*ez + c1*l + c2*l^2 + ... + c7*l^7 with l = ln(ez + 1) and the
coefficients c0..c7:
[-0.7518999460733967,-0.9590030077748760,5.5997371322141607,-8.2097636999765520,6.5091254894472037,-2.6830293734323729,0.5612891113138221,-0.0463331622196545]
*/
func beta5(ez float64) float64 {
	lnz := math.Log(ez + 1)
	return -0.7518999460733967*ez +
		-0.9590030077748760*lnz +
		5.5997371322141607*math.Pow(lnz, 2) +
		-8.2097636999765520*math.Pow(lnz, 3) +
		6.5091254894472037*math.Pow(lnz, 4) +
		-2.6830293734323729*math.Pow(lnz, 5) +
		0.5612891113138221*math.Pow(lnz, 6) +
		-0.0463331622196545*math.Pow(lnz, 7)
}
/*
beta6 returns the bias-correction term for precision p=6.

It evaluates c0*ez + c1*l + c2*l^2 + ... + c7*l^7 with l = ln(ez + 1) and the
coefficients c0..c7:
[29.8257900969619634,-31.3287083337725925,-10.5942523036582283,-11.5720125689099618,3.8188754373907492,-2.4160130328530811,0.4542208940970826,-0.0575155452020420]
*/
func beta6(ez float64) float64 {
	lnz := math.Log(ez + 1)
	return 29.8257900969619634*ez +
		-31.3287083337725925*lnz +
		-10.5942523036582283*math.Pow(lnz, 2) +
		-11.5720125689099618*math.Pow(lnz, 3) +
		3.8188754373907492*math.Pow(lnz, 4) +
		-2.4160130328530811*math.Pow(lnz, 5) +
		0.4542208940970826*math.Pow(lnz, 6) +
		-0.0575155452020420*math.Pow(lnz, 7)
}
/*
beta7 returns the bias-correction term for precision p=7.

It evaluates c0*ez + c1*l + c2*l^2 + ... + c7*l^7 with l = ln(ez + 1) and the
coefficients c0..c7:
[2.8102921290820060,-3.9780498518175995,1.3162680041351582,-3.9252486335805901,2.0080835753946471,-0.7527151937556955,0.1265569894242751,-0.0109946438726240]
*/
func beta7(ez float64) float64 {
	lnz := math.Log(ez + 1)
	return 2.8102921290820060*ez +
		-3.9780498518175995*lnz +
		1.3162680041351582*math.Pow(lnz, 2) +
		-3.9252486335805901*math.Pow(lnz, 3) +
		2.0080835753946471*math.Pow(lnz, 4) +
		-0.7527151937556955*math.Pow(lnz, 5) +
		0.1265569894242751*math.Pow(lnz, 6) +
		-0.0109946438726240*math.Pow(lnz, 7)
}
/*
beta8 returns the bias-correction term for precision p=8.

It evaluates c0*ez + c1*l + c2*l^2 + ... + c7*l^7 with l = ln(ez + 1) and the
coefficients c0..c7:
[1.00633544887550519,-2.00580666405112407,1.64369749366514117,-2.70560809940566172,1.39209980244222598,-0.46470374272183190,0.07384282377269775,-0.00578554885254223]
*/
func beta8(ez float64) float64 {
	lnz := math.Log(ez + 1)
	return 1.00633544887550519*ez +
		-2.00580666405112407*lnz +
		1.64369749366514117*math.Pow(lnz, 2) +
		-2.70560809940566172*math.Pow(lnz, 3) +
		1.39209980244222598*math.Pow(lnz, 4) +
		-0.46470374272183190*math.Pow(lnz, 5) +
		0.07384282377269775*math.Pow(lnz, 6) +
		-0.00578554885254223*math.Pow(lnz, 7)
}
/*
beta9 returns the bias-correction term for precision p=9.

It evaluates c0*ez + c1*l + c2*l^2 + ... + c7*l^7 with l = ln(ez + 1) and the
coefficients c0..c7:
[-0.09415657458167959,-0.78130975924550528,1.71514946750712460,-1.73711250406516338,0.86441508489048924,-0.23819027465047218,0.03343448400269076,-0.00207858528178157]
*/
func beta9(ez float64) float64 {
	lnz := math.Log(ez + 1)
	return -0.09415657458167959*ez +
		-0.78130975924550528*lnz +
		1.71514946750712460*math.Pow(lnz, 2) +
		-1.73711250406516338*math.Pow(lnz, 3) +
		0.86441508489048924*math.Pow(lnz, 4) +
		-0.23819027465047218*math.Pow(lnz, 5) +
		0.03343448400269076*math.Pow(lnz, 6) +
		-0.00207858528178157*math.Pow(lnz, 7)
}
/*
beta10 returns the bias-correction term for precision p=10.

It evaluates c0*ez + c1*l + c2*l^2 + ... + c7*l^7 with l = ln(ez + 1) and the
coefficients c0..c7:
[-0.25935400670790054,-0.52598301999805808,1.48933034925876839,-1.29642714084993571,0.62284756217221615,-0.15672326770251041,0.02054415903878563,-0.00112488483925502]
*/
func beta10(ez float64) float64 {
	lnz := math.Log(ez + 1)
	return -0.25935400670790054*ez +
		-0.52598301999805808*lnz +
		1.48933034925876839*math.Pow(lnz, 2) +
		-1.29642714084993571*math.Pow(lnz, 3) +
		0.62284756217221615*math.Pow(lnz, 4) +
		-0.15672326770251041*math.Pow(lnz, 5) +
		0.02054415903878563*math.Pow(lnz, 6) +
		-0.00112488483925502*math.Pow(lnz, 7)
}
/*
beta11 returns the bias-correction term for precision p=11.

It evaluates c0*ez + c1*l + c2*l^2 + ... + c7*l^7 with l = ln(ez + 1) and the
coefficients c0..c7:
[-4.32325553856025e-01,-1.08450736399632e-01,6.09156550741120e-01,-1.65687801845180e-02,-7.95829341087617e-02,4.71830602102918e-02,-7.81372902346934e-03,5.84268708489995e-04]
*/
func beta11(ez float64) float64 {
	lnz := math.Log(ez + 1)
	return -0.432325553856025*ez +
		-0.108450736399632*lnz +
		0.609156550741120*math.Pow(lnz, 2) +
		-0.0165687801845180*math.Pow(lnz, 3) +
		-0.0795829341087617*math.Pow(lnz, 4) +
		0.0471830602102918*math.Pow(lnz, 5) +
		-0.00781372902346934*math.Pow(lnz, 6) +
		0.000584268708489995*math.Pow(lnz, 7)
}
/*
beta12 returns the bias-correction term for precision p=12.

It evaluates c0*ez + c1*l + c2*l^2 + ... + c7*l^7 with l = ln(ez + 1) and the
coefficients c0..c7:
[-3.84979202588598e-01,1.83162233114364e-01,1.30396688841854e-01,7.04838927629266e-02,-8.95893971464453e-03,1.13010036741605e-02,-1.94285569591290e-03,2.25435774024964e-04]

NOTE(review): the l^4 coefficient used in the code (-0.0089589397146453)
differs by one digit from the list above (-8.95893971464453e-03); confirm
which value is intended against the upstream source.
*/
func beta12(ez float64) float64 {
	lnz := math.Log(ez + 1)
	return -0.384979202588598*ez +
		0.183162233114364*lnz +
		0.130396688841854*math.Pow(lnz, 2) +
		0.0704838927629266*math.Pow(lnz, 3) +
		-0.0089589397146453*math.Pow(lnz, 4) +
		0.0113010036741605*math.Pow(lnz, 5) +
		-0.00194285569591290*math.Pow(lnz, 6) +
		0.000225435774024964*math.Pow(lnz, 7)
}
/*
beta13 returns the bias-correction term for precision p=13.

It evaluates c0*ez + c1*l + c2*l^2 + ... + c7*l^7 with l = ln(ez + 1) and the
coefficients c0..c7:
[-0.41655270946462997,-0.22146677040685156,0.38862131236999947,0.45340979746062371,-0.36264738324476375,0.12304650053558529,-0.01701540384555510,0.00102750367080838]
*/
func beta13(ez float64) float64 {
	lnz := math.Log(ez + 1)
	return -0.41655270946462997*ez +
		-0.22146677040685156*lnz +
		0.38862131236999947*math.Pow(lnz, 2) +
		0.45340979746062371*math.Pow(lnz, 3) +
		-0.36264738324476375*math.Pow(lnz, 4) +
		0.12304650053558529*math.Pow(lnz, 5) +
		-0.01701540384555510*math.Pow(lnz, 6) +
		0.00102750367080838*math.Pow(lnz, 7)
}
/*
beta14 returns the bias-correction term for precision p=14.

It evaluates c0*ez + c1*l + c2*l^2 + ... + c7*l^7 with l = ln(ez + 1) and the
coefficients c0..c7:
[-3.71009760230692e-01,9.78811941207509e-03,1.85796293324165e-01,2.03015527328432e-01,-1.16710521803686e-01,4.31106699492820e-02,-5.99583540511831e-03,4.49704299509437e-04]
*/
func beta14(ez float64) float64 {
	lnz := math.Log(ez + 1)
	return -0.371009760230692*ez +
		0.00978811941207509*lnz +
		0.185796293324165*math.Pow(lnz, 2) +
		0.203015527328432*math.Pow(lnz, 3) +
		-0.116710521803686*math.Pow(lnz, 4) +
		0.0431106699492820*math.Pow(lnz, 5) +
		-0.00599583540511831*math.Pow(lnz, 6) +
		0.000449704299509437*math.Pow(lnz, 7)
}
/*
beta15 returns the bias-correction term for precision p=15.

It evaluates c0*ez + c1*l + c2*l^2 + ... + c7*l^7 with l = ln(ez + 1) and the
coefficients c0..c7:
[-0.38215145543875273,-0.89069400536090837,0.37602335774678869,0.99335977440682377,-0.65577441638318956,0.18332342129703610,-0.02241529633062872,0.00121399789330194]
*/
func beta15(ez float64) float64 {
	lnz := math.Log(ez + 1)
	return -0.38215145543875273*ez +
		-0.89069400536090837*lnz +
		0.37602335774678869*math.Pow(lnz, 2) +
		0.99335977440682377*math.Pow(lnz, 3) +
		-0.65577441638318956*math.Pow(lnz, 4) +
		0.18332342129703610*math.Pow(lnz, 5) +
		-0.02241529633062872*math.Pow(lnz, 6) +
		0.00121399789330194*math.Pow(lnz, 7)
}
/*
beta16 returns the bias-correction term for precision p=16.

It evaluates c0*ez + c1*l + c2*l^2 + ... + c7*l^7 with l = ln(ez + 1) and the
coefficients c0..c7:
[-0.37331876643753059,-1.41704077448122989,0.40729184796612533,1.56152033906584164,-0.99242233534286128,0.26064681399483092,-0.03053811369682807,0.00155770210179105]
*/
func beta16(ez float64) float64 {
	lnz := math.Log(ez + 1)
	return -0.37331876643753059*ez +
		-1.41704077448122989*lnz +
		0.40729184796612533*math.Pow(lnz, 2) +
		1.56152033906584164*math.Pow(lnz, 3) +
		-0.99242233534286128*math.Pow(lnz, 4) +
		0.26064681399483092*math.Pow(lnz, 5) +
		-0.03053811369682807*math.Pow(lnz, 6) +
		0.00155770210179105*math.Pow(lnz, 7)
}
/*
beta17 returns the bias-correction term for precision p=17.

It evaluates c0*ez + c1*l + c2*l^2 + ... + c7*l^7 with l = ln(ez + 1) and the
coefficients c0..c7:
[-0.36775502299404605,0.53831422351377967,0.76970289278767923,0.55002583586450560,-0.74575588261146941,0.25711835785821952,-0.03437902606864149,0.00185949146371616]
*/
func beta17(ez float64) float64 {
	lnz := math.Log(ez + 1)
	return -0.36775502299404605*ez +
		0.53831422351377967*lnz +
		0.76970289278767923*math.Pow(lnz, 2) +
		0.55002583586450560*math.Pow(lnz, 3) +
		-0.74575588261146941*math.Pow(lnz, 4) +
		0.25711835785821952*math.Pow(lnz, 5) +
		-0.03437902606864149*math.Pow(lnz, 6) +
		0.00185949146371616*math.Pow(lnz, 7)
}
/*
beta18 returns the bias-correction term for precision p=18.

It evaluates c0*ez + c1*l + c2*l^2 + ... + c7*l^7 with l = ln(ez + 1) and the
coefficients c0..c7:
[-0.36479623325960542,0.99730412328635032,1.55354386230081221,1.25932677198028919,-1.53325948209110163,0.47801042200056593,-0.05951025172951174,0.00291076804642205]
*/
func beta18(ez float64) float64 {
	lnz := math.Log(ez + 1)
	return -0.36479623325960542*ez +
		0.99730412328635032*lnz +
		1.55354386230081221*math.Pow(lnz, 2) +
		1.25932677198028919*math.Pow(lnz, 3) +
		-1.53325948209110163*math.Pow(lnz, 4) +
		0.47801042200056593*math.Pow(lnz, 5) +
		-0.05951025172951174*math.Pow(lnz, 6) +
		0.00291076804642205*math.Pow(lnz, 7)
}

View File

@@ -1,176 +0,0 @@
package hyperloglog
import (
"encoding/binary"
"slices"
)
// Original author of this file is github.com/clarkduvall/hyperloglog
// iterator walks the values stored in a compressedList, decoding one
// delta-encoded entry at a time.
type iterator struct {
	i    int             // byte offset in v.b of the next entry to decode
	last uint32          // previously decoded value; added to the next delta
	v    *compressedList // list being traversed
}

// Next decodes the entry at the current position, moves past it and returns
// the decoded value.
func (iter *iterator) Next() uint32 {
	value, next := iter.Peek()
	iter.Advance(value, next)
	return value
}

// Peek decodes the entry at the current position without moving the iterator.
// It returns the decoded value and the offset just past that entry.
func (iter *iterator) Peek() (uint32, int) {
	value, next := iter.v.decode(iter.i, iter.last)
	return value, next
}

// Advance repositions the iterator, typically with the pair returned by Peek.
func (iter *iterator) Advance(last uint32, i int) {
	iter.last, iter.i = last, i
}

// HasNext reports whether there are undecoded bytes left in the list.
func (iter iterator) HasNext() bool {
	return iter.v.Len() > iter.i
}
// compressedList stores a sequence of uint32 values as variable-length
// encoded deltas between consecutive values.
type compressedList struct {
	count uint32             // number of values appended
	last  uint32             // most recently appended value; base for the next delta
	b     variableLengthList // encoded deltas
}

// Clone returns a deep copy of v, or nil when v is nil.
func (v *compressedList) Clone() *compressedList {
	if v == nil {
		return nil
	}
	dup := &compressedList{count: v.count, last: v.last}
	dup.b = make(variableLengthList, len(v.b))
	copy(dup.b, v.b)
	return dup
}

// reset empties the list while keeping the backing storage of v.b for reuse.
// It is a no-op on a nil receiver.
func (v *compressedList) reset() {
	if v == nil {
		return
	}
	v.count, v.last = 0, 0
	v.b = v.b[:0]
}

// AppendBinary appends the serialized form of v to data and returns the
// extended slice. The layout is: count (4 bytes, big endian), last (4 bytes,
// big endian), followed by the serialized variableLengthList.
func (v *compressedList) AppendBinary(data []byte) ([]byte, error) {
	// Reserve room for the two fixed-size fields up front.
	data = slices.Grow(data, 4+4)
	data = binary.BigEndian.AppendUint32(data, v.count)
	data = binary.BigEndian.AppendUint32(data, v.last)
	return v.b.AppendBinary(data)
}

// UnmarshalBinary restores v from the layout produced by AppendBinary.
//
// It returns ErrorTooShort when data does not hold the 12-byte header or holds
// fewer payload bytes than the header announces. The input is validated in
// full before anything is allocated or stored, so a corrupt length field
// cannot trigger a large allocation and a failed call leaves v untouched.
// Bytes following the announced payload are ignored.
func (v *compressedList) UnmarshalBinary(data []byte) error {
	if len(data) < 12 {
		return ErrorTooShort
	}
	count := binary.BigEndian.Uint32(data[0:4])
	last := binary.BigEndian.Uint32(data[4:8])
	sz := binary.BigEndian.Uint32(data[8:12])
	payload := data[12:]

	// Compare in 64 bits so that neither side can be truncated.
	if uint64(len(payload)) < uint64(sz) {
		return ErrorTooShort
	}

	v.count = count
	v.last = last
	v.b = make(variableLengthList, sz)
	copy(v.b, payload)
	return nil
}

// newCompressedList returns an empty list whose byte buffer has the given
// initial capacity.
func newCompressedList(capacity int) *compressedList {
	return &compressedList{b: make(variableLengthList, 0, capacity)}
}

// Len returns the size of the encoded buffer in bytes (not the number of
// values; that is tracked in count).
func (v *compressedList) Len() int {
	return len(v.b)
}

// decode reads the delta stored at byte offset i and returns it added to
// last, together with the offset of the next entry.
func (v *compressedList) decode(i int, last uint32) (uint32, int) {
	delta, next := v.b.decode(i)
	return delta + last, next
}

// Append adds x to the list, storing it as the difference from the previously
// appended value.
// NOTE(review): the subtraction wraps if x is smaller than the previous value;
// presumably callers append in ascending order — confirm.
func (v *compressedList) Append(x uint32) {
	v.count++
	v.b = v.b.Append(x - v.last)
	v.last = x
}

// Iter returns an iterator positioned at the start of the list.
func (v *compressedList) Iter() iterator {
	return iterator{i: 0, last: 0, v: v}
}
// variableLengthList is a byte buffer holding uint32 values encoded as 7-bit
// groups, least significant group first. The high bit of a byte is set when
// another group of the same value follows.
type variableLengthList []uint8

// AppendBinary appends the serialized list to data: a 4-byte big-endian
// length followed by the raw encoded bytes. The returned error is always nil.
func (v variableLengthList) AppendBinary(data []byte) ([]byte, error) {
	size := len(v)

	// Make room for the length prefix and the payload in one step.
	data = slices.Grow(data, 4+size)

	// Only the low 32 bits of the length are written.
	prefix := [4]byte{byte(size >> 24), byte(size >> 16), byte(size >> 8), byte(size)}
	data = append(data, prefix[:]...)

	return append(data, v...), nil
}

// decode reads the value that starts at byte offset i and returns it together
// with the offset of the byte following it. It panics if the encoded value
// runs past the end of the list.
func (v variableLengthList) decode(i int) (uint32, int) {
	var (
		value uint32
		shift uint
	)
	pos := i
	for v[pos]&0x80 != 0 {
		value |= uint32(v[pos]&0x7f) << shift
		shift += 7
		pos++
	}
	// Final group: continuation bit is clear.
	value |= uint32(v[pos]) << shift
	return value, pos + 1
}

// Append encodes x at the end of the list and returns the extended list.
func (v variableLengthList) Append(x uint32) variableLengthList {
	for ; x >= 0x80; x >>= 7 {
		v = append(v, uint8(x&0x7f)|0x80)
	}
	return append(v, uint8(x&0x7f))
}

View File

@@ -1,439 +0,0 @@
package hyperloglog
import (
"encoding/binary"
"errors"
"fmt"
"math"
"slices"
"sync"
)
const (
// pp is the precision passed to encodeHash/decodeHash for hashes kept in the
// sparse representation.
pp = uint8(25)
// mp is 1 << pp; Estimate passes it to linearCount while the sketch is sparse.
mp = uint32(1) << pp
// NOTE(review): presumably the version of the binary serialization format —
// it is not referenced in the visible part of the file; confirm.
version = 2
)
// Sketch is a HyperLogLog sketch for estimating the number of distinct
// elements. It is in sparse mode while sparseList is non-nil (see sparse) and
// in dense mode, backed by regs, otherwise.
type Sketch struct {
// p is the precision the sketch was created with.
p uint8
// m is the number of registers, 1 << p.
m uint32
// alpha is alpha(float64(m)), computed once in NewSketch and used by
// Estimate in dense mode.
alpha float64
// tmpSet receives encoded hashes inserted while in sparse mode (see
// InsertHash); toNormal replaces it with nilSet.
tmpSet set
// sparseList is the compressed sparse representation; nil in dense mode.
sparseList *compressedList
// regs holds the dense registers; it is allocated by NewSketch for
// non-sparse sketches and by toNormal on conversion.
regs []uint8
}
// New returns a HyperLogLog Sketch with 2^14 registers (precision 14).
// It is equivalent to New14.
func New() *Sketch {
	return New14()
}

// New14 returns a sparse-capable HyperLogLog Sketch with 2^14 registers
// (precision 14).
func New14() *Sketch {
	return newSketchNoError(14, true)
}

// New16 returns a sparse-capable HyperLogLog Sketch with 2^16 registers
// (precision 16).
func New16() *Sketch {
	return newSketchNoError(16, true)
}

// NewNoSparse returns a HyperLogLog Sketch with 2^14 registers (precision 14)
// that never uses the sparse representation.
func NewNoSparse() *Sketch {
	return newSketchNoError(14, false)
}

// New16NoSparse returns a HyperLogLog Sketch with 2^16 registers (precision
// 16) that never uses the sparse representation.
func New16NoSparse() *Sketch {
	return newSketchNoError(16, false)
}

// newSketchNoError builds a sketch and discards the error from NewSketch.
// The constructors above only pass precisions 14 and 16, which NewSketch
// accepts, so the error is always nil for them.
func newSketchNoError(precision uint8, sparse bool) *Sketch {
	sketch, _ := NewSketch(precision, sparse)
	return sketch
}
// NewSketch returns a HyperLogLog Sketch with 2^precision registers.
//
// precision must be in the range [4, 18]; otherwise an error is returned.
// When sparse is true the sketch starts in the sparse representation,
// otherwise the dense registers are allocated immediately.
func NewSketch(precision uint8, sparse bool) (*Sketch, error) {
	if precision < 4 || precision > 18 {
		return nil, fmt.Errorf("p has to be >= 4 and <= 18")
	}

	registers := uint32(1) << precision
	sketch := &Sketch{
		m:     registers,
		p:     precision,
		alpha: alpha(float64(registers)),
	}

	if !sparse {
		sketch.regs = make([]uint8, registers)
		return sketch, nil
	}

	sketch.tmpSet = makeSet(0)
	sketch.sparseList = getCompressedList(0)
	return sketch, nil
}
// sparse reports whether the sketch is currently using the sparse
// representation, i.e. whether sparseList is set.
func (sk *Sketch) sparse() bool {
	return sk.sparseList != nil
}
// Clone returns a deep copy of sk: the registers, the temporary set and the
// sparse list are all duplicated so the copy shares no storage with sk.
func (sk *Sketch) Clone() *Sketch {
	dup := new(Sketch)
	*dup = *sk
	dup.regs = append([]uint8(nil), sk.regs...)
	dup.tmpSet = sk.tmpSet.Clone()
	dup.sparseList = sk.sparseList.Clone()
	return dup
}
// Reset empties the sketch in place. A dense sketch has its registers zeroed;
// a sparse sketch has its temporary set and sparse list reset. The current
// representation (sparse or dense) is kept.
func (sk *Sketch) Reset() {
	if !sk.sparse() {
		clear(sk.regs)
		return
	}
	sk.tmpSet.reset()
	sk.sparseList.reset()
}
// maybeToNormal flushes the temporary set into the sparse list once the set
// holds more than m/100 entries, and then converts the sketch to the dense
// representation if the sparse list has outgrown its size limit.
func (sk *Sketch) maybeToNormal() {
	if uint32(sk.tmpSet.Len())*100 <= sk.m {
		return
	}
	sk.mergeSparse()

	// The limit is m bytes, reduced by 10% for larger sketches.
	// NOTE(review): 8096 looks like it may have been meant as 8192 — confirm.
	limit := sk.m
	if limit > 8096 {
		limit -= limit / 10
	}
	if uint32(sk.sparseList.Len()) > limit {
		sk.toNormal()
	}
}
// Merge folds other into sk. A nil other is a no-op. It returns an error when
// the two sketches were created with different precisions. If both sketches
// are sparse the merge stays sparse; otherwise it goes through the dense path.
func (sk *Sketch) Merge(other *Sketch) error {
	switch {
	case other == nil:
		return nil
	case sk.p != other.p:
		return errors.New("precisions must be equal")
	case sk.sparse() && other.sparse():
		sk.mergeSparseSketch(other)
	default:
		sk.mergeDenseSketch(other)
	}
	return nil
}
// mergeSparseSketch merges a sparse other into a sparse sk by adding every
// encoded hash of other (temporary set and sparse list) to sk's temporary
// set, then checks whether sk should be flushed or converted to dense.
func (sk *Sketch) mergeSparseSketch(other *Sketch) {
	sk.tmpSet.Merge(other.tmpSet)

	it := other.sparseList.Iter()
	for it.HasNext() {
		sk.tmpSet.add(it.Next())
	}

	sk.maybeToNormal()
}
// mergeDenseSketch merges other into sk using sk's dense registers,
// converting sk to dense first if needed. A dense other is merged register by
// register (keeping the larger value); a sparse other has each of its encoded
// hashes decoded and inserted.
func (sk *Sketch) mergeDenseSketch(other *Sketch) {
	if sk.sparse() {
		sk.toNormal()
	}

	if !other.sparse() {
		for idx, reg := range other.regs {
			if reg > sk.regs[idx] {
				sk.regs[idx] = reg
			}
		}
		return
	}

	absorb := func(encoded uint32) {
		idx, rank := decodeHash(encoded, other.p, pp)
		sk.insert(idx, rank)
	}
	other.tmpSet.ForEach(absorb)
	for it := other.sparseList.Iter(); it.HasNext(); {
		absorb(it.Next())
	}
}
// toNormal converts a sparse sketch to the dense representation: pending
// entries are flushed into the sparse list, the registers are allocated and
// filled from the list, and the sparse structures are released (the list is
// handed back to its pool).
func (sk *Sketch) toNormal() {
	if sk.tmpSet.Len() > 0 {
		sk.mergeSparse()
	}

	sk.regs = make([]uint8, sk.m)
	it := sk.sparseList.Iter()
	for it.HasNext() {
		idx, rank := decodeHash(it.Next(), sk.p, pp)
		sk.insert(idx, rank)
	}

	sk.tmpSet = nilSet
	putCompressedList(sk.sparseList)
	sk.sparseList = nil
}
// insert raises dense register i to r if r is larger than its current value.
func (sk *Sketch) insert(i uint32, r uint8) {
	if current := sk.regs[i]; r > current {
		sk.regs[i] = r
	} else {
		sk.regs[i] = current
	}
}
// Insert adds the element e to the sketch by hashing it and inserting the
// resulting hash.
func (sk *Sketch) Insert(e []byte) {
	digest := hash(e)
	sk.InsertHash(digest)
}
// InsertHash adds the already-hashed value x to sk. A dense sketch updates
// the matching register directly. A sparse sketch stores the encoded hash in
// the temporary set and, when the value was new, checks whether the sketch
// must be flushed or converted.
func (sk *Sketch) InsertHash(x uint64) {
	if !sk.sparse() {
		idx, rho := getPosVal(x, sk.p)
		sk.insert(uint32(idx), rho)
		return
	}
	if added := sk.tmpSet.add(encodeHash(x, sk.p, pp)); added {
		sk.maybeToNormal()
	}
}
// Estimate returns the estimated number of distinct values inserted into sk.
// A sparse sketch is flushed first (so this call may modify sk) and is then
// estimated by linear counting over mp slots. A dense sketch uses the
// register sum and zero count with the alpha and beta corrections, rounded
// to the nearest integer.
func (sk *Sketch) Estimate() uint64 {
	if !sk.sparse() {
		sum, zeros := sumAndZeros(sk.regs)
		m := float64(sk.m)
		est := sk.alpha * m * (m - zeros) / (sum + beta(sk.p, zeros))
		return uint64(est + 0.5)
	}

	sk.mergeSparse()
	return uint64(linearCount(mp, mp-sk.sparseList.count))
}
// compressedListPools holds the eight pools that getCompressedList and
// putCompressedList use to recycle compressed lists by capacity bucket.
var compressedListPools = newCompressedListPools()

// newCompressedListPools returns an array of eight distinct, empty pools.
func newCompressedListPools() [8]*sync.Pool {
	var pools [8]*sync.Pool
	for i := range pools {
		pools[i] = new(sync.Pool)
	}
	return pools
}
// getCompressedList returns a compressed list taken from the pool whose
// bucket matches requestedCapacity, or a freshly allocated one when that
// pool is empty. Buckets are bounded by 256, 512, 1024, 2048, 4096, 8196 and
// 16384; larger requests share the last pool. The bucket limits must stay
// identical to the ones used by putCompressedList.
func getCompressedList(requestedCapacity int) *compressedList {
	var idx, capacity int
	switch {
	case requestedCapacity < 256:
		idx, capacity = 0, 256
	case requestedCapacity < 512:
		idx, capacity = 1, 512
	case requestedCapacity < 1024:
		idx, capacity = 2, 1024
	case requestedCapacity < 2048:
		idx, capacity = 3, 2048
	case requestedCapacity < 4096:
		idx, capacity = 4, 4096
	case requestedCapacity < 8196:
		idx, capacity = 5, 8196
	case requestedCapacity < 16384:
		idx, capacity = 6, 16384
	default:
		idx, capacity = 7, requestedCapacity
	}

	pooled := compressedListPools[idx].Get()
	if pooled == nil {
		return newCompressedList(capacity - 1)
	}
	list := pooled.(*compressedList)
	// Make sure the recycled buffer has room for capacity-1 more bytes.
	list.b = slices.Grow(list.b, capacity-1)
	return list
}
// putCompressedList resets c and returns it to the pool selected by the
// capacity of its buffer. The bucket limits must stay identical to the ones
// used by getCompressedList.
func putCompressedList(c *compressedList) {
	c.reset()

	idx := 7
	switch capacity := cap(c.b); {
	case capacity < 256:
		idx = 0
	case capacity < 512:
		idx = 1
	case capacity < 1024:
		idx = 2
	case capacity < 2048:
		idx = 3
	case capacity < 4096:
		idx = 4
	case capacity < 8196:
		idx = 5
	case capacity < 16384:
		idx = 6
	}
	compressedListPools[idx].Put(c)
}
// mergeSparse flushes the temporary set into the sparse list. The set's keys
// are sorted and merged with the existing list into a new list (a key found
// in both is written once); the old list goes back to its pool and the
// temporary set is cleared. It is a no-op when the temporary set is empty.
func (sk *Sketch) mergeSparse() {
	pending := sk.tmpSet.Len()
	if pending == 0 {
		return
	}

	sorted := make([]uint32, 0, pending)
	sk.tmpSet.ForEach(func(k uint32) {
		sorted = append(sorted, k)
	})
	slices.Sort(sorted)

	merged := getCompressedList(4*pending + sk.sparseList.Len())
	it := sk.sparseList.Iter()
	pos := 0
	for it.HasNext() || pos < len(sorted) {
		switch {
		case !it.HasNext():
			// Only set keys remain.
			merged.Append(sorted[pos])
			pos++
		case pos >= len(sorted):
			// Only list entries remain.
			merged.Append(it.Next())
		default:
			fromList, adv := it.Peek()
			fromSet := sorted[pos]
			switch {
			case fromList == fromSet:
				merged.Append(fromList)
				it.Advance(fromList, adv)
				pos++
			case fromList > fromSet:
				merged.Append(fromSet)
				pos++
			default:
				merged.Append(fromList)
				it.Advance(fromList, adv)
			}
		}
	}

	putCompressedList(sk.sparseList)
	sk.sparseList = merged
	sk.tmpSet.m.Clear()
}
// MarshalBinary implements the encoding.BinaryMarshaler interface by
// encoding sk into a newly allocated buffer.
//
// Prefer AppendBinary when the result is going to be appended to an existing
// buffer: it avoids the extra allocation and copy.
func (sk *Sketch) MarshalBinary() (data []byte, err error) {
	data, err = sk.AppendBinary(nil)
	return data, err
}
// AppendBinary implements the encoding.BinaryAppender interface.
//
// Layout: version, p, a zero byte (b), a sparse flag, then either the
// temporary set followed by the sparse list (sparse flag 1) or a 32-bit
// big-endian register count followed by the registers (sparse flag 0).
// On error a nil slice is returned.
func (sk *Sketch) AppendBinary(data []byte) ([]byte, error) {
	data = slices.Grow(data, 8+len(sk.regs))

	// Header: version marker, precision and b.
	data = append(data, version, sk.p, 0)

	if !sk.sparse() {
		// Dense sketch: flag, register count, raw registers.
		data = append(data, byte(0))
		data = binary.BigEndian.AppendUint32(data, uint32(len(sk.regs)))
		return append(data, sk.regs...), nil
	}

	// Sparse sketch: flag, temporary set, sparse list.
	data = append(data, byte(1))
	withSet, err := sk.tmpSet.AppendBinary(data)
	if err != nil {
		return nil, err
	}
	return sk.sparseList.AppendBinary(withSet)
}
// ErrorTooShort is returned by UnmarshalBinary when the input is too short
// to be decoded.
var ErrorTooShort = errors.New("too short binary")
// UnmarshalBinary implements the encoding.BinaryUnmarshaler interface.
//
// data starts with an 8-byte header: version, precision p, the legacy b
// offset, the sparse flag, and a 32-bit big-endian length. sk is replaced by
// a fresh sketch when its precision differs from p or when it already holds
// data; otherwise it is filled in place.
//
// ErrorTooShort is returned when data is shorter than the header or, for a
// sparse sketch, shorter than the temporary set announced by the header. The
// length is validated before sk is modified, so a truncated sparse payload
// no longer panics or leaves sk half-initialised.
func (sk *Sketch) UnmarshalBinary(data []byte) error {
	if len(data) < 8 {
		return ErrorTooShort
	}

	// Version: selects the dense register encoding below.
	v := data[0]
	p := data[1]
	b := data[2]
	sparse := data[3] == byte(1)

	// For a sparse sketch, bytes 4..8 hold the number of 4-byte temporary-set
	// entries. Compute the end offset in 64 bits so a hostile count cannot
	// wrap around, and reject input that does not contain all entries.
	var tmpSetLen uint32
	tmpSetEnd := 8
	if sparse {
		tmpSetLen = binary.BigEndian.Uint32(data[4:8])
		end := 8 + 4*uint64(tmpSetLen)
		if end > uint64(len(data)) {
			return ErrorTooShort
		}
		tmpSetEnd = int(end)
	}

	// Start from a fresh sketch if the precision doesn't match, if sk was
	// already used, or if a sparse payload has no sparse list to decode into.
	needFresh := sk.p != p ||
		sk.regs != nil ||
		sk.tmpSet.Len() > 0 ||
		(sk.sparseList != nil && sk.sparseList.Len() > 0) ||
		(sparse && sk.sparseList == nil)
	if needFresh {
		fresh, err := NewSketch(p, sparse)
		if err != nil {
			return err
		}
		*sk = *fresh
	}

	if sparse {
		// Temporary set: tmpSetLen big-endian uint32 keys.
		sk.tmpSet = makeSet(int(tmpSetLen))
		for i := 8; i < tmpSetEnd; i += 4 {
			sk.tmpSet.add(binary.BigEndian.Uint32(data[i : i+4]))
		}
		// The sparse list occupies the rest of the input.
		return sk.sparseList.UnmarshalBinary(data[tmpSetEnd:])
	}

	// Dense sketch.
	sk.sparseList = nil
	sk.tmpSet = nilSet
	if v == 1 {
		return sk.unmarshalBinaryV1(data[8:], b)
	}
	return sk.unmarshalBinaryV2(data)
}
// sumAndZeros returns, for the given registers, the sum of 2^-v over every
// register value v (res) and the number of registers equal to zero (ez).
func sumAndZeros(regs []uint8) (res, ez float64) {
	for i := range regs {
		rho := regs[i]
		if rho == 0 {
			ez++
		}
		// 2^-rho is exactly representable for every uint8 rho.
		res += math.Ldexp(1, -int(rho))
	}
	return res, ez
}
// unmarshalBinaryV1 decodes version-1 dense registers: every input byte
// packs two 4-bit values (high nibble first), and b is added to each value.
// It always returns nil.
func (sk *Sketch) unmarshalBinaryV1(data []byte, b uint8) error {
	regs := make([]uint8, 2*len(data))
	for i, packed := range data {
		regs[2*i] = packed>>4 + b
		regs[2*i+1] = packed&0x0f + b
	}
	sk.regs = regs
	return nil
}
// unmarshalBinaryV2 decodes version-2 dense registers, which are stored one
// byte per register after the 8-byte header. It always returns nil.
//
// The registers are copied out of data: encoding.BinaryUnmarshaler requires
// an implementation to copy anything it retains, and aliasing the caller's
// buffer would let later writes to that buffer (or to the sketch) corrupt
// the other.
func (sk *Sketch) unmarshalBinaryV2(data []byte) error {
	// make keeps the slice non-nil even when there are no registers.
	regs := make([]uint8, len(data)-8)
	copy(regs, data[8:])
	sk.regs = regs
	return nil
}

View File

@@ -1,118 +0,0 @@
package hyperloglog
import (
"math/bits"
"slices"
"github.com/kamstrup/intmap"
)
// getIndex extracts the p-bit register index from the sparse-encoded value
// k. When the lowest bit of k is set the index is read starting at bit 32-p,
// otherwise starting at bit pp-p+1.
func getIndex(k uint32, p, pp uint8) uint32 {
	start := pp - p + 1
	if k&1 == 1 {
		start = 32 - p
	}
	return bextr32(k, start, p)
}
// encodeHash encodes the 64-bit hash x for the sparse representation.
//
// The top pp bits of x form the index. When the lowest pp-p bits of that
// index are non-zero, the result is the index shifted left by one (flag bit
// 0). Otherwise the result also carries, in bits 1..6, the number of leading
// zeros of the remaining 64-pp bits plus one, and has flag bit 1 set.
func encodeHash(x uint64, p, pp uint8) uint32 {
	idx := uint32(bextr(x, 64-pp, pp))
	if bextr(x, 64-pp, pp-p) != 0 {
		return idx << 1
	}
	// Pad the low end with ones so the zero count stops at the tail's width.
	tail := bextr(x, 0, 64-pp)<<pp | (1<<pp - 1)
	zeros := bits.LeadingZeros64(tail) + 1
	return idx<<7 | uint32(zeros<<1) | 1
}
// Decode a hash from the sparse representation.
func decodeHash(k uint32, p, pp uint8) (uint32, uint8) {
var r uint8
if k&1 == 1 {
r = uint8(bextr32(k, 1, 6)) + pp - p
} else {
// We can use the 64bit clz implementation and reduce the result
// by 32 to get a clz for a 32bit word.
r = uint8(bits.LeadingZeros64(uint64(k<<(32-pp+p-1))) - 31) // -32 + 1
}
return getIndex(k, p, pp), r
}
// set is a thin wrapper around an intmap.Set of uint32 values. The zero
// value (see nilSet) wraps a nil map.
type set struct {
// m is the underlying integer set; nil for the zero value.
m *intmap.Set[uint32]
}
// reset removes every element from the set. It is a no-op on a set without
// an underlying map.
func (s set) reset() {
	if s.m == nil {
		return
	}
	s.m.Clear()
}
// nilSet is the zero set (nil underlying map); a sketch's tmpSet is set to it
// when the sketch switches to the dense representation.
var nilSet set
// makeSet returns an empty set backed by an intmap.Set created with the
// given initial capacity.
func makeSet(size int) set {
	var s set
	s.m = intmap.NewSet[uint32](size)
	return s
}
// ForEach calls fn once for every element of the set; iteration never stops
// early.
func (s set) ForEach(fn func(v uint32)) {
	visit := func(v uint32) bool {
		fn(v)
		return true
	}
	s.m.ForEach(visit)
}
// Merge adds every element of other to s.
func (s set) Merge(other set) {
	copyIn := func(v uint32) bool {
		s.m.Add(v)
		return true
	}
	other.m.ForEach(copyIn)
}
// Len returns the number of elements in the set.
func (s set) Len() int {
	n := s.m.Len()
	return n
}
// add inserts v into the set and returns the result of the underlying
// intmap Add call.
func (s set) add(v uint32) bool {
	added := s.m.Add(v)
	return added
}
// Clone returns a copy of s backed by its own map. Cloning the zero set
// returns the zero set.
func (s set) Clone() set {
	if s == nilSet {
		return nilSet
	}
	dup := makeSet(s.m.Len())
	s.m.ForEach(func(v uint32) bool {
		dup.m.Add(v)
		return true
	})
	return dup
}
// AppendBinary appends the set to data as a 32-bit big-endian element count
// followed by each element as a 32-bit big-endian value, in the set's
// iteration order. The returned error is always nil.
func (s *set) AppendBinary(data []byte) ([]byte, error) {
	n := s.m.Len()

	// 4 bytes for the count plus 4 bytes per element.
	data = slices.Grow(data, 4+4*n)
	data = append(data, byte(n>>24), byte(n>>16), byte(n>>8), byte(n))

	s.m.ForEach(func(k uint32) bool {
		data = append(data, byte(k>>24), byte(k>>16), byte(k>>8), byte(k))
		return true
	})
	return data, nil
}

View File

@@ -1,46 +0,0 @@
package hyperloglog
import (
"math"
"math/bits"
metro "github.com/dgryski/go-metro"
)
// hash is the function used to turn raw elements into 64-bit hashes; it is
// initialised to hashFunc.
var hash = hashFunc
// alpha returns the bias-correction constant for a sketch with m registers:
// fixed values for m = 16, 32 and 64, and 0.7213/(1+1.079/m) otherwise.
func alpha(m float64) float64 {
	if m == 16 {
		return 0.673
	}
	if m == 32 {
		return 0.697
	}
	if m == 64 {
		return 0.709
	}
	return 0.7213 / (1 + 1.079/m)
}
// getPosVal splits the hash x into a register index (its top p bits) and a
// rank: the number of leading zeros of the remaining bits, plus one. A guard
// bit is set below the remaining bits so the zero count is bounded.
func getPosVal(x uint64, p uint8) (uint64, uint8) {
	idx := bextr(x, 64-p, p)  // {x63,...,x64-p}
	rest := x<<p | 1<<(p-1)   // {x63-p,...,x0}
	return idx, uint8(bits.LeadingZeros64(rest)) + 1
}
// linearCount returns m * ln(m/v), the linear-counting estimate for m slots
// of which v are passed as the second argument.
func linearCount(m uint32, v uint32) float64 {
	total := float64(m)
	ratio := total / float64(v)
	return total * math.Log(ratio)
}
// bextr returns the length-bit field of v that begins at bit start (bit 0
// being the least significant).
func bextr(v uint64, start, length uint8) uint64 {
	mask := uint64(1)<<length - 1
	return v >> start & mask
}
// bextr32 returns the length-bit field of v that begins at bit start (bit 0
// being the least significant).
func bextr32(v uint32, start, length uint8) uint32 {
	mask := uint32(1)<<length - 1
	return v >> start & mask
}
// hashFunc is the default element hash: 64-bit MetroHash of e with the fixed
// seed 1337.
func hashFunc(e []byte) uint64 {
	const seed = 1337
	return metro.Hash64(e, seed)
}

View File

@@ -1,24 +0,0 @@
This package is a mechanical translation of the reference C++ code for
MetroHash, available at https://github.com/jandrewrogers/MetroHash
The MIT License (MIT)
Copyright (c) 2016 Damian Gryski
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -1,6 +0,0 @@
MetroHash
This package is a mechanical translation of the reference C++ code for
MetroHash, available at https://github.com/jandrewrogers/MetroHash
I claim no additional copyright over the original implementation.

View File

@@ -1,94 +0,0 @@
package metro
import "encoding/binary"
// rotate_right rotates the 64-bit value v right by k bits.
func rotate_right(v uint64, k uint) uint64 {
	low := v >> k
	high := v << (64 - k)
	return low | high
}
// Hash128 hashes buffer with the given seed and returns the result as two
// uint64 values (the final v[0] and v[1] lanes).
//
// The input is consumed in decreasing chunk sizes: 32-byte blocks in a loop,
// then at most one chunk each of 16, 8, 4, 2 and 1 bytes, followed by a
// final mixing step. Multi-byte reads are little-endian.
func Hash128(buffer []byte, seed uint64) (uint64, uint64) {
// Multiplicative constants of the 128-bit variant.
const (
k0 = 0xC83A91E1
k1 = 0x8648DBDB
k2 = 0x7BDEC03B
k3 = 0x2F5870A5
)
ptr := buffer
var v [4]uint64
v[0] = (seed - k0) * k3
v[1] = (seed + k1) * k2
// Bulk phase: lanes v[2] and v[3] are only initialised and used when at
// least one 32-byte block is present.
if len(ptr) >= 32 {
v[2] = (seed + k0) * k2
v[3] = (seed - k1) * k3
// Each iteration feeds 8 bytes into each of the four lanes.
for len(ptr) >= 32 {
v[0] += binary.LittleEndian.Uint64(ptr) * k0
ptr = ptr[8:]
v[0] = rotate_right(v[0], 29) + v[2]
v[1] += binary.LittleEndian.Uint64(ptr) * k1
ptr = ptr[8:]
v[1] = rotate_right(v[1], 29) + v[3]
v[2] += binary.LittleEndian.Uint64(ptr) * k2
ptr = ptr[8:]
v[2] = rotate_right(v[2], 29) + v[0]
v[3] += binary.LittleEndian.Uint64(ptr) * k3
ptr = ptr[8:]
v[3] = rotate_right(v[3], 29) + v[1]
}
// Fold the four lanes into each other.
v[2] ^= rotate_right(((v[0]+v[3])*k0)+v[1], 21) * k1
v[3] ^= rotate_right(((v[1]+v[2])*k1)+v[0], 21) * k0
v[0] ^= rotate_right(((v[0]+v[2])*k0)+v[3], 21) * k1
v[1] ^= rotate_right(((v[1]+v[3])*k1)+v[2], 21) * k0
}
// Remaining 16-byte chunk: 8 bytes into v[0], 8 bytes into v[1].
if len(ptr) >= 16 {
v[0] += binary.LittleEndian.Uint64(ptr) * k2
ptr = ptr[8:]
v[0] = rotate_right(v[0], 33) * k3
v[1] += binary.LittleEndian.Uint64(ptr) * k2
ptr = ptr[8:]
v[1] = rotate_right(v[1], 33) * k3
v[0] ^= rotate_right((v[0]*k2)+v[1], 45) * k1
v[1] ^= rotate_right((v[1]*k3)+v[0], 45) * k0
}
// Remaining 8-byte chunk goes into v[0].
if len(ptr) >= 8 {
v[0] += binary.LittleEndian.Uint64(ptr) * k2
ptr = ptr[8:]
v[0] = rotate_right(v[0], 33) * k3
v[0] ^= rotate_right((v[0]*k2)+v[1], 27) * k1
}
// Remaining 4-byte chunk goes into v[1].
if len(ptr) >= 4 {
v[1] += uint64(binary.LittleEndian.Uint32(ptr)) * k2
ptr = ptr[4:]
v[1] = rotate_right(v[1], 33) * k3
v[1] ^= rotate_right((v[1]*k3)+v[0], 46) * k0
}
// Remaining 2-byte chunk goes into v[0].
if len(ptr) >= 2 {
v[0] += uint64(binary.LittleEndian.Uint16(ptr)) * k2
ptr = ptr[2:]
v[0] = rotate_right(v[0], 33) * k3
v[0] ^= rotate_right((v[0]*k2)+v[1], 22) * k1
}
// Last single byte goes into v[1].
if len(ptr) >= 1 {
v[1] += uint64(ptr[0]) * k2
v[1] = rotate_right(v[1], 33) * k3
v[1] ^= rotate_right((v[1]*k3)+v[0], 58) * k0
}
// Finalisation: cross-mix the two output lanes.
v[0] += rotate_right((v[0]*k0)+v[1], 13)
v[1] += rotate_right((v[1]*k1)+v[0], 37)
v[0] += rotate_right((v[0]*k2)+v[1], 13)
v[1] += rotate_right((v[1]*k3)+v[0], 37)
return v[0], v[1]
}

View File

@@ -1,89 +0,0 @@
//go:build noasm || !amd64 || !gc || purego
// +build noasm !amd64 !gc purego
package metro
import (
"encoding/binary"
"math/bits"
)
// Hash64 hashes buffer with the given seed and returns a 64-bit result.
//
// This is the pure-Go implementation, selected by the build constraints at
// the top of the file. The input is consumed in decreasing chunk sizes:
// 32-byte blocks in a loop, then at most one chunk each of 16, 8, 4, 2 and 1
// bytes, followed by a final mixing step. Multi-byte reads are
// little-endian; a rotation by a negative count is a right rotation.
func Hash64(buffer []byte, seed uint64) uint64 {
// Multiplicative constants of the 64-bit variant.
const (
k0 = 0xD6D018F5
k1 = 0xA2AA033B
k2 = 0x62992FC1
k3 = 0x30BC5B29
)
ptr := buffer
hash := (seed + k2) * k0
// Bulk phase: four lanes, each fed 8 bytes per 32-byte block.
if len(ptr) >= 32 {
v0, v1, v2, v3 := hash, hash, hash, hash
for len(ptr) >= 32 {
v0 += binary.LittleEndian.Uint64(ptr[:8]) * k0
v0 = bits.RotateLeft64(v0, -29) + v2
v1 += binary.LittleEndian.Uint64(ptr[8:16]) * k1
v1 = bits.RotateLeft64(v1, -29) + v3
v2 += binary.LittleEndian.Uint64(ptr[16:24]) * k2
v2 = bits.RotateLeft64(v2, -29) + v0
v3 += binary.LittleEndian.Uint64(ptr[24:32]) * k3
v3 = bits.RotateLeft64(v3, -29) + v1
ptr = ptr[32:]
}
// Fold the four lanes into each other, then into the running hash.
v2 ^= bits.RotateLeft64(((v0+v3)*k0)+v1, -37) * k1
v3 ^= bits.RotateLeft64(((v1+v2)*k1)+v0, -37) * k0
v0 ^= bits.RotateLeft64(((v0+v2)*k0)+v3, -37) * k1
v1 ^= bits.RotateLeft64(((v1+v3)*k1)+v2, -37) * k0
hash += v0 ^ v1
}
// Remaining 16-byte chunk, processed as two 8-byte halves.
if len(ptr) >= 16 {
v0 := hash + (binary.LittleEndian.Uint64(ptr[:8]) * k2)
v0 = bits.RotateLeft64(v0, -29) * k3
v1 := hash + (binary.LittleEndian.Uint64(ptr[8:16]) * k2)
v1 = bits.RotateLeft64(v1, -29) * k3
v0 ^= bits.RotateLeft64(v0*k0, -21) + v1
v1 ^= bits.RotateLeft64(v1*k3, -21) + v0
hash += v1
ptr = ptr[16:]
}
// Remaining 8-byte chunk.
if len(ptr) >= 8 {
hash += binary.LittleEndian.Uint64(ptr[:8]) * k3
ptr = ptr[8:]
hash ^= bits.RotateLeft64(hash, -55) * k1
}
// Remaining 4-byte chunk.
if len(ptr) >= 4 {
hash += uint64(binary.LittleEndian.Uint32(ptr[:4])) * k3
hash ^= bits.RotateLeft64(hash, -26) * k1
ptr = ptr[4:]
}
// Remaining 2-byte chunk.
if len(ptr) >= 2 {
hash += uint64(binary.LittleEndian.Uint16(ptr[:2])) * k3
ptr = ptr[2:]
hash ^= bits.RotateLeft64(hash, -48) * k1
}
// Last single byte.
if len(ptr) >= 1 {
hash += uint64(ptr[0]) * k3
hash ^= bits.RotateLeft64(hash, -37) * k1
}
// Finalisation.
hash ^= bits.RotateLeft64(hash, -28)
hash *= k0
hash ^= bits.RotateLeft64(hash, -29)
return hash
}
// Hash64Str hashes the bytes of buffer with the given seed; it converts the
// string to a byte slice and delegates to Hash64.
func Hash64Str(buffer string, seed uint64) uint64 {
	raw := []byte(buffer)
	return Hash64(raw, seed)
}

View File

@@ -1,387 +0,0 @@
// Code generated by command: go run asm.go -out metro_amd64.s -stubs metro_stub.go -pkg metro. DO NOT EDIT.
//go:build amd64 && gc && !purego && !noasm
#include "textflag.h"
// func Hash64(buffer []byte, seed uint64) uint64
TEXT ·Hash64(SB), NOSPLIT, $0-40
MOVQ seed+24(FP), AX
MOVQ buffer_base+0(FP), CX
MOVQ buffer_len+8(FP), DX
MOVQ $0xd6d018f5, BX
IMULQ BX, AX
MOVQ $0x52bc33fedbe4cbb5, BX
ADDQ BX, AX
CMPQ DX, $0x20
JLT after32
MOVQ AX, BX
MOVQ AX, SI
MOVQ AX, DI
MOVQ AX, R8
loop:
MOVQ (CX), R9
MOVQ $0xd6d018f5, R10
IMULQ R10, R9
ADDQ R9, BX
RORQ $0x1d, BX
ADDQ DI, BX
MOVQ 8(CX), R9
MOVQ $0xa2aa033b, R10
IMULQ R10, R9
ADDQ R9, SI
RORQ $0x1d, SI
ADDQ R8, SI
MOVQ 16(CX), R9
MOVQ $0x62992fc1, R10
IMULQ R10, R9
ADDQ R9, DI
RORQ $0x1d, DI
ADDQ BX, DI
MOVQ 24(CX), R9
MOVQ $0x30bc5b29, R10
IMULQ R10, R9
ADDQ R9, R8
RORQ $0x1d, R8
ADDQ SI, R8
ADDQ $0x20, CX
SUBQ $0x20, DX
CMPQ DX, $0x20
JGE loop
MOVQ BX, R9
ADDQ R8, R9
MOVQ $0xd6d018f5, R10
IMULQ R10, R9
ADDQ SI, R9
RORQ $0x25, R9
MOVQ $0xa2aa033b, R10
IMULQ R10, R9
XORQ R9, DI
MOVQ SI, R9
ADDQ DI, R9
MOVQ $0xa2aa033b, R10
IMULQ R10, R9
ADDQ BX, R9
RORQ $0x25, R9
MOVQ $0xd6d018f5, R10
IMULQ R10, R9
XORQ R9, R8
MOVQ BX, R9
ADDQ DI, R9
MOVQ $0xd6d018f5, R10
IMULQ R10, R9
ADDQ R8, R9
RORQ $0x25, R9
MOVQ $0xa2aa033b, R10
IMULQ R10, R9
XORQ R9, BX
MOVQ SI, R9
ADDQ R8, R9
MOVQ $0xa2aa033b, R8
IMULQ R8, R9
ADDQ DI, R9
RORQ $0x25, R9
MOVQ $0xd6d018f5, DI
IMULQ DI, R9
XORQ R9, SI
XORQ SI, BX
ADDQ BX, AX
after32:
CMPQ DX, $0x10
JLT after16
MOVQ (CX), BX
MOVQ $0x62992fc1, SI
IMULQ SI, BX
ADDQ AX, BX
ADDQ $0x08, CX
SUBQ $0x08, DX
RORQ $0x1d, BX
MOVQ $0x30bc5b29, SI
IMULQ SI, BX
MOVQ (CX), SI
MOVQ $0x62992fc1, DI
IMULQ DI, SI
ADDQ AX, SI
ADDQ $0x08, CX
SUBQ $0x08, DX
RORQ $0x1d, SI
MOVQ $0x30bc5b29, DI
IMULQ DI, SI
MOVQ BX, DI
MOVQ $0xd6d018f5, R8
IMULQ R8, DI
RORQ $0x15, DI
ADDQ SI, DI
XORQ DI, BX
MOVQ SI, DI
MOVQ $0x30bc5b29, R8
IMULQ R8, DI
RORQ $0x15, DI
ADDQ BX, DI
XORQ DI, SI
ADDQ SI, AX
after16:
CMPQ DX, $0x08
JLT after8
MOVQ (CX), BX
MOVQ $0x30bc5b29, SI
IMULQ SI, BX
ADDQ BX, AX
ADDQ $0x08, CX
SUBQ $0x08, DX
MOVQ AX, BX
RORQ $0x37, BX
MOVQ $0xa2aa033b, SI
IMULQ SI, BX
XORQ BX, AX
after8:
CMPQ DX, $0x04
JLT after4
XORQ BX, BX
MOVL (CX), BX
MOVQ $0x30bc5b29, SI
IMULQ SI, BX
ADDQ BX, AX
ADDQ $0x04, CX
SUBQ $0x04, DX
MOVQ AX, BX
RORQ $0x1a, BX
MOVQ $0xa2aa033b, SI
IMULQ SI, BX
XORQ BX, AX
after4:
CMPQ DX, $0x02
JLT after2
XORQ BX, BX
MOVW (CX), BX
MOVQ $0x30bc5b29, SI
IMULQ SI, BX
ADDQ BX, AX
ADDQ $0x02, CX
SUBQ $0x02, DX
MOVQ AX, BX
RORQ $0x30, BX
MOVQ $0xa2aa033b, SI
IMULQ SI, BX
XORQ BX, AX
after2:
CMPQ DX, $0x01
JLT after1
MOVBQZX (CX), CX
MOVQ $0x30bc5b29, DX
IMULQ DX, CX
ADDQ CX, AX
MOVQ AX, CX
RORQ $0x25, CX
MOVQ $0xa2aa033b, DX
IMULQ DX, CX
XORQ CX, AX
after1:
MOVQ AX, CX
RORQ $0x1c, CX
XORQ CX, AX
MOVQ $0xd6d018f5, CX
IMULQ CX, AX
MOVQ AX, CX
RORQ $0x1d, CX
XORQ CX, AX
MOVQ AX, ret+32(FP)
RET
// func Hash64Str(buffer string, seed uint64) uint64
TEXT ·Hash64Str(SB), NOSPLIT, $0-32
MOVQ seed+16(FP), AX
MOVQ buffer_base+0(FP), CX
MOVQ buffer_len+8(FP), DX
MOVQ $0xd6d018f5, BX
IMULQ BX, AX
MOVQ $0x52bc33fedbe4cbb5, BX
ADDQ BX, AX
CMPQ DX, $0x20
JLT after32
MOVQ AX, BX
MOVQ AX, SI
MOVQ AX, DI
MOVQ AX, R8
loop:
MOVQ (CX), R9
MOVQ $0xd6d018f5, R10
IMULQ R10, R9
ADDQ R9, BX
RORQ $0x1d, BX
ADDQ DI, BX
MOVQ 8(CX), R9
MOVQ $0xa2aa033b, R10
IMULQ R10, R9
ADDQ R9, SI
RORQ $0x1d, SI
ADDQ R8, SI
MOVQ 16(CX), R9
MOVQ $0x62992fc1, R10
IMULQ R10, R9
ADDQ R9, DI
RORQ $0x1d, DI
ADDQ BX, DI
MOVQ 24(CX), R9
MOVQ $0x30bc5b29, R10
IMULQ R10, R9
ADDQ R9, R8
RORQ $0x1d, R8
ADDQ SI, R8
ADDQ $0x20, CX
SUBQ $0x20, DX
CMPQ DX, $0x20
JGE loop
MOVQ BX, R9
ADDQ R8, R9
MOVQ $0xd6d018f5, R10
IMULQ R10, R9
ADDQ SI, R9
RORQ $0x25, R9
MOVQ $0xa2aa033b, R10
IMULQ R10, R9
XORQ R9, DI
MOVQ SI, R9
ADDQ DI, R9
MOVQ $0xa2aa033b, R10
IMULQ R10, R9
ADDQ BX, R9
RORQ $0x25, R9
MOVQ $0xd6d018f5, R10
IMULQ R10, R9
XORQ R9, R8
MOVQ BX, R9
ADDQ DI, R9
MOVQ $0xd6d018f5, R10
IMULQ R10, R9
ADDQ R8, R9
RORQ $0x25, R9
MOVQ $0xa2aa033b, R10
IMULQ R10, R9
XORQ R9, BX
MOVQ SI, R9
ADDQ R8, R9
MOVQ $0xa2aa033b, R8
IMULQ R8, R9
ADDQ DI, R9
RORQ $0x25, R9
MOVQ $0xd6d018f5, DI
IMULQ DI, R9
XORQ R9, SI
XORQ SI, BX
ADDQ BX, AX
after32:
CMPQ DX, $0x10
JLT after16
MOVQ (CX), BX
MOVQ $0x62992fc1, SI
IMULQ SI, BX
ADDQ AX, BX
ADDQ $0x08, CX
SUBQ $0x08, DX
RORQ $0x1d, BX
MOVQ $0x30bc5b29, SI
IMULQ SI, BX
MOVQ (CX), SI
MOVQ $0x62992fc1, DI
IMULQ DI, SI
ADDQ AX, SI
ADDQ $0x08, CX
SUBQ $0x08, DX
RORQ $0x1d, SI
MOVQ $0x30bc5b29, DI
IMULQ DI, SI
MOVQ BX, DI
MOVQ $0xd6d018f5, R8
IMULQ R8, DI
RORQ $0x15, DI
ADDQ SI, DI
XORQ DI, BX
MOVQ SI, DI
MOVQ $0x30bc5b29, R8
IMULQ R8, DI
RORQ $0x15, DI
ADDQ BX, DI
XORQ DI, SI
ADDQ SI, AX
after16:
CMPQ DX, $0x08
JLT after8
MOVQ (CX), BX
MOVQ $0x30bc5b29, SI
IMULQ SI, BX
ADDQ BX, AX
ADDQ $0x08, CX
SUBQ $0x08, DX
MOVQ AX, BX
RORQ $0x37, BX
MOVQ $0xa2aa033b, SI
IMULQ SI, BX
XORQ BX, AX
after8:
CMPQ DX, $0x04
JLT after4
XORQ BX, BX
MOVL (CX), BX
MOVQ $0x30bc5b29, SI
IMULQ SI, BX
ADDQ BX, AX
ADDQ $0x04, CX
SUBQ $0x04, DX
MOVQ AX, BX
RORQ $0x1a, BX
MOVQ $0xa2aa033b, SI
IMULQ SI, BX
XORQ BX, AX
after4:
CMPQ DX, $0x02
JLT after2
XORQ BX, BX
MOVW (CX), BX
MOVQ $0x30bc5b29, SI
IMULQ SI, BX
ADDQ BX, AX
ADDQ $0x02, CX
SUBQ $0x02, DX
MOVQ AX, BX
RORQ $0x30, BX
MOVQ $0xa2aa033b, SI
IMULQ SI, BX
XORQ BX, AX
after2:
CMPQ DX, $0x01
JLT after1
MOVBQZX (CX), CX
MOVQ $0x30bc5b29, DX
IMULQ DX, CX
ADDQ CX, AX
MOVQ AX, CX
RORQ $0x25, CX
MOVQ $0xa2aa033b, DX
IMULQ DX, CX
XORQ CX, AX
after1:
MOVQ AX, CX
RORQ $0x1c, CX
XORQ CX, AX
MOVQ $0xd6d018f5, CX
IMULQ CX, AX
MOVQ AX, CX
RORQ $0x1d, CX
XORQ CX, AX
MOVQ AX, ret+24(FP)
RET

View File

@@ -1,10 +0,0 @@
// Code generated by command: go run asm.go -out metro_amd64.s -stubs metro_stub.go -pkg metro. DO NOT EDIT.
//go:build amd64 && gc && !purego && !noasm
package metro
//go:noescape
func Hash64(buffer []byte, seed uint64) uint64
func Hash64Str(buffer string, seed uint64) uint64

View File

@@ -1 +0,0 @@
*.swp

View File

@@ -1,23 +0,0 @@
Copyright (c) 2016, Brent Pedersen - Bioinformatics
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@@ -1,52 +0,0 @@
Fast hashmap with integer keys for Golang
[![GoDoc](https://godoc.org/github.com/kamstrup/intmap?status.svg)](https://godoc.org/github.com/kamstrup/intmap)
[![Go Report Card](https://goreportcard.com/badge/github.com/kamstrup/intmap)](https://goreportcard.com/report/github.com/kamstrup/intmap)
# intmap
import "github.com/kamstrup/intmap"
Package intmap is a fast hashmap implementation for Golang, specialized for maps with integer type keys.
The values can be of any type.
It is a full port of https://github.com/brentp/intintmap to use type parameters (aka generics).
It interleaves keys and values in the same underlying array to improve locality.
This is also known as open addressing with linear probing.
It is up to 3X faster than the builtin map:
```
name time/op
Map64Fill-8 201ms ± 5%
IntIntMapFill-8 207ms ±31%
StdMapFill-8 371ms ±11%
Map64Get10PercentHitRate-8 148µs ±40%
IntIntMapGet10PercentHitRate-8 171µs ±50%
StdMapGet10PercentHitRate-8 171µs ±33%
Map64Get100PercentHitRate-8 4.50ms ± 5%
IntIntMapGet100PercentHitRate-8 4.82ms ± 6%
StdMapGet100PercentHitRate-8 15.5ms ±32%
```
## Usage
```go
m := intmap.New[int64,int64](32768)
m.Put(int64(1234), int64(-222))
m.Put(int64(123), int64(33))
v, ok := m.Get(int64(222))
v, ok = m.Get(int64(333))
m.Del(int64(222))
m.Del(int64(333))
fmt.Println(m.Len())
m.ForEach(func(k int64, v int64) bool {
fmt.Printf("key: %d, value: %d\n", k, v)
return true // return false to stop iterating
})
m.Clear() // all gone, but buffers kept
```

View File

@@ -1,458 +0,0 @@
// Package intmap contains a fast hashmap implementation for maps with keys of any integer type
package intmap
import (
"iter"
"math"
)
// IntKey is a type constraint for values that can be used as keys in Map:
// any type whose underlying type is one of the built-in integer types listed
// below (including uintptr).
type IntKey interface {
~int | ~uint | ~int64 | ~uint64 | ~int32 | ~uint32 | ~int16 | ~uint16 | ~int8 | ~uint8 | ~uintptr
}
// pair represents a key-value pair in Map. A pair whose key is zero marks an
// empty slot.
//
// It is an important detail that V comes before K in the memory layout, even
// though having K first would feel more natural.
// We must have sizeof(pair[K,struct{}]) == sizeof(K) to minimize memory
// consumption when using a Set. If V were last, &p.V could point to invalid
// memory, which is not permitted, so the Go compiler would add padding to the
// pair struct in that case.
// See https://github.com/kamstrup/intmap/pull/6#issuecomment-3581008879
type pair[K IntKey, V any] struct {
V V
K K
}
// fillFactorBase64 is the maximum fill factor of the table, in tenths.
const fillFactorBase64 = 7
// fillFactor64 is the same fill factor expressed as a fraction (0.7).
const fillFactor64 = fillFactorBase64 / 10.0
// phiMix64 scrambles x by multiplying it with 0x9E3779B9 and XOR-ing the
// product with itself shifted right by 16 bits.
func phiMix64(x int) int {
	product := int64(x) * int64(0x9E3779B9)
	mixed := product ^ (product >> 16)
	return int(mixed)
}
// Map is a hashmap whose keys can be of any integer type.
// It is valid to call methods that read a nil map, similar to a standard Go map.
// Methods valid on a nil map are Has, Get, Len, and ForEach.
type Map[K IntKey, V any] struct {
data []pair[K, V] // key-value pairs
size int
zeroVal V // value of 'zero' key
hasZeroKey bool // do we have 'zero' key in the map?
}
// New returns an empty map whose keys are of any integer type. The backing
// array is sized so that the map can hold capacity entries before it has to
// reallocate and rehash.
func New[K IntKey, V any](capacity int) *Map[K, V] {
	slots := arraySize(capacity, fillFactor64)
	m := new(Map[K, V])
	m.data = make([]pair[K, V], slots)
	return m
}
// Has reports whether key is present in the map.
// Calling it on a nil map returns false.
func (m *Map[K, V]) Has(key K) bool {
	if m == nil {
		return false
	}
	// The zero key is tracked outside the table.
	if key == K(0) {
		return m.hasZeroKey
	}

	// Walk the probe chain until the key or an empty slot is found.
	for idx := m.startIndex(key); ; idx = m.nextIndex(idx) {
		k := m.data[idx].K
		if k == K(0) {
			return false
		}
		if k == key {
			return true
		}
	}
}
// Get returns the value stored for key and true, or the zero value of V and
// false when the key is absent. Use Has when only existence matters.
// Calling it on a nil map returns the zero value of V and false.
func (m *Map[K, V]) Get(key K) (V, bool) {
	var zero V
	if m == nil {
		return zero, false
	}
	// The zero key is tracked outside the table.
	if key == K(0) {
		if m.hasZeroKey {
			return m.zeroVal, true
		}
		return zero, false
	}

	// Walk the probe chain until the key or an empty slot is found.
	for idx := m.startIndex(key); ; idx = m.nextIndex(idx) {
		p := m.data[idx]
		if p.K == K(0) {
			return zero, false
		}
		if p.K == key {
			return p.V, true
		}
	}
}
// Put stores val under key, overwriting any previous value.
func (m *Map[K, V]) Put(key K, val V) {
	// The zero key is tracked outside the table.
	if key == K(0) {
		if !m.hasZeroKey {
			m.size++
		}
		m.hasZeroKey = true
		m.zeroVal = val
		return
	}

	// Walk the probe chain until an empty slot or the key itself is found.
	for idx := m.startIndex(key); ; idx = m.nextIndex(idx) {
		slot := &m.data[idx]
		if slot.K == K(0) {
			slot.K = key
			slot.V = val
			// rehash recounts every entry, including the one just written.
			if m.size >= m.sizeThreshold() {
				m.rehash()
			} else {
				m.size++
			}
			return
		}
		if slot.K == key {
			slot.V = val
			return
		}
	}
}
// PutIfNotExists stores val under key only when the key is not yet present.
// It returns the value associated with the key after the call, and true when
// val was newly added (false when the key already existed).
func (m *Map[K, V]) PutIfNotExists(key K, val V) (V, bool) {
	// The zero key is tracked outside the table.
	if key == K(0) {
		if m.hasZeroKey {
			return m.zeroVal, false
		}
		m.hasZeroKey = true
		m.zeroVal = val
		m.size++
		return val, true
	}

	// Walk the probe chain until the key or an empty slot is found.
	for idx := m.startIndex(key); ; idx = m.nextIndex(idx) {
		slot := &m.data[idx]
		if slot.K == key {
			return slot.V, false
		}
		if slot.K != K(0) {
			continue
		}
		slot.K = key
		slot.V = val
		m.size++
		if m.size >= m.sizeThreshold() {
			m.rehash()
		}
		return val, true
	}
}
// ForEach calls f for each key-value pair in the map until f returns false.
// The zero key, when present, is visited first. It returns immediately when
// invoked on a nil map.
//
// The iteration order of a Map is not defined, so please avoid relying on it.
func (m *Map[K, V]) ForEach(f func(K, V) bool) {
	if m == nil {
		return
	}
	if m.hasZeroKey {
		if !f(K(0), m.zeroVal) {
			return
		}
	}
	forEach64(m.data, f)
}
// All returns an iterator over the key-value pairs of m, backed by ForEach.
// The iterator returns immediately when m is nil.
//
// The iteration order of a Map is not defined, so please avoid relying on it.
func (m *Map[K, V]) All() iter.Seq2[K, V] {
	return func(yield func(K, V) bool) {
		m.ForEach(yield)
	}
}
// Keys returns an iterator over the keys of m. The zero key, when present,
// is yielded first. The iterator returns immediately when m is nil.
//
// The iteration order of a Map is not defined, so please avoid relying on it.
func (m *Map[K, V]) Keys() iter.Seq[K] {
	return func(yield func(k K) bool) {
		if m == nil {
			return
		}
		if m.hasZeroKey && !yield(K(0)) {
			return
		}
		// Iterate over the table as it is when the loop starts.
		data := m.data
		for i := range data {
			p := data[i]
			if p.K == K(0) {
				continue
			}
			if !yield(p.K) {
				return
			}
		}
	}
}
// Values returns an iterator over the values of m. The value of the zero
// key, when present, is yielded first. The iterator returns immediately when
// m is nil.
//
// The iteration order of a Map is not defined, so please avoid relying on it.
func (m *Map[K, V]) Values() iter.Seq[V] {
	return func(yield func(v V) bool) {
		if m == nil {
			return
		}
		if m.hasZeroKey && !yield(m.zeroVal) {
			return
		}
		// Iterate over the table as it is when the loop starts.
		data := m.data
		for i := range data {
			p := data[i]
			if p.K == K(0) {
				continue
			}
			if !yield(p.V) {
				return
			}
		}
	}
}
// Clear removes all items from the map, but keeps the internal buffers for
// reuse.
func (m *Map[K, V]) Clear() {
	var zero V
	m.zeroVal = zero
	m.hasZeroKey = false
	// Zero every slot in place; the backing array is retained.
	clear(m.data)
	m.size = 0
}
// rehash doubles the size of the table and re-inserts every entry of the old
// table. The size counter is rebuilt from scratch, starting at 1 when the
// zero key is present (it lives outside the table).
func (m *Map[K, V]) rehash() {
	previous := m.data
	m.data = make([]pair[K, V], 2*len(previous))

	m.size = 0
	if m.hasZeroKey {
		m.size = 1
	}

	for _, p := range previous {
		if p.K != K(0) {
			m.Put(p.K, p.V)
		}
	}
}
// Len returns the number of elements in the map.
// The length of a nil map is defined to be zero.
func (m *Map[K, V]) Len() int {
	if m != nil {
		return m.size
	}
	return 0
}
// sizeThreshold returns the entry count at which the table is grown:
// fillFactorBase64 tenths of the table length, rounded down.
func (m *Map[K, V]) sizeThreshold() int {
	scaled := uint64(len(m.data)) * fillFactorBase64
	return int(scaled / 10)
}
// startIndex returns the slot at which the probe chain for key begins in the
// current table.
func (m *Map[K, V]) startIndex(key K) int {
	tableLen := len(m.data)
	return startIndex(int(key), tableLen)
}
// nextIndex returns the slot that follows idx in the current table, wrapping
// around at the end.
func (m *Map[K, V]) nextIndex(idx int) int {
	tableLen := len(m.data)
	return nextIndex(idx, tableLen)
}
// forEach64 calls f for every occupied slot (non-zero key) of pairs, in slot
// order, and stops as soon as f returns false.
func forEach64[K IntKey, V any](pairs []pair[K, V], f func(k K, v V) bool) {
	for i := range pairs {
		p := pairs[i]
		if p.K == K(0) {
			continue
		}
		if !f(p.K, p.V) {
			return
		}
	}
}
// Del deletes a key and its value, returning true iff the key was found.
//
// Deleting the zero key also resets the stored zero-key value, so the map
// does not keep the old value reachable after deletion (matching what Clear
// does and what happens to table slots of non-zero keys).
func (m *Map[K, V]) Del(key K) bool {
	// The zero key is tracked outside the table.
	if key == K(0) {
		if !m.hasZeroKey {
			return false
		}
		var zero V
		m.zeroVal = zero
		m.hasZeroKey = false
		m.size--
		return true
	}

	// Walk the probe chain until the key or an empty slot is found.
	for idx := m.startIndex(key); ; idx = m.nextIndex(idx) {
		k := m.data[idx].K
		if k == key {
			// Any keys that were pushed back need to be shifted back into
			// the freed slot to avoid breaking the chain.
			m.shiftKeys(idx)
			m.size--
			return true
		}
		if k == K(0) { // end of chain
			return false
		}
	}
}
// shiftKeys closes the gap left at idx after a deletion. It repeatedly looks
// along the probe chain for an entry that may be moved into the free slot,
// moves it, and continues from that entry's old position. When the chain
// ends (an empty slot is reached) the current free slot is zeroed and its
// index is returned.
func (m *Map[K, V]) shiftKeys(idx int) int {
// Shift entries with the same hash.
// We need to do this on deletion to ensure we don't have zeroes in the hash chain
for {
var p pair[K, V]
// lastIdx is the slot that is currently free and must be filled.
lastIdx := idx
idx = m.nextIndex(idx)
for {
p = m.data[idx]
if p.K == K(0) {
// End of chain: nothing left to move, clear the free slot.
m.data[lastIdx] = pair[K, V]{}
return lastIdx
}
// slot is where the probe chain of this entry's key starts.
slot := m.startIndex(p.K)
// The entry is moved when slot does not lie in the range (lastIdx, idx];
// the second branch handles the range wrapping around the table end.
if lastIdx <= idx {
if lastIdx >= slot || slot > idx {
break
}
} else {
if lastIdx >= slot && slot > idx {
break
}
}
idx = m.nextIndex(idx)
}
// Move the entry into the free slot; its old position is the next gap.
m.data[lastIdx] = p
}
}
// nextPowerOf2 returns the smallest power of two that is >= x for x up to
// 2^31. Special cases: 0 yields 1, math.MaxUint32 is returned unchanged, and
// other values above 2^31 wrap around to 0.
func nextPowerOf2(x uint32) uint32 {
	switch x {
	case math.MaxUint32:
		return x
	case 0:
		return 1
	}
	// Smear the highest set bit of x-1 into every lower position.
	x--
	for shift := uint(1); shift <= 16; shift <<= 1 {
		x |= x >> shift
	}
	return x + 1
}
// arraySize returns the table length for exp expected entries at the given
// fill factor: exp/fill rounded up, then rounded to a power of two by
// nextPowerOf2, and never less than 2.
func arraySize(exp int, fill float64) int {
	needed := math.Ceil(float64(exp) / fill)
	size := nextPowerOf2(uint32(needed))
	return int(max(size, 2))
}
// startIndex mixes key with phiMix64 and masks the result with len-1 to
// obtain an index. The mask only covers every slot when len is a power of
// two.
func startIndex(key, len int) int {
	mask := len - 1
	return phiMix64(key) & mask
}
// nextIndex returns idx+1 masked with len-1, i.e. the following index with
// wrap-around to 0 at the end of a table whose length is a power of two.
func nextIndex(idx, len int) int {
	mask := len - 1
	return (idx + 1) & mask
}

View File

@@ -1,59 +0,0 @@
package intmap
import "iter"
// Set is a specialization of Map modelling a set of integers.
// Like Map, methods that read from the set are valid on the nil Set.
// This includes Has, Len, and ForEach.
//
// Set shares its underlying type with Map[K, struct{}], so a *Set[K] can be
// converted to and from *Map[K, struct{}] directly.
type Set[K IntKey] Map[K, struct{}]
// NewSet creates a new Set with a given initial capacity.
// The set is backed by a Map[K, struct{}] built by New.
func NewSet[K IntKey](capacity int) *Set[K] {
	backing := New[K, struct{}](capacity)
	return (*Set[K])(backing)
}
// Add inserts k into the set. It returns true if k was not already present.
func (s *Set[K]) Add(k K) bool {
	m := (*Map[K, struct{}])(s)
	_, added := m.PutIfNotExists(k, struct{}{})
	return added
}
// Del removes k from the set, returning true iff k was found.
func (s *Set[K]) Del(k K) bool {
	m := (*Map[K, struct{}])(s)
	return m.Del(k)
}
// Clear removes all items from the Set, but keeps the internal buffers for reuse.
func (s *Set[K]) Clear() {
	m := (*Map[K, struct{}])(s)
	m.Clear()
}
// Has returns true if the key is in the set.
// If the set is nil this method always returns false.
func (s *Set[K]) Has(k K) bool {
	m := (*Map[K, struct{}])(s)
	return m.Has(k)
}
// Len returns the number of elements in the set.
// If the set is nil this method returns 0.
func (s *Set[K]) Len() int {
	m := (*Map[K, struct{}])(s)
	return m.Len()
}
// ForEach calls visit for each element of the set, stopping as soon as visit
// returns false. It returns immediately if the set is nil.
//
// The iteration order of a Set is not defined, so please avoid relying on it.
func (s *Set[K]) ForEach(visit func(k K) bool) {
	m := (*Map[K, struct{}])(s)
	// Adapt the key-only callback to the key/value callback Map.ForEach expects.
	keysOnly := func(key K, _ struct{}) bool {
		return visit(key)
	}
	m.ForEach(keysOnly)
}
// All returns an iterator over keys from the set.
// The iterator returns immediately if the set is nil.
//
// The iteration order of a Set is not defined, so please avoid relying on it.
func (s *Set[K]) All() iter.Seq[K] {
	return func(yield func(K) bool) {
		s.ForEach(yield)
	}
}

10
vendor/modules.txt vendored
View File

@@ -292,9 +292,6 @@ github.com/aws/smithy-go/traits
github.com/aws/smithy-go/transport/http
github.com/aws/smithy-go/transport/http/internal/io
github.com/aws/smithy-go/waiter
# github.com/axiomhq/hyperloglog v0.0.0-00010101000000-000000000000 => github.com/makasim/hyperloglog v0.0.10-reuse-memory
## explicit; go 1.23
github.com/axiomhq/hyperloglog
# github.com/bboreham/go-loser v0.0.0-20230920113527-fcc2c21820a3
## explicit; go 1.20
github.com/bboreham/go-loser
@@ -336,9 +333,6 @@ github.com/davecgh/go-spew/spew
# github.com/dennwc/varint v1.0.0
## explicit; go 1.12
github.com/dennwc/varint
# github.com/dgryski/go-metro v0.0.0-20250106013310-edb8663e5e33
## explicit
github.com/dgryski/go-metro
# github.com/envoyproxy/go-control-plane/envoy v1.37.0
## explicit; go 1.24.0
github.com/envoyproxy/go-control-plane/envoy/admin/v3
@@ -498,9 +492,6 @@ github.com/jpillora/backoff
# github.com/json-iterator/go v1.1.12
## explicit; go 1.12
github.com/json-iterator/go
# github.com/kamstrup/intmap v0.5.2
## explicit; go 1.23
github.com/kamstrup/intmap
# github.com/klauspost/compress v1.18.6
## explicit; go 1.24
github.com/klauspost/compress
@@ -1273,4 +1264,3 @@ sigs.k8s.io/structured-merge-diff/v6/value
# sigs.k8s.io/yaml v1.6.0
## explicit; go 1.22
sigs.k8s.io/yaml
# github.com/axiomhq/hyperloglog => github.com/makasim/hyperloglog v0.0.10-reuse-memory