mirror of
https://github.com/aljazceru/crawler_v2.git
synced 2025-12-17 07:24:21 +01:00
implemented cached walker
This commit is contained in:
@@ -1,8 +1,11 @@
|
||||
package walks
|
||||
|
||||
import (
|
||||
"container/list"
|
||||
"context"
|
||||
"fmt"
|
||||
"github/pippellia-btc/crawler/pkg/graph"
|
||||
"log"
|
||||
"strconv"
|
||||
)
|
||||
|
||||
@@ -38,37 +41,163 @@ func NewCyclicWalker(n int) *SimpleWalker {
|
||||
return &SimpleWalker{follows: follows}
|
||||
}
|
||||
|
||||
// CachedWalker is a walker with optional fallback that stores follow relationships
|
||||
// CachedWalker is a [Walker] with optional fallback that stores follow relationships
|
||||
// in a compact format (uint32) for reduced memory footprint.
|
||||
// If its size grows larger than capacity, the least recently used (LRU) key is evicted.
|
||||
// It is not safe for concurrent use.
|
||||
type CachedWalker struct {
|
||||
follows map[graph.ID][]graph.ID
|
||||
lookup map[uint32]*list.Element
|
||||
|
||||
// newest at the front, oldest at the back
|
||||
edgeList *list.List
|
||||
capacity int
|
||||
|
||||
// for stats
|
||||
calls, hits, misses int
|
||||
|
||||
fallback Walker
|
||||
}
|
||||
|
||||
func NewCachedWalker(nodes []graph.ID, follows [][]graph.ID, fallback Walker) *CachedWalker {
|
||||
w := CachedWalker{
|
||||
follows: make(map[graph.ID][]graph.ID, len(nodes)),
|
||||
fallback: fallback,
|
||||
// Option customizes a CachedWalker. Options are applied by NewWalker in the
// order in which they are passed.
type Option func(c *CachedWalker)
|
||||
|
||||
// WithCapacity returns an Option that sets the capacity of the cache, i.e. the
// size beyond which the least recently used node is evicted.
func WithCapacity(capacity int) Option {
	// the parameter is deliberately not called "cap", which would shadow the builtin
	return func(c *CachedWalker) { c.capacity = capacity }
}
|
||||
// WithFallback returns an Option that sets the Walker queried on cache misses.
func WithFallback(f Walker) Option {
	return func(c *CachedWalker) {
		c.fallback = f
	}
}
|
||||
|
||||
func NewWalker(opts ...Option) *CachedWalker {
|
||||
c := &CachedWalker{
|
||||
lookup: make(map[uint32]*list.Element, 10000),
|
||||
edgeList: list.New(),
|
||||
}
|
||||
|
||||
for _, opt := range opts {
|
||||
opt(c)
|
||||
}
|
||||
return c
|
||||
}
|
||||
|
||||
// edges is the value stored in each element of the cache's list: a node
// together with the nodes it follows, all in compact (uint32) form.
type edges struct {
	node    uint32   // compact ID of the source node
	follows []uint32 // compact IDs of the nodes followed by node
}
|
||||
|
||||
// Add compresses node and follows and adds them to the cache.
|
||||
// It evicts the LRU element if the capacity has been exeeded.
|
||||
func (c *CachedWalker) Add(node graph.ID, follows []graph.ID) error {
|
||||
ID, err := compactID(node)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to compress node %s: %w", node, err)
|
||||
}
|
||||
|
||||
IDs, err := compactIDs(follows)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to compress follows of node %s: %w", node, err)
|
||||
}
|
||||
|
||||
c.add(ID, IDs)
|
||||
return nil
|
||||
}
|
||||
|
||||
// Add node and follows as edges. It evicts the LRU element if the capacity has been exeeded.
|
||||
func (c *CachedWalker) add(node uint32, follows []uint32) {
|
||||
c.lookup[node] = c.edgeList.PushFront(
|
||||
edges{node: node, follows: follows},
|
||||
)
|
||||
|
||||
if c.Size() > c.capacity {
|
||||
oldest := c.edgeList.Back()
|
||||
c.edgeList.Remove(oldest)
|
||||
delete(c.lookup, oldest.Value.(edges).node)
|
||||
}
|
||||
}
|
||||
|
||||
func (c *CachedWalker) Size() int {
|
||||
return c.edgeList.Len()
|
||||
}
|
||||
|
||||
func (c *CachedWalker) logStats() {
|
||||
log.Printf("cache: calls %d, hits %d, misses %d", c.calls, c.hits, c.misses)
|
||||
c.calls, c.hits, c.misses = 0, 0, 0
|
||||
}
|
||||
|
||||
func (c *CachedWalker) Follows(ctx context.Context, node graph.ID) ([]graph.ID, error) {
|
||||
ID, err := compactID(node)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch follows of %s: %w", node, err)
|
||||
}
|
||||
|
||||
c.calls++
|
||||
if c.calls > 10000 {
|
||||
defer c.logStats()
|
||||
}
|
||||
|
||||
element, hit := c.lookup[ID]
|
||||
if hit {
|
||||
c.hits++
|
||||
c.edgeList.MoveToFront(element)
|
||||
return nodes(element.Value.(edges).follows), nil
|
||||
}
|
||||
|
||||
c.misses++
|
||||
if c.fallback == nil {
|
||||
return nil, fmt.Errorf("%w: %s", graph.ErrNodeNotFound, node)
|
||||
}
|
||||
|
||||
follows, err := c.fallback.Follows(ctx, node)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
IDs, err := compactIDs(follows)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch follows of %s: %w", node, err)
|
||||
}
|
||||
|
||||
c.add(ID, IDs)
|
||||
return follows, nil
|
||||
}
|
||||
|
||||
func (c *CachedWalker) Load(nodes []graph.ID, follows [][]graph.ID) error {
|
||||
if len(nodes) != len(follows) {
|
||||
return fmt.Errorf("failed to load: nodes and follows must have the same lenght")
|
||||
}
|
||||
|
||||
for i, node := range nodes {
|
||||
w.follows[node] = follows[i]
|
||||
if err := c.Add(node, follows[i]); err != nil {
|
||||
return fmt.Errorf("failed to load: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
return &w
|
||||
return nil
|
||||
}
|
||||
|
||||
func (w *CachedWalker) Follows(ctx context.Context, node graph.ID) ([]graph.ID, error) {
|
||||
follows, exists := w.follows[node]
|
||||
if !exists {
|
||||
var err error
|
||||
follows, err = w.fallback.Follows(ctx, node)
|
||||
func compactID(node graph.ID) (uint32, error) {
|
||||
ID, err := strconv.ParseUint(string(node), 10, 32)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
return uint32(ID), err
|
||||
}
|
||||
|
||||
func compactIDs(nodes []graph.ID) ([]uint32, error) {
|
||||
IDs := make([]uint32, len(nodes))
|
||||
var err error
|
||||
for i, node := range nodes {
|
||||
IDs[i], err = compactID(node)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
w.follows[node] = follows
|
||||
}
|
||||
|
||||
return follows, nil
|
||||
return IDs, nil
|
||||
}
|
||||
|
||||
func node(ID uint32) graph.ID {
|
||||
return graph.ID(strconv.FormatUint(uint64(ID), 10))
|
||||
}
|
||||
|
||||
func nodes(IDs []uint32) []graph.ID {
|
||||
nodes := make([]graph.ID, len(IDs))
|
||||
for i, ID := range IDs {
|
||||
nodes[i] = node(ID)
|
||||
}
|
||||
return nodes
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user