mirror of
https://github.com/aljazceru/crawler_v2.git
synced 2025-12-17 07:24:21 +01:00
personalized walk
This commit is contained in:
@@ -5,6 +5,8 @@ import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"github/pippellia-btc/crawler/pkg/graph"
|
||||
"github/pippellia-btc/crawler/pkg/walks"
|
||||
"math/rand/v2"
|
||||
)
|
||||
|
||||
var ErrEmptyWalkStore = errors.New("the walk store is empty")
|
||||
@@ -30,7 +32,7 @@ func Global(ctx context.Context, count VisitCounter, nodes ...graph.ID) ([]float
|
||||
return nil, fmt.Errorf("Global: failed to get the visits total: %w", err)
|
||||
}
|
||||
|
||||
if total == 0 {
|
||||
if total <= 0 {
|
||||
return nil, ErrEmptyWalkStore
|
||||
}
|
||||
|
||||
@@ -47,3 +49,102 @@ func Global(ctx context.Context, count VisitCounter, nodes ...graph.ID) ([]float
|
||||
|
||||
return pageranks, nil
|
||||
}
|
||||
|
||||
// pWalk is a personalized walk, which is a random walk that resets to a specified node
|
||||
// and continues until it reaches a specified target lenght.
|
||||
type pWalk struct {
|
||||
start graph.ID // the starting node
|
||||
node graph.ID // the current node
|
||||
|
||||
ongoing walks.Walk // the current walk
|
||||
union []graph.ID // the sum of all previous walk paths
|
||||
}
|
||||
|
||||
func newPersonalizedWalk(start graph.ID, target int) *pWalk {
|
||||
return &pWalk{
|
||||
start: start,
|
||||
node: start,
|
||||
ongoing: walks.Walk{Path: []graph.ID{start}},
|
||||
union: make([]graph.ID, 0, target),
|
||||
}
|
||||
}
|
||||
|
||||
// Reached returns whether the personalized walk is long enough
|
||||
func (w *pWalk) Reached(lenght int) bool {
|
||||
return len(w.union) >= lenght
|
||||
}
|
||||
|
||||
// Reset the walk to its base state after appending the ongoing walk to the union
|
||||
func (w *pWalk) Reset() {
|
||||
w.union = append(w.union, w.ongoing.Path...)
|
||||
w.ongoing = walks.Walk{Path: []graph.ID{w.start}}
|
||||
w.node = w.start
|
||||
}
|
||||
|
||||
// WalkPool makes sure a walk is returned only once, avoiding bias in the [Personalized]
|
||||
type WalkPool interface {
|
||||
// Next returns a path that starts with the provided node
|
||||
Next(node graph.ID) ([]graph.ID, bool)
|
||||
}
|
||||
|
||||
// The personalizedWalk() function simulates a long personalized random walk
|
||||
// starting from a node with reset to itself. Whenever possible, walks from the
|
||||
// [WalkCache] are used to speed up the computation.
|
||||
func personalizedWalk(
|
||||
ctx context.Context,
|
||||
walker walks.Walker,
|
||||
pool WalkPool,
|
||||
start graph.ID,
|
||||
lenght int) ([]graph.ID, error) {
|
||||
|
||||
var path []graph.ID
|
||||
var exists bool
|
||||
walk := newPersonalizedWalk(start, lenght)
|
||||
|
||||
for {
|
||||
if walk.Reached(lenght) {
|
||||
return walk.union, nil
|
||||
}
|
||||
|
||||
if rand.Float64() > walks.Alpha {
|
||||
walk.Reset()
|
||||
continue
|
||||
}
|
||||
|
||||
path, exists = pool.Next(walk.node)
|
||||
switch exists {
|
||||
case true:
|
||||
// graft the given path
|
||||
walk.ongoing.Graft(path)
|
||||
walk.Reset()
|
||||
|
||||
case false:
|
||||
// perform one manual step
|
||||
follows, err := walker.Follows(ctx, walk.node)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if len(follows) == 0 {
|
||||
// found a dandling node, stop
|
||||
walk.Reset()
|
||||
continue
|
||||
}
|
||||
|
||||
node := randomElement(follows)
|
||||
if walk.ongoing.Visits(node) {
|
||||
// found a cycle, stop
|
||||
walk.Reset()
|
||||
continue
|
||||
}
|
||||
|
||||
walk.node = node
|
||||
walk.ongoing.Append(node)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// returns a random element of a slice. It panics if the slice is empty or nil.
|
||||
func randomElement[S []E, E any](s S) E {
|
||||
return s[rand.IntN(len(s))]
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user