mirror of
https://github.com/aljazceru/crawler_v2.git
synced 2025-12-17 07:24:21 +01:00
314 lines
8.2 KiB
Go
314 lines
8.2 KiB
Go
// The pagerank package uses the random walks to compute graph algorithms like
|
|
// global and personalized pageranks.
|
|
package pagerank
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"github/pippellia-btc/crawler/pkg/graph"
|
|
"github/pippellia-btc/crawler/pkg/walks"
|
|
"math/rand/v2"
|
|
)
|
|
|
|
var ErrEmptyWalkStore = errors.New("the walk store is empty")
|
|
|
|
type VisitCounter interface {
|
|
// TotalVisits returns the total number of visits, which is the sum of the lengths of all walks.
|
|
TotalVisits(ctx context.Context) (int, error)
|
|
|
|
// Visits returns the number of times each specified node was visited during the walks.
|
|
// The returned slice contains counts in the same order as the input nodes.
|
|
// If a node is not found, it returns 0 visits.
|
|
Visits(ctx context.Context, nodes ...graph.ID) ([]int, error)
|
|
}
|
|
|
|
// Global computes the global pagerank score for each target node, as the frequency of visits.
|
|
// If a node is not found, its pagerank is assumed to be 0.
|
|
func Global(ctx context.Context, count VisitCounter, targets ...graph.ID) ([]float64, error) {
|
|
if len(targets) == 0 {
|
|
return nil, nil
|
|
}
|
|
|
|
total, err := count.TotalVisits(ctx)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("Global: failed to get the visits total: %w", err)
|
|
}
|
|
|
|
if total <= 0 {
|
|
return nil, ErrEmptyWalkStore
|
|
}
|
|
|
|
visits, err := count.Visits(ctx, targets...)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("Global: failed to get the nodes visits: %w", err)
|
|
}
|
|
|
|
pageranks := make([]float64, len(visits))
|
|
for i, v := range visits {
|
|
pageranks[i] = float64(v) / float64(total)
|
|
|
|
}
|
|
|
|
return pageranks, nil
|
|
}
|
|
|
|
type PersonalizedLoader interface {
|
|
// Follows returns the follow-list of the node
|
|
Follows(ctx context.Context, node graph.ID) ([]graph.ID, error)
|
|
|
|
// BulkFollows returns the follow-lists of the specified nodes
|
|
BulkFollows(ctx context.Context, nodes []graph.ID) ([][]graph.ID, error)
|
|
|
|
// WalksVisitingAny returns up to limit walks that visit the specified nodes.
|
|
// The walks are distributed evenly among the nodes:
|
|
// - if limit == -1, all walks are returned.
|
|
// - if limit < len(nodes), no walks are returned
|
|
WalksVisitingAny(ctx context.Context, nodes []graph.ID, limit int) ([]walks.Walk, error)
|
|
}
|
|
|
|
func PersonalizedWithTargets(
|
|
ctx context.Context,
|
|
loader PersonalizedLoader,
|
|
source graph.ID,
|
|
targets []graph.ID,
|
|
targetLenght int) ([]float64, error) {
|
|
|
|
if len(targets) == 0 {
|
|
return nil, nil
|
|
}
|
|
|
|
pp, err := Personalized(ctx, loader, source, targetLenght)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
pageranks := make([]float64, len(targets))
|
|
for i, t := range targets {
|
|
pageranks[i] = pp[t]
|
|
}
|
|
|
|
return pageranks, nil
|
|
}
|
|
|
|
/*
|
|
Personalized computes the personalized pagerank of node by simulating a
|
|
long random walk starting at and resetting to itself. This long walk is generated
|
|
using the random walks in the storage layer whenever possible.
|
|
|
|
# REFERENCES
|
|
|
|
[1] B. Bahmani, A. Chowdhury, A. Goel; "Fast Incremental and Personalized PageRank"
|
|
URL: http://snap.stanford.edu/class/cs224w-readings/bahmani10pagerank.pdf
|
|
*/
|
|
func Personalized(
|
|
ctx context.Context,
|
|
loader PersonalizedLoader,
|
|
source graph.ID,
|
|
targetLenght int) (map[graph.ID]float64, error) {
|
|
|
|
follows, err := loader.Follows(ctx, source)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("Personalized: failed to fetch the follows of source: %w", err)
|
|
}
|
|
|
|
if len(follows) == 0 {
|
|
// the special distribution of a dandling node
|
|
return map[graph.ID]float64{source: 1.0}, nil
|
|
}
|
|
|
|
bulk, err := loader.BulkFollows(ctx, follows)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("Personalized: failed to fetch the two-hop network of source: %w", err)
|
|
}
|
|
|
|
walker := walks.NewWalker(
|
|
walks.WithCapacity(10000),
|
|
walks.WithFallback(loader),
|
|
)
|
|
|
|
if err := walker.Load(follows, bulk); err != nil {
|
|
return nil, fmt.Errorf("Personalized: failed to load the two-hop network of source: %w", err)
|
|
}
|
|
|
|
targetWalks := int(float64(targetLenght) * (1 - walks.Alpha))
|
|
visiting, err := loader.WalksVisitingAny(ctx, append(follows, source), targetWalks)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("Personalized: failed to fetch the walk: %w", err)
|
|
}
|
|
|
|
pool := newWalkPool(visiting)
|
|
|
|
walk, err := personalizedWalk(ctx, walker, pool, source, targetLenght)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return frequencyMap(walk), nil
|
|
}
|
|
|
|
// pWalk is a personalized walk, which is a random walk that resets to a specified node
|
|
// and continues until it reaches a specified target lenght.
|
|
type pWalk struct {
|
|
start graph.ID // the starting node
|
|
node graph.ID // the current node
|
|
|
|
ongoing walks.Walk // the current walk
|
|
union []graph.ID // the sum of all previous walk paths
|
|
}
|
|
|
|
func newPersonalizedWalk(start graph.ID, target int) *pWalk {
|
|
return &pWalk{
|
|
start: start,
|
|
node: start,
|
|
ongoing: walks.Walk{Path: []graph.ID{start}},
|
|
union: make([]graph.ID, 0, target),
|
|
}
|
|
}
|
|
|
|
// Reached returns whether the personalized walk is long enough
|
|
func (w *pWalk) Reached(lenght int) bool {
|
|
return len(w.union) >= lenght
|
|
}
|
|
|
|
// Reset the walk to its base state after appending the ongoing walk to the union
|
|
func (w *pWalk) Reset() {
|
|
w.union = append(w.union, w.ongoing.Path...)
|
|
w.ongoing = walks.Walk{Path: []graph.ID{w.start}}
|
|
w.node = w.start
|
|
}
|
|
|
|
type walkPool struct {
|
|
walks []walks.Walk
|
|
|
|
// walkIndexes maps each node to the list of indices in walks where that node appears.
|
|
// That is, if walkIndexes[v] = [0, 2], then v is visited by walks[0] and walks[2].
|
|
walkIndexes map[graph.ID][]int
|
|
}
|
|
|
|
func newWalkPool(walks []walks.Walk) *walkPool {
|
|
walkIndexes := make(map[graph.ID][]int, len(walks)/100)
|
|
|
|
// add the index of the walk to each node that it visits
|
|
// excluding the last one which will be cropped out anyway.
|
|
for i, walk := range walks {
|
|
for j := range walk.Len() - 1 {
|
|
node := walk.Path[j]
|
|
walkIndexes[node] = append(walkIndexes[node], i)
|
|
}
|
|
}
|
|
|
|
return &walkPool{
|
|
walks: walks,
|
|
walkIndexes: walkIndexes,
|
|
}
|
|
}
|
|
|
|
// Next returns a path of nodes that starts immediately after node, making sure
|
|
// that the same walk is only used once to avoid bias in the sampling.
|
|
// For example, if the walk is [0,1,2,3,4], node = 1, it returns [2,3,4].
|
|
func (w *walkPool) Next(node graph.ID) ([]graph.ID, bool) {
|
|
indexes, exists := w.walkIndexes[node]
|
|
if !exists || len(indexes) == 0 {
|
|
return nil, false
|
|
}
|
|
|
|
for i, idx := range indexes {
|
|
walk := w.walks[idx]
|
|
cut := walk.Index(node)
|
|
if cut == -1 {
|
|
// walk already used, skip
|
|
continue
|
|
}
|
|
|
|
// zero the walk so it can't be reused, and reslice the walk indexes
|
|
// so we don't spend time looking at walks already used.
|
|
w.walks[idx].Path = nil
|
|
w.walkIndexes[node] = indexes[i+1:]
|
|
return walk.Path[cut+1:], true
|
|
}
|
|
|
|
// all walks where already used
|
|
delete(w.walkIndexes, node)
|
|
return nil, false
|
|
}
|
|
|
|
// The personalizedWalk() function simulates a long personalized random walk
|
|
// starting from a node with reset to itself. Whenever possible, walks from the
|
|
// [WalkCache] are used to speed up the computation.
|
|
func personalizedWalk(
|
|
ctx context.Context,
|
|
walker walks.Walker,
|
|
pool *walkPool,
|
|
start graph.ID,
|
|
lenght int) ([]graph.ID, error) {
|
|
|
|
var path []graph.ID
|
|
var exists bool
|
|
walk := newPersonalizedWalk(start, lenght)
|
|
|
|
for {
|
|
if walk.Reached(lenght) {
|
|
return walk.union, nil
|
|
}
|
|
|
|
if rand.Float64() > walks.Alpha {
|
|
walk.Reset()
|
|
continue
|
|
}
|
|
|
|
path, exists = pool.Next(walk.node)
|
|
switch exists {
|
|
case true:
|
|
// use the pre-computed walk when available
|
|
walk.ongoing.Graft(path)
|
|
walk.Reset()
|
|
|
|
case false:
|
|
// perform one manual step
|
|
follows, err := walker.Follows(ctx, walk.node)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if len(follows) == 0 {
|
|
// found a dandling node, stop
|
|
walk.Reset()
|
|
continue
|
|
}
|
|
|
|
node := randomElement(follows)
|
|
if walk.ongoing.Visits(node) {
|
|
// found a cycle, stop
|
|
walk.Reset()
|
|
continue
|
|
}
|
|
|
|
walk.node = node
|
|
walk.ongoing.Append(node)
|
|
}
|
|
}
|
|
}
|
|
|
|
// frequencyMap returns a map node --> frequency of visits.
|
|
func frequencyMap(path []graph.ID) map[graph.ID]float64 {
|
|
if len(path) == 0 {
|
|
return nil
|
|
}
|
|
|
|
total := len(path)
|
|
freq := 1.0 / float64(total)
|
|
freqs := make(map[graph.ID]float64, total/100)
|
|
|
|
for _, node := range path {
|
|
freqs[node] += freq
|
|
}
|
|
|
|
return freqs
|
|
}
|
|
|
|
// returns a random element of a slice. It panics if the slice is empty or nil.
|
|
func randomElement[S []E, E any](s S) E {
|
|
return s[rand.IntN(len(s))]
|
|
}
|