Mirror of https://github.com/aljazceru/crawler_v2.git (synced 2025-12-17 07:24:21 +01:00)

Commit: rewrite and simplification
31  pkg/graph/graph.go  Normal file
@@ -0,0 +1,31 @@
package graph

type ID string

// Delta represents the changes a Node made to its follow list.
// It Removed some nodes, and Added some others.
// This means the old follow list is Removed + Common, while the new one is Common + Added.
type Delta struct {
    Node    ID
    Removed []ID
    Common  []ID
    Added   []ID
}

func (d Delta) Old() []ID {
    return append(d.Common, d.Removed...)
}

func (d Delta) New() []ID {
    return append(d.Common, d.Added...)
}

// Inverse returns the inverse of the delta. If a delta and its inverse are applied, the graph returns to its original state.
func (d Delta) Inverse() Delta {
    return Delta{
        Node:    d.Node,
        Common:  d.Common,
        Removed: d.Added,
        Added:   d.Removed,
    }
}
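Not part of the commit: a minimal usage sketch of the Delta semantics above, assuming the module path used in the imports below ("github/pippellia-btc/crawler"). The node IDs "alice", "bob", "carol" and "dave" are made up for illustration.

package main

import (
    "fmt"

    "github/pippellia-btc/crawler/pkg/graph"
)

func main() {
    // "alice" unfollows "bob", keeps following "carol", and starts following "dave"
    delta := graph.Delta{
        Node:    "alice",
        Removed: []graph.ID{"bob"},
        Common:  []graph.ID{"carol"},
        Added:   []graph.ID{"dave"},
    }

    fmt.Println(delta.Old())     // [carol bob]  = Common + Removed (the old follow list)
    fmt.Println(delta.New())     // [carol dave] = Common + Added   (the new follow list)
    fmt.Println(delta.Inverse()) // swaps Removed and Added, so applying it undoes the delta
}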
49  pkg/pagerank/pagerank.go  Normal file
@@ -0,0 +1,49 @@
package pagerank

import (
    "context"
    "errors"
    "fmt"
    "github/pippellia-btc/crawler/pkg/graph"
)

var ErrEmptyWalkStore = errors.New("the walk store is empty")

type VisitCounter interface {
    // TotalVisits returns the total number of visits, which is the sum of the lengths of all walks.
    TotalVisits(ctx context.Context) (int, error)

    // Visits returns the number of times each specified node was visited during the walks.
    // The returned slice contains counts in the same order as the input nodes.
    Visits(ctx context.Context, nodes ...graph.ID) ([]int, error)
}

// Global computes the global pagerank score for the specified nodes.
// If a node is not found, its pagerank is assumed to be 0.
func Global(ctx context.Context, count VisitCounter, nodes ...graph.ID) ([]float64, error) {
    if len(nodes) == 0 {
        return nil, nil
    }

    total, err := count.TotalVisits(ctx)
    if err != nil {
        return nil, fmt.Errorf("Global: failed to get the total visits: %w", err)
    }

    if total == 0 {
        return nil, ErrEmptyWalkStore
    }

    visits, err := count.Visits(ctx, nodes...)
    if err != nil {
        return nil, fmt.Errorf("Global: failed to get the node visits: %w", err)
    }

    pageranks := make([]float64, len(visits))
    for i, v := range visits {
        pageranks[i] = float64(v) / float64(total)
    }

    return pageranks, nil
}
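Not part of the commit: Global does nothing more than divide each node's visit count by the total, so a hand-rolled VisitCounter makes the arithmetic explicit. The type fixedCounter below is hypothetical and exists only for illustration.

package main

import (
    "context"
    "fmt"

    "github/pippellia-btc/crawler/pkg/graph"
    "github/pippellia-btc/crawler/pkg/pagerank"
)

// fixedCounter is a hypothetical VisitCounter with hard-coded visit counts.
type fixedCounter map[graph.ID]int

func (c fixedCounter) TotalVisits(ctx context.Context) (int, error) {
    total := 0
    for _, v := range c {
        total += v
    }
    return total, nil
}

func (c fixedCounter) Visits(ctx context.Context, nodes ...graph.ID) ([]int, error) {
    visits := make([]int, len(nodes))
    for i, node := range nodes {
        visits[i] = c[node]
    }
    return visits, nil
}

func main() {
    counter := fixedCounter{"0": 60, "1": 30, "2": 10} // 100 visits in total
    ranks, err := pagerank.Global(context.Background(), counter, "0", "1", "2")
    if err != nil {
        panic(err)
    }
    fmt.Println(ranks) // [0.6 0.3 0.1]
}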
91  pkg/pagerank/utils.go  Normal file
@@ -0,0 +1,91 @@
package pagerank

import (
    "context"
    "github/pippellia-btc/crawler/pkg/graph"
    "github/pippellia-btc/crawler/pkg/walks"
    "math"
    "strconv"
)

type WalkStore struct {
    nextID int
    Walks  map[walks.ID]walks.Walk
}

func NewWalkStore() *WalkStore {
    return &WalkStore{
        Walks: make(map[walks.ID]walks.Walk, 1000),
    }
}

// AddWalks adds the walks, assigning them sequential IDs.
func (s *WalkStore) AddWalks(w []walks.Walk) {
    for _, walk := range w {
        ID := walks.ID(strconv.Itoa(s.nextID))
        s.nextID++

        walk.ID = ID
        s.Walks[ID] = walk
    }
}

// ReplaceWalks reassigns the ID --> walk mapping for the provided walks.
func (s *WalkStore) ReplaceWalks(w []walks.Walk) {
    for _, walk := range w {
        s.Walks[walk.ID] = walk
    }
}

func (s *WalkStore) WalksVisiting(node graph.ID) []walks.Walk {
    visiting := make([]walks.Walk, 0, walks.N)
    for _, walk := range s.Walks {
        if walk.Visits(node) {
            visiting = append(visiting, walk)
        }
    }

    return visiting
}

func (s *WalkStore) TotalVisits(ctx context.Context) (int, error) {
    total := 0
    for _, walk := range s.Walks {
        total += walk.Len()
    }
    return total, nil
}

func (s *WalkStore) Visits(ctx context.Context, nodes ...graph.ID) ([]int, error) {
    if len(nodes) == 0 {
        return nil, nil
    }

    count := make(map[graph.ID]int, len(nodes))
    for _, walk := range s.Walks {
        for _, node := range walk.Path {
            count[node]++
        }
    }

    visits := make([]int, len(nodes))
    for i, node := range nodes {
        visits[i] = count[node]
    }

    return visits, nil
}

// Distance returns the L1 distance between two lists of ranks.
func Distance(r1, r2 []float64) float64 {
    if len(r1) != len(r2) {
        return math.MaxFloat64
    }

    var dist float64
    for i := range r1 {
        dist += math.Abs(r1[i] - r2[i])
    }

    return dist
}
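Not part of the commit: a hypothetical end-to-end sketch that ties the pieces together: generate random walks over a tiny follow graph with walks.Generate, load them into the in-memory WalkStore, and read out the Monte Carlo pagerank with Global. The graph and the reference ranks in the last line are made up for illustration (the reference values are rough hand-computed approximations under this walk model).

package main

import (
    "context"
    "fmt"

    "github/pippellia-btc/crawler/pkg/graph"
    "github/pippellia-btc/crawler/pkg/pagerank"
    "github/pippellia-btc/crawler/pkg/walks"
)

func main() {
    ctx := context.Background()

    // tiny follow graph: 0 follows 1 and 2, 2 follows 1, 1 follows nobody
    walker := walks.NewWalker(map[graph.ID][]graph.ID{
        "0": {"1", "2"},
        "1": {},
        "2": {"1"},
    })
    nodes := []graph.ID{"0", "1", "2"}

    generated, err := walks.Generate(ctx, walker, nodes...)
    if err != nil {
        panic(err)
    }

    store := pagerank.NewWalkStore()
    store.AddWalks(generated)

    ranks, err := pagerank.Global(ctx, store, nodes...)
    if err != nil {
        panic(err)
    }

    fmt.Println(ranks) // Monte Carlo estimate; node "1" should rank highest
    fmt.Println(pagerank.Distance(ranks, []float64{0.20, 0.52, 0.28})) // L1 distance to the approximate hand-computed ranks
}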
34  pkg/walks/utils.go  Normal file
@@ -0,0 +1,34 @@
package walks

import (
    "context"
    "github/pippellia-btc/crawler/pkg/graph"
    "strconv"
)

type MapWalker struct {
    follows map[graph.ID][]graph.ID
}

func NewWalker(m map[graph.ID][]graph.ID) *MapWalker {
    return &MapWalker{follows: m}
}

func (m *MapWalker) Follows(ctx context.Context, node graph.ID) ([]graph.ID, error) {
    return m.follows[node], nil
}

func (m *MapWalker) Update(ctx context.Context, delta graph.Delta) {
    m.follows[delta.Node] = delta.New()
}

func NewCyclicWalker(n int) *MapWalker {
    follows := make(map[graph.ID][]graph.ID, n)
    for i := range n {
        node := graph.ID(strconv.Itoa(i))
        next := graph.ID(strconv.Itoa((i + 1) % n))
        follows[node] = []graph.ID{next}
    }

    return &MapWalker{follows: follows}
}
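Not part of the commit: a short hypothetical sketch of how a MapWalker's follow lists react to a graph.Delta.

package main

import (
    "context"
    "fmt"

    "github/pippellia-btc/crawler/pkg/graph"
    "github/pippellia-btc/crawler/pkg/walks"
)

func main() {
    ctx := context.Background()
    walker := walks.NewCyclicWalker(3) // 0 -> 1 -> 2 -> 0

    before, _ := walker.Follows(ctx, "0")
    fmt.Println(before) // [1]

    // node "0" drops "1" and follows "2" instead
    walker.Update(ctx, graph.Delta{Node: "0", Removed: []graph.ID{"1"}, Added: []graph.ID{"2"}})

    after, _ := walker.Follows(ctx, "0")
    fmt.Println(after) // [2]
}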
273  pkg/walks/walks.go  Normal file
@@ -0,0 +1,273 @@
package walks

import (
    "context"
    "fmt"
    "github/pippellia-btc/crawler/pkg/graph"
    "math/rand/v2"
    "slices"
)

var (
    Alpha = 0.85 // the damping factor
    N     = 100  // the number of walks per node
)

// ID represents how walks are identified in the storage layer
type ID string

// Walk is an ordered list of node IDs
type Walk struct {
    ID   ID
    Path []graph.ID
    // Stop int
}

type Walker interface {
    // Follows returns the follow-list of the specified node, which will be used in
    // generating random walks
    Follows(ctx context.Context, node graph.ID) ([]graph.ID, error)
}

// Len returns the length of the walk
func (w Walk) Len() int {
    return len(w.Path)
}

// Visits returns whether the walk visited node
func (w Walk) Visits(node graph.ID) bool {
    return slices.Contains(w.Path, node)
}

// Index returns the index of node in the walk, or -1 if not present
func (w Walk) Index(node graph.ID) int {
    return slices.Index(w.Path, node)
}

// Copy returns a deep copy of the walk
func (w Walk) Copy() Walk {
    path := make([]graph.ID, len(w.Path))
    copy(path, w.Path)
    return Walk{ID: w.ID, Path: path}
}

// Prune truncates the walk at the specified index (excluded).
// It panics if the index is not within the bounds of the walk
func (w *Walk) Prune(cut int) {
    if cut < 0 || cut > len(w.Path) {
        panic("cut index must be within the bounds of the walk")
    }
    w.Path = w.Path[:cut]
}

// Graft extends the walk by appending a path, removing cycles (if any)
func (w *Walk) Graft(path []graph.ID) {
    w.Path = append(w.Path, path...)
    pos := findCycle(w.Path)
    if pos == -1 {
        return
    }

    w.Path = w.Path[:pos]
}

// Generate creates N random walks for each of the specified nodes, using damping factor Alpha.
// A walk stops early if a cycle is encountered. Walk IDs will be overwritten by the storage layer.
func Generate(ctx context.Context, walker Walker, nodes ...graph.ID) ([]Walk, error) {
    if len(nodes) == 0 {
        return nil, nil
    }

    walks := make([]Walk, 0, N*len(nodes))
    var path []graph.ID
    var err error

    for _, node := range nodes {
        for range N {
            path, err = generate(ctx, walker, node)
            if err != nil {
                return nil, fmt.Errorf("failed to Generate: %w", err)
            }

            walks = append(walks, Walk{Path: path})
        }
    }

    return walks, nil
}

// generate returns a random path of nodes, built by:
// - starting from one of the provided nodes, chosen at random
// - walking along the social graph
// - stopping with probability 1-alpha, on dangling nodes, and on cycles
func generate(ctx context.Context, walker Walker, start ...graph.ID) ([]graph.ID, error) {
    if len(start) == 0 {
        return nil, nil
    }

    node := randomElement(start)
    path := make([]graph.ID, 0, averageLength(Alpha))
    path = append(path, node)

    for {
        if rand.Float64() > Alpha {
            break
        }

        follows, err := walker.Follows(ctx, node)
        if err != nil {
            return nil, err
        }

        if len(follows) == 0 {
            // found a dangling node, stop
            break
        }

        node = randomElement(follows)
        if slices.Contains(path, node) {
            // found a cycle, stop
            break
        }

        path = append(path, node)
    }

    return path, nil
}

// ToRemove returns the IDs of the walks that need to be removed.
// It returns an error if the number of walks to remove differs from the expected [N].
func ToRemove(node graph.ID, walks []Walk) ([]ID, error) {
    toRemove := make([]ID, 0, N)

    for _, walk := range walks {
        if walk.Index(node) != -1 {
            toRemove = append(toRemove, walk.ID)
        }
    }

    if len(toRemove) != N {
        return toRemove, fmt.Errorf("walks to be removed (%d) differ from the expected number (%d)", len(toRemove), N)
    }

    return toRemove, nil
}

// ToUpdate returns the updated versions of the walks affected by the delta.
func ToUpdate(ctx context.Context, walker Walker, delta graph.Delta, walks []Walk) ([]Walk, error) {
    toUpdate := make([]Walk, 0, expectedUpdates(walks, delta))
    resampleProbability := resampleProbability(delta)

    var pos int
    var isInvalid, shouldResample bool

    for _, walk := range walks {
        pos = walk.Index(delta.Node)
        if pos == -1 {
            // the walk doesn't visit node, skip
            continue
        }

        shouldResample = rand.Float64() < resampleProbability
        isInvalid = (pos < walk.Len()-1) && slices.Contains(delta.Removed, walk.Path[pos+1])

        switch {
        case shouldResample:
            // prune and graft with the added nodes to avoid oversampling of common nodes
            updated := walk.Copy()
            updated.Prune(pos + 1)

            if rand.Float64() < Alpha {
                new, err := generate(ctx, walker, delta.Added...)
                if err != nil {
                    return nil, fmt.Errorf("ToUpdate: failed to generate new segment: %w", err)
                }

                updated.Graft(new)
            }

            toUpdate = append(toUpdate, updated)

        case isInvalid:
            // prune and graft invalid steps with the common nodes
            updated := walk.Copy()
            updated.Prune(pos + 1)

            new, err := generate(ctx, walker, delta.Common...)
            if err != nil {
                return nil, fmt.Errorf("ToUpdate: failed to generate new segment: %w", err)
            }

            updated.Graft(new)
            toUpdate = append(toUpdate, updated)
        }
    }

    return toUpdate, nil
}

// resampleProbability is the probability that a walk needs to be changed to avoid oversampling the common nodes.
// Consider the simple graph 0 -> 1: all the walks that continue from 0 will reach 1.
// Now imagine 0 added 2 and 3 to its successors.
// Our goal is to have 1/3 of the continuing walks go to each of 1, 2 and 3,
// which means we have to re-do 2/3 of the walks and make them continue towards 2 or 3.
func resampleProbability(delta graph.Delta) float64 {
    if len(delta.Added) == 0 {
        return 0
    }

    c := float64(len(delta.Common))
    a := float64(len(delta.Added))
    return a / (a + c)
}

// expectedUpdates estimates how many of the walks will be affected by the delta.
func expectedUpdates(walks []Walk, delta graph.Delta) int {
    if len(delta.Common) == 0 {
        // no nodes have remained, all walks must be re-computed
        return len(walks)
    }

    r := float64(len(delta.Removed))
    c := float64(len(delta.Common))
    a := float64(len(delta.Added))

    invalidProbability := Alpha * r / (r + c)
    resampleProbability := a / (a + c)
    updateProbability := invalidProbability + resampleProbability - invalidProbability*resampleProbability
    expectedUpdates := float64(len(walks)) * updateProbability
    return int(expectedUpdates + 0.5)
}

// randomElement returns a random element of a slice. It panics if the slice is empty or nil.
func randomElement[S []E, E any](s S) E {
    return s[rand.IntN(len(s))]
}

// findCycle returns the position of the first repetition in a slice. If there are no cycles, -1 is returned
func findCycle[S []K, K comparable](s S) int {
    seen := make(map[K]struct{})
    for i, e := range s {
        if _, ok := seen[e]; ok {
            return i
        }

        seen[e] = struct{}{}
    }

    return -1
}

// averageLength returns the expected length of a walk with damping factor alpha.
func averageLength(alpha float64) int {
    switch {
    case alpha < 0 || alpha > 1:
        panic("alpha must be between 0 and 1")

    case alpha == 1:
        // this case should only happen in tests, so return a default value
        return 100

    default:
        return int(1.0/(1-alpha) + 0.5)
    }
}
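Not part of the commit: a worked example of the update probabilities above, mirroring the internal formulas for the scenario described in the resampleProbability comment (node 0 keeps one successor and adds two).

package main

import (
    "fmt"

    "github/pippellia-btc/crawler/pkg/graph"
)

func main() {
    // 0 previously followed only 1; it keeps 1 and adds 2 and 3
    delta := graph.Delta{Node: "0", Common: []graph.ID{"1"}, Added: []graph.ID{"2", "3"}}

    a := float64(len(delta.Added))   // 2
    c := float64(len(delta.Common))  // 1
    r := float64(len(delta.Removed)) // 0
    alpha := 0.85

    resample := a / (a + c)        // 2/3: two thirds of the continuing walks are re-routed towards 2 or 3
    invalid := alpha * r / (r + c) // 0: nothing was removed, so no walk step became invalid
    update := invalid + resample - invalid*resample

    fmt.Println(resample, invalid, update) // 0.666..., 0, 0.666...
}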
121  pkg/walks/walks_test.go  Normal file
@@ -0,0 +1,121 @@
package walks

import (
    "context"
    "fmt"
    "github/pippellia-btc/crawler/pkg/graph"
    "math"
    "reflect"
    "strconv"
    "testing"
)

func TestGenerate(t *testing.T) {
    t.Run("cyclic stop", func(t *testing.T) {
        Alpha = 1 // so walks only stop on cycles or dangling nodes
        walker := NewCyclicWalker(3)
        expected := Walk{Path: []graph.ID{"0", "1", "2"}}

        walks, err := Generate(context.Background(), walker, "0")
        if err != nil {
            t.Fatalf("expected nil error, got %v", err)
        }

        for _, walk := range walks {
            if !reflect.DeepEqual(walk, expected) {
                t.Fatalf("expected walk %v, got %v", expected, walk)
            }
        }
    })

    t.Run("average length", func(t *testing.T) {
        maxError := 0.1
        Alpha = 0.85
        N = 10000

        walker := NewCyclicWalker(1000)
        expectedLength := 1.0 / (1.0 - Alpha)

        walks, err := Generate(context.Background(), walker, "0")
        if err != nil {
            t.Fatalf("expected nil, got %v", err)
        }

        sumLength := 0
        for _, walk := range walks {
            sumLength += len(walk.Path)
        }

        averageLength := float64(sumLength) / float64(N)
        if math.Abs(averageLength-expectedLength) > maxError {
            t.Fatalf("expected average length %f, got %f", expectedLength, averageLength)
        }
    })
}

func TestUpdateRemove(t *testing.T) {
    walker := NewWalker(map[graph.ID][]graph.ID{
        "0": {"3"},
        "1": {"2"},
        "2": {"0"},
        "3": {"2"},
    })

    delta := graph.Delta{
        Node:    "0",
        Removed: []graph.ID{"1"}, // the old follows were "1" and "3"
        Common:  []graph.ID{"3"},
    }

    walks := []Walk{
        {ID: "0", Path: []graph.ID{"0", "1", "2"}}, // this walk is invalid
        {ID: "1", Path: []graph.ID{"0", "3", "2"}},
    }

    Alpha = 1 // avoid early stopping, which makes the test deterministic
    expected := []Walk{{ID: "0", Path: []graph.ID{"0", "3", "2"}}}

    toUpdate, err := ToUpdate(context.Background(), walker, delta, walks)
    if err != nil {
        t.Fatalf("expected nil, got %v", err)
    }

    if !reflect.DeepEqual(toUpdate, expected) {
        t.Errorf("expected %v, got %v", expected, toUpdate)
    }
}

func TestFindCycle(t *testing.T) {
    tests := []struct {
        list     []graph.ID
        expected int
    }{
        {list: []graph.ID{"0", "1", "2", "3", "4", "5"}, expected: -1},
        {list: []graph.ID{"0", "1", "2", "3", "1", "5"}, expected: 4},
        {list: []graph.ID{"0", "1", "2", "3", "1", "0"}, expected: 4},
        {list: []graph.ID{"0", "1", "3", "3", "4", "5"}, expected: 3},
    }

    for _, test := range tests {
        if pos := findCycle(test.list); pos != test.expected {
            t.Fatalf("list %v; expected %d, got %d", test.list, test.expected, pos)
        }
    }
}

func BenchmarkFindCycle(b *testing.B) {
    sizes := []int{10, 100, 1000}
    for _, size := range sizes {
        b.Run(fmt.Sprintf("size=%d", size), func(b *testing.B) {
            path := make([]graph.ID, size)
            for i := range size {
                path[i] = graph.ID(strconv.Itoa(i))
            }

            b.ResetTimer()
            for range b.N {
                findCycle(path)
            }
        })
    }
}
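Why the "average length" subtest expects 1/(1 - Alpha): on a cycle of 1000 nodes the chance of wrapping all the way around is negligible, so a walk almost never stops on a cycle or a dangling node. Its length is then geometric: the start node is always included and each further step is taken with probability Alpha, so P(L = k) = Alpha^(k-1) * (1 - Alpha) and E[L] = sum over k >= 1 of k * Alpha^(k-1) * (1 - Alpha) = 1/(1 - Alpha), which is about 6.67 for Alpha = 0.85. This is also the value averageLength(alpha) rounds to when pre-allocating path capacity.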
128  tests/random/pagerank_test.go  Normal file
@@ -0,0 +1,128 @@
package random_test

import (
    "context"
    "github/pippellia-btc/crawler/pkg/pagerank"
    "github/pippellia-btc/crawler/pkg/walks"
    "math/rand/v2"
    "testing"
)

func TestPagerankStatic(t *testing.T) {
    expectedDistance := 0.01
    walks.Alpha = 0.85
    walks.N = 5000

    tests := []struct {
        name string
        Setup
    }{
        {name: "all dangling nodes", Setup: Danglings(11)},
        {name: "triangle graph", Setup: Triangle},
        {name: "long cycle", Setup: Cyclic(30)},
        {name: "acyclic graph 1", Setup: Acyclic1},
        {name: "acyclic graph 2", Setup: Acyclic2},
        {name: "acyclic graph 3", Setup: Acyclic3},
        {name: "acyclic graph 4", Setup: Acyclic4},
        {name: "acyclic graph 5", Setup: Acyclic5},
        {name: "acyclic graph 6", Setup: Acyclic6},
        {name: "acyclic graph 7", Setup: Acyclic7},
    }

    for _, test := range tests {
        t.Run(test.name, func(t *testing.T) {
            ctx := context.Background()
            store := pagerank.NewWalkStore()

            walks, err := walks.Generate(ctx, test.walker, test.nodes...)
            if err != nil {
                t.Fatalf("failed to generate the walks: %v", err)
            }
            store.AddWalks(walks)

            ranks, err := pagerank.Global(ctx, store, test.nodes...)
            if err != nil {
                t.Fatalf("expected nil, got %v", err)
            }

            distance := pagerank.Distance(ranks, test.ranks)
            if distance > expectedDistance {
                t.Errorf("expected distance %f, got %f\n", expectedDistance, distance)
                t.Errorf("expected ranks %v, got %v", test.ranks, ranks)
            }
        })
    }
}

/*
It is a known phenomenon that [walks.ToUpdate] does not return correct results
when the probability of cycles involving node --> removed is high.

Therefore, we only test with acyclic graphs, or with graphs large enough that the
probability of such cycles is very low.
*/
func TestPagerankDynamic(t *testing.T) {
    expectedDistance := 0.01
    walks.Alpha = 0.85
    walks.N = 5000

    tests := []struct {
        name string
        Setup
    }{
        {name: "all dangling nodes", Setup: Danglings(11)},
        {name: "long cycle", Setup: Cyclic(50)},
        {name: "acyclic graph 1", Setup: Acyclic1},
        {name: "acyclic graph 2", Setup: Acyclic2},
        {name: "acyclic graph 3", Setup: Acyclic3},
        {name: "acyclic graph 4", Setup: Acyclic4},
        {name: "acyclic graph 5", Setup: Acyclic5},
        {name: "acyclic graph 6", Setup: Acyclic6},
        {name: "acyclic graph 7", Setup: Acyclic7},
    }

    for _, test := range tests {
        t.Run(test.name, func(t *testing.T) {
            ctx := context.Background()
            store := pagerank.NewWalkStore()

            // apply a random delta to the graph
            delta := randomElement(test.deltas)
            test.walker.Update(ctx, delta)

            rwalks, err := walks.Generate(ctx, test.walker, test.nodes...)
            if err != nil {
                t.Fatalf("failed to generate the walks: %v", err)
            }

            store.AddWalks(rwalks)
            rwalks = store.WalksVisiting(delta.Node)

            // apply the opposite delta, returning to the original state
            inv := delta.Inverse()
            test.walker.Update(ctx, inv)

            toUpdate, err := walks.ToUpdate(ctx, test.walker, inv, rwalks)
            if err != nil {
                t.Fatalf("failed to update the walks: %v", err)
            }
            store.ReplaceWalks(toUpdate)

            ranks, err := pagerank.Global(ctx, store, test.nodes...)
            if err != nil {
                t.Fatalf("expected nil, got %v", err)
            }

            distance := pagerank.Distance(ranks, test.ranks)
            if distance > expectedDistance {
                t.Errorf("inverse delta %v; expected distance %f, got %f\n", inv, expectedDistance, distance)
                t.Errorf("expected ranks %v,\n got %v", test.ranks, ranks)
            }
        })
    }
}

// randomElement returns a random element of a slice. It panics if the slice is empty or nil.
func randomElement[S []E, E any](s S) E {
    return s[rand.IntN(len(s))]
}
284  tests/random/utils_test.go  Normal file
@@ -0,0 +1,284 @@
package random_test

import (
    "github/pippellia-btc/crawler/pkg/graph"
    "github/pippellia-btc/crawler/pkg/walks"
    "strconv"
)

type Setup struct {
    walker *walks.MapWalker
    nodes  []graph.ID
    ranks  []float64
    deltas []graph.Delta
}

// Danglings returns a setup consisting of n dangling nodes
func Danglings(n int) Setup {
    nodes := make([]graph.ID, n)
    ranks := make([]float64, n)

    added := make([]graph.ID, 0, n-1)
    deltas := make([]graph.Delta, 0, n-1)

    for i := range n {
        node := graph.ID(strconv.Itoa(i))
        nodes[i] = node
        ranks[i] = 1.0 / float64(n)

        if i > 0 {
            // all the possible deltas modulo graph isomorphism: 0 --> [1, 2, ... k] for 1 <= k <= n-1
            added = append(added, node)
            deltas = append(deltas, graph.Delta{Node: "0", Added: added})
        }
    }

    return Setup{
        walker: walks.NewWalker(make(map[graph.ID][]graph.ID)),
        nodes:  nodes,
        ranks:  ranks,
        deltas: deltas,
    }
}

// Cyclic returns a setup consisting of a single cycle of n nodes.
func Cyclic(n int) Setup {
    mid := graph.ID(strconv.Itoa(n / 2))
    nodes := make([]graph.ID, n)
    ranks := make([]float64, n)

    for i := range n {
        nodes[i] = graph.ID(strconv.Itoa(i))
        ranks[i] = 1.0 / float64(n)
    }

    return Setup{
        walker: walks.NewCyclicWalker(n),
        nodes:  nodes,
        ranks:  ranks,
        deltas: []graph.Delta{
            {Node: "0", Removed: []graph.ID{"1"}},
            {Node: "0", Common: []graph.ID{"1"}, Added: []graph.ID{mid}},
            {Node: "0", Removed: []graph.ID{"1"}, Added: []graph.ID{mid}},
        },
    }
}

var Triangle = Cyclic(3)

var Acyclic1 = Setup{
    walker: walks.NewWalker(map[graph.ID][]graph.ID{
        "0": {"1", "2"},
        "1": {},
        "2": {"3"},
        "3": {"1"},
        "4": {},
    }),
    nodes: []graph.ID{"0", "1", "2", "3", "4"},
    ranks: []float64{0.11185, 0.36950, 0.15943, 0.24736, 0.11185},
    deltas: []graph.Delta{
        // removals
        {Node: "0", Removed: []graph.ID{"1"}, Common: []graph.ID{"2"}},
        {Node: "0", Removed: []graph.ID{"2"}, Common: []graph.ID{"1"}},
        {Node: "0", Removed: []graph.ID{"1", "2"}},
        {Node: "2", Removed: []graph.ID{"3"}},
        {Node: "3", Removed: []graph.ID{"1"}},
        // additions
        {Node: "0", Common: []graph.ID{"1", "2"}, Added: []graph.ID{"3"}},
        {Node: "0", Common: []graph.ID{"1", "2"}, Added: []graph.ID{"4"}},
        {Node: "0", Common: []graph.ID{"1", "2"}, Added: []graph.ID{"3", "4"}},
        {Node: "4", Added: []graph.ID{"0"}},
        {Node: "4", Added: []graph.ID{"1"}},
        {Node: "4", Added: []graph.ID{"2"}},
        {Node: "4", Added: []graph.ID{"3"}},
        {Node: "4", Added: []graph.ID{"1", "2"}},
        {Node: "4", Added: []graph.ID{"2", "3"}},
        {Node: "4", Added: []graph.ID{"3", "4"}},
        {Node: "4", Added: []graph.ID{"0", "1", "2"}},
        {Node: "4", Added: []graph.ID{"0", "1", "2", "3"}},
        // removals and additions
        {Node: "0", Removed: []graph.ID{"1"}, Common: []graph.ID{"2"}, Added: []graph.ID{"4"}},
        {Node: "0", Removed: []graph.ID{"1"}, Common: []graph.ID{"2"}, Added: []graph.ID{"3"}},
        {Node: "0", Removed: []graph.ID{"1", "2"}, Added: []graph.ID{"3"}},
        {Node: "0", Removed: []graph.ID{"1", "2"}, Added: []graph.ID{"4"}},
        {Node: "0", Removed: []graph.ID{"1", "2"}, Added: []graph.ID{"3", "4"}},
        {Node: "2", Removed: []graph.ID{"3"}, Added: []graph.ID{"1"}},
        {Node: "2", Removed: []graph.ID{"3"}, Added: []graph.ID{"4"}},
        {Node: "2", Removed: []graph.ID{"3"}, Added: []graph.ID{"1", "4"}},
    },
}

var Acyclic2 = Setup{
    walker: walks.NewWalker(map[graph.ID][]graph.ID{
        "0": {"1", "2"},
        "1": {},
        "2": {},
        "3": {},
        "4": {"3", "5"},
        "5": {},
    }),
    nodes: []graph.ID{"0", "1", "2", "3", "4", "5"},
    ranks: []float64{0.12987, 0.18506, 0.18506, 0.18506, 0.12987, 0.18506},
    deltas: []graph.Delta{
        // removals
        {Node: "0", Removed: []graph.ID{"1"}, Common: []graph.ID{"2"}},
        {Node: "0", Removed: []graph.ID{"1", "2"}},
        // additions
        {Node: "0", Common: []graph.ID{"1", "2"}, Added: []graph.ID{"3"}},
        {Node: "0", Common: []graph.ID{"1", "2"}, Added: []graph.ID{"4"}},
        {Node: "0", Common: []graph.ID{"1", "2"}, Added: []graph.ID{"3", "4"}},
        {Node: "0", Common: []graph.ID{"1", "2"}, Added: []graph.ID{"3", "5"}},
        {Node: "0", Common: []graph.ID{"1", "2"}, Added: []graph.ID{"3", "4", "5"}},
        // removals and additions
        {Node: "0", Removed: []graph.ID{"1"}, Common: []graph.ID{"2"}, Added: []graph.ID{"3"}},
        {Node: "0", Removed: []graph.ID{"1"}, Common: []graph.ID{"2"}, Added: []graph.ID{"4"}},
        {Node: "0", Removed: []graph.ID{"1"}, Common: []graph.ID{"2"}, Added: []graph.ID{"3", "4"}},
        {Node: "0", Removed: []graph.ID{"1"}, Common: []graph.ID{"2"}, Added: []graph.ID{"3", "5"}},
        {Node: "0", Removed: []graph.ID{"1"}, Common: []graph.ID{"2"}, Added: []graph.ID{"3", "4", "5"}},
    },
}

var Acyclic3 = Setup{
    walker: walks.NewWalker(map[graph.ID][]graph.ID{
        "0": {"1", "2"},
        "1": {},
        "2": {},
        "3": {"1", "2"},
    }),
    nodes: []graph.ID{"0", "1", "2", "3"},
    ranks: []float64{0.17544, 0.32456, 0.32456, 0.17544},
    deltas: []graph.Delta{
        // removals
        {Node: "0", Removed: []graph.ID{"1"}, Common: []graph.ID{"2"}},
        {Node: "0", Removed: []graph.ID{"1", "2"}},
        // additions
        {Node: "0", Common: []graph.ID{"1", "2"}, Added: []graph.ID{"3"}},
        {Node: "2", Added: []graph.ID{"1"}},
        // removals and additions
        {Node: "0", Removed: []graph.ID{"1"}, Common: []graph.ID{"2"}, Added: []graph.ID{"3"}},
        {Node: "0", Removed: []graph.ID{"1", "2"}, Added: []graph.ID{"3"}},
    },
}

var Acyclic4 = Setup{
    walker: walks.NewWalker(map[graph.ID][]graph.ID{
        "0": {"1", "2"},
        "1": {},
        "2": {},
        "3": {"1"},
    }),
    nodes: []graph.ID{"0", "1", "2", "3"},
    ranks: []float64{0.17544, 0.39912, 0.25, 0.17544},
    deltas: []graph.Delta{
        // removals
        {Node: "0", Removed: []graph.ID{"1"}, Common: []graph.ID{"2"}},
        {Node: "0", Removed: []graph.ID{"1", "2"}},
        {Node: "3", Removed: []graph.ID{"1"}},
        // additions
        {Node: "0", Common: []graph.ID{"1", "2"}, Added: []graph.ID{"3"}},
        {Node: "2", Added: []graph.ID{"1"}},
        {Node: "2", Added: []graph.ID{"3"}},
        {Node: "3", Common: []graph.ID{"1"}, Added: []graph.ID{"0"}},
        // removals and additions
        {Node: "0", Removed: []graph.ID{"1"}, Common: []graph.ID{"2"}, Added: []graph.ID{"3"}},
        {Node: "0", Removed: []graph.ID{"1", "2"}, Added: []graph.ID{"3"}},
        {Node: "3", Removed: []graph.ID{"1"}, Added: []graph.ID{"0"}},
        {Node: "3", Removed: []graph.ID{"1"}, Added: []graph.ID{"0", "2"}},
    },
}

var Acyclic5 = Setup{
    walker: walks.NewWalker(map[graph.ID][]graph.ID{
        "0": {"3"},
        "1": {"0"},
        "2": {},
        "3": {"2"},
    }),
    nodes: []graph.ID{"0", "1", "2", "3"},
    ranks: []float64{0.21489, 0.11616, 0.37015, 0.29881},
    deltas: []graph.Delta{
        // removals
        {Node: "0", Removed: []graph.ID{"3"}},
        {Node: "1", Removed: []graph.ID{"0"}},
        {Node: "3", Removed: []graph.ID{"2"}},
        // additions
        {Node: "0", Common: []graph.ID{"3"}, Added: []graph.ID{"2"}},
        {Node: "1", Common: []graph.ID{"0"}, Added: []graph.ID{"2"}},
        {Node: "1", Common: []graph.ID{"0"}, Added: []graph.ID{"3"}},
        {Node: "1", Common: []graph.ID{"0"}, Added: []graph.ID{"2", "3"}},
        // removals and additions
        {Node: "0", Removed: []graph.ID{"3"}, Added: []graph.ID{"2"}},
        {Node: "1", Removed: []graph.ID{"0"}, Added: []graph.ID{"2"}},
        {Node: "1", Removed: []graph.ID{"0"}, Added: []graph.ID{"3"}},
        {Node: "1", Removed: []graph.ID{"0"}, Added: []graph.ID{"2", "3"}},
    },
}

var Acyclic6 = Setup{
    walker: walks.NewWalker(map[graph.ID][]graph.ID{
        "0": {"4"},
        "1": {"0"},
        "2": {},
        "3": {"1", "4"},
        "4": {"2"},
    }),
    nodes: []graph.ID{"0", "1", "2", "3", "4"},
    ranks: []float64{0.18820, 0.12128, 0.32417, 0.08511, 0.28125},
    deltas: []graph.Delta{
        // removals
        {Node: "0", Removed: []graph.ID{"4"}},
        {Node: "1", Removed: []graph.ID{"0"}},
        {Node: "3", Removed: []graph.ID{"1"}, Common: []graph.ID{"4"}},
        {Node: "3", Removed: []graph.ID{"4"}, Common: []graph.ID{"1"}},
        {Node: "3", Removed: []graph.ID{"1", "4"}},
        {Node: "4", Removed: []graph.ID{"2"}},
        // additions
        {Node: "0", Common: []graph.ID{"4"}, Added: []graph.ID{"2"}},
        {Node: "1", Common: []graph.ID{"0"}, Added: []graph.ID{"2"}},
        {Node: "1", Common: []graph.ID{"0"}, Added: []graph.ID{"4"}},
        {Node: "1", Common: []graph.ID{"0"}, Added: []graph.ID{"2", "4"}},
        {Node: "3", Common: []graph.ID{"1", "4"}, Added: []graph.ID{"0"}},
        {Node: "3", Common: []graph.ID{"1", "4"}, Added: []graph.ID{"2"}},
        {Node: "3", Common: []graph.ID{"1", "4"}, Added: []graph.ID{"0", "2"}},
        // removals and additions
        {Node: "0", Removed: []graph.ID{"4"}, Added: []graph.ID{"2"}},
        {Node: "1", Removed: []graph.ID{"0"}, Added: []graph.ID{"2"}},
        {Node: "1", Removed: []graph.ID{"0"}, Added: []graph.ID{"4"}},
        {Node: "1", Removed: []graph.ID{"0"}, Added: []graph.ID{"2", "4"}},
        {Node: "3", Removed: []graph.ID{"1"}, Common: []graph.ID{"4"}, Added: []graph.ID{"0"}},
        {Node: "3", Removed: []graph.ID{"1"}, Common: []graph.ID{"4"}, Added: []graph.ID{"2"}},
        {Node: "3", Removed: []graph.ID{"1"}, Common: []graph.ID{"4"}, Added: []graph.ID{"0", "2"}},
        {Node: "3", Removed: []graph.ID{"4"}, Common: []graph.ID{"1"}, Added: []graph.ID{"0"}},
        {Node: "3", Removed: []graph.ID{"4"}, Common: []graph.ID{"1"}, Added: []graph.ID{"2"}},
        {Node: "3", Removed: []graph.ID{"4"}, Common: []graph.ID{"1"}, Added: []graph.ID{"0", "2"}},
        {Node: "3", Removed: []graph.ID{"1", "4"}, Added: []graph.ID{"0"}},
        {Node: "3", Removed: []graph.ID{"1", "4"}, Added: []graph.ID{"2"}},
        {Node: "3", Removed: []graph.ID{"1", "4"}, Added: []graph.ID{"0", "2"}},
    },
}

var Acyclic7 = Setup{
    walker: walks.NewWalker(map[graph.ID][]graph.ID{
        "0": {"1", "2", "3"},
        "1": {},
        "2": {},
        "3": {},
        "4": {"0", "1", "2", "3"},
    }),
    nodes: []graph.ID{"0", "1", "2", "3", "4"},
    ranks: []float64{0.17622, 0.22615, 0.22615, 0.22615, 0.14534},
    deltas: []graph.Delta{
        // removals
        {Node: "0", Removed: []graph.ID{"1"}, Common: []graph.ID{"2", "3"}},
        {Node: "0", Removed: []graph.ID{"1", "2"}, Common: []graph.ID{"3"}},
        {Node: "0", Removed: []graph.ID{"1", "2", "3"}},
        {Node: "4", Removed: []graph.ID{"0"}, Common: []graph.ID{"1", "2", "3"}},
        {Node: "4", Removed: []graph.ID{"1"}, Common: []graph.ID{"0", "2", "3"}},
        {Node: "4", Removed: []graph.ID{"1", "2"}, Common: []graph.ID{"0", "3"}},
        {Node: "4", Removed: []graph.ID{"1", "2", "3"}, Common: []graph.ID{"0"}},
        {Node: "4", Removed: []graph.ID{"0", "1", "2", "3"}},
        // additions
        {Node: "1", Added: []graph.ID{"2"}},
        {Node: "1", Added: []graph.ID{"2", "3"}},
    },
}
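Not part of the commit, and an assumption about how the hard-coded ranks above could have been produced: under the walk model of pkg/walks (every node is a walk start, each step continues with probability Alpha, and walks stop at dangling nodes), the expected visit counts solve v = start + Alpha * P^T v. A small hypothetical fixed-point sketch, checked here against Acyclic1:

package main

import "fmt"

// referenceRanks iterates v = start + alpha * P^T v (dangling nodes simply stop the walk)
// and normalizes the result, so ranks[i] is the expected fraction of all visits.
func referenceRanks(follows map[string][]string, nodes []string, alpha float64) []float64 {
    n := float64(len(nodes))
    visits := make(map[string]float64, len(nodes))
    for _, node := range nodes {
        visits[node] = 1 / n
    }

    for iter := 0; iter < 1000; iter++ {
        next := make(map[string]float64, len(nodes))
        for _, node := range nodes {
            next[node] = 1 / n // every walk starts exactly once per node (normalized)
        }
        for node, succ := range follows {
            if len(succ) == 0 {
                continue // dangling node: the walk stops here
            }
            share := alpha * visits[node] / float64(len(succ))
            for _, s := range succ {
                next[s] += share
            }
        }
        visits = next
    }

    total := 0.0
    for _, node := range nodes {
        total += visits[node]
    }

    ranks := make([]float64, len(nodes))
    for i, node := range nodes {
        ranks[i] = visits[node] / total
    }
    return ranks
}

func main() {
    // the Acyclic1 graph from above
    follows := map[string][]string{"0": {"1", "2"}, "1": {}, "2": {"3"}, "3": {"1"}, "4": {}}
    nodes := []string{"0", "1", "2", "3", "4"}
    fmt.Println(referenceRanks(follows, nodes, 0.85)) // roughly [0.11 0.37 0.16 0.25 0.11], close to Acyclic1.ranks
}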