Files
crawler_v2/tests/e2e/e2e_test.go
2025-06-12 15:55:29 +02:00

169 lines
4.1 KiB
Go

package e2e_test
import (
"context"
"fmt"
"math"
"testing"
"github.com/vertex-lab/crawler_v2/pkg/graph"
"github.com/vertex-lab/crawler_v2/pkg/pagerank"
"github.com/vertex-lab/crawler_v2/pkg/redb"
"github.com/vertex-lab/crawler_v2/pkg/walks"
test "github.com/vertex-lab/crawler_v2/tests/random"
"github.com/redis/go-redis/v9"
)
// ctx is the background context passed to the database, walk-generation and
// pagerank calls made by the tests below.
var ctx = context.Background()
// TestWalks scans the walks stored in Redis, one batch per iteration, until the
// scan cursor returns to zero. For every batch it verifies the walks are
// consistent: for each node on a walk's path, the set of walks visiting that
// node must contain the walk's ID.
func TestWalks(t *testing.T) {
	fmt.Println("-----------------------------")
	fmt.Println("Testing the walks consistency")
	fmt.Print("-----------------------------\n\n")

	const batchSize = 10000
	db := redb.New(&redis.Options{Addr: "localhost:6379"})

	var cursor uint64
	for iteration := 1; ; iteration++ {
		// move the terminal cursor up one line and erase from there on, so
		// the progress message overwrites the previous one
		fmt.Print("\033[1A")
		fmt.Print("\033[J")
		fmt.Printf("iteration %d...\n", iteration)

		batch, next, err := db.ScanWalks(ctx, cursor, batchSize)
		if err != nil {
			t.Fatal(err)
		}
		cursor = next

		// queue one membership check per (node, walk) pair and send them all
		// to Redis in a single round trip
		pipe := db.Client.Pipeline()
		membership := make(map[string]*redis.BoolCmd)
		for _, walk := range batch {
			walkID := string(walk.ID)
			for _, node := range walk.Path {
				visiting := redb.KeyWalksVisitingPrefix + string(node)
				membership[string(node)+":"+walkID] = pipe.SIsMember(ctx, visiting, walk.ID)
			}
		}

		if _, err := pipe.Exec(ctx); err != nil {
			t.Fatalf("pipeline failed: %v", err)
		}

		for pair, cmd := range membership {
			if cmd.Val() {
				continue
			}
			t.Errorf("expected true, got %v: %v", cmd.Val(), pair)
		}

		// a zero cursor signals that the scan is complete
		if cursor == 0 {
			break
		}
	}

	fmt.Println("passed!")
	fmt.Println("-----------------------------")
}
// TestPagerank regenerates all walks for all active nodes and recomputes the
// global pagerank from them. The resulting distribution is compared with the
// one computed from Redis: the test fails if the distance between the two
// exceeds the tolerance returned by expectedDistance.
//
// The test also fails when no node is active. In that case the tolerance would
// be +Inf or NaN, so the distance check could never fail and the test would
// pass without verifying anything.
func TestPagerank(t *testing.T) {
	fmt.Println("---------------------------------")
	fmt.Println("Testing the pagerank distribution")

	db := redb.New(&redis.Options{Addr: "localhost:6379"})
	nodes, err := db.AllNodes(ctx)
	if err != nil {
		t.Fatal(err)
	}

	original, err := pagerank.Global(ctx, db, nodes...)
	if err != nil {
		t.Fatal(err)
	}
	fmt.Println(" > original pagerank computed")

	// copy the db into a map to speed up random walks generation
	followMap := make(map[graph.ID][]graph.ID, len(nodes))
	for _, node := range nodes {
		follows, err := db.Follows(ctx, node)
		if err != nil {
			t.Fatal(err)
		}
		followMap[node] = follows
	}

	walker := walks.NewSimpleWalker(followMap)
	store := test.NewWalkStore()
	fmt.Println(" > db copied")
	fmt.Printf(" > generating walks...\n")
	fmt.Printf("---------------------------------\n\n")

	var active int
	for i, id := range nodes {
		if i%1000 == 0 {
			// move the terminal cursor up one line and erase from there on,
			// so the progress message overwrites the previous one
			fmt.Printf("\033[1A")
			fmt.Print("\033[J")
			fmt.Printf("progress %d/%d...\n", i+1, len(nodes))
		}

		node, err := db.NodeByID(ctx, id)
		if err != nil {
			t.Fatal(err)
		}

		// only active nodes contribute walks
		if node.Status != graph.StatusActive {
			continue
		}

		// named `generated` to avoid shadowing the imported walks package
		generated, err := walks.Generate(ctx, walker, id)
		if err != nil {
			t.Fatal(err)
		}
		store.AddWalks(generated)
		active++
	}

	// without active nodes there are no walks, and the tolerance below would
	// be +Inf or NaN: fail loudly instead of passing vacuously
	if active == 0 {
		t.Fatalf("no active nodes found (%d nodes in total): nothing to compare", len(nodes))
	}

	recomputed, err := pagerank.Global(ctx, store, nodes...)
	if err != nil {
		t.Fatal(err)
	}

	expected := expectedDistance(active, len(nodes))
	distance := test.Distance(original, recomputed)
	fmt.Printf("expected distance %f, got %f\n", expected, distance)

	if distance > expected {
		t.Fatalf("distance is higher than expected!")
	}

	fmt.Println("passed!")
	fmt.Println("-----------------------------")
}
// expectedDistance returns the expected distance between the real pagerank and
// the Monte-Carlo pagerank.
//
// Such distance goes as ~N/sqrt(R), where N is the number of nodes (totalNodes)
// and R is the number of walks, computed here as activeNodes * walks.N.
// When activeNodes is 0 the division is by zero and the result is +Inf
// (or NaN if totalNodes is 0 as well).
//
// References:
//
//	[1] K. Avrachenkov, N. Litvak, D. Nemirovsky, N. Osipova;
//	    "Monte Carlo methods in PageRank computation"
//	    URL: https://www-sop.inria.fr/members/Konstantin.Avratchenkov/pubs/mc.pdf
func expectedDistance(activeNodes, totalNodes int) float64 {
	const errorConstant = 0.00035 // empirically derived

	walkCount := float64(activeNodes * walks.N)
	scaled := errorConstant * float64(totalNodes)
	return scaled / math.Sqrt(walkCount)
}