crawler_v2/tests/e2e/e2e_test.go

package e2e_test

import (
	"context"
	"fmt"
	"math"
	"testing"

	"github.com/vertex-lab/crawler_v2/pkg/graph"
	"github.com/vertex-lab/crawler_v2/pkg/pagerank"
	"github.com/vertex-lab/crawler_v2/pkg/redb"
	"github.com/vertex-lab/crawler_v2/pkg/walks"
	test "github.com/vertex-lab/crawler_v2/tests/random"

	"github.com/redis/go-redis/v9"
)

// ctx is the background context passed to the db, pagerank and walks calls
// made by the tests below.
var ctx = context.Background()

// TestWalks verifies that the walks stored in Redis are consistent with the
// per-node "walks visiting" sets. It repeats the following until the scan
// cursor comes back as 0:
//   - scan the next batch of walks, resuming from the previous cursor
//   - for every node on every walk's path, ask Redis (in a single pipeline)
//     whether the set under redb.KeyWalksVisitingPrefix+node contains the walk ID
//   - report a test error for each membership check that is not true
func TestWalks(t *testing.T) {
	fmt.Println("-----------------------------")
	fmt.Println("Testing the walks consistency")
	fmt.Print("-----------------------------\n\n")

	db := redb.New(&redis.Options{Addr: "localhost:6379"})

	batchSize := 10000
	var cursor uint64

	for round := 1; ; round++ {
		// rewrite the progress line in place: cursor up one line, then erase below
		fmt.Print("\033[1A")
		fmt.Print("\033[J")
		fmt.Printf("iteration %d...\n", round)

		batch, next, err := db.ScanWalks(ctx, cursor, batchSize)
		if err != nil {
			t.Fatal(err)
		}
		cursor = next

		// queue one SISMEMBER per (node, walk) pair, indexed by "node:walkID"
		pipe := db.Client.Pipeline()
		checks := make(map[string]*redis.BoolCmd)

		for _, walk := range batch {
			walkID := string(walk.ID)
			for _, node := range walk.Path {
				nodeID := string(node)
				checks[nodeID+":"+walkID] = pipe.SIsMember(ctx, redb.KeyWalksVisitingPrefix+nodeID, walk.ID)
			}
		}

		_, err = pipe.Exec(ctx)
		if err != nil {
			t.Fatalf("pipeline failed: %v", err)
		}

		for key, check := range checks {
			if found := check.Val(); !found {
				t.Errorf("expected true, got %v: %v", found, key)
			}
		}

		// a zero cursor means the scan has covered everything
		if cursor == 0 {
			break
		}
	}

	fmt.Println("passed!")
	fmt.Println("-----------------------------")
}

// TestPagerank compares two pagerank distributions over all nodes:
//   - the one computed by pagerank.Global from the Redis-backed db
//   - the one computed by pagerank.Global from an in-memory walk store that is
//     filled with freshly generated walks for every active node
//
// The test fails when the distance between the two (test.Distance) is larger
// than the bound returned by expectedDistance.
func TestPagerank(t *testing.T) {
	fmt.Println("---------------------------------")
	fmt.Println("Testing the pagerank distribution")

	db := redb.New(&redis.Options{Addr: "localhost:6379"})
	nodes, err := db.AllNodes(ctx)
	if err != nil {
		t.Fatal(err)
	}
	total := len(nodes)

	stored, err := pagerank.Global(ctx, db, nodes...)
	if err != nil {
		t.Fatal(err)
	}
	fmt.Println(" > original pagerank computed")

	// load every follow list into memory so walk generation does not hit the db
	adjacency := make(map[graph.ID][]graph.ID, total)
	for _, id := range nodes {
		followed, err := db.Follows(ctx, id)
		if err != nil {
			t.Fatal(err)
		}
		adjacency[id] = followed
	}

	walker := walks.NewSimpleWalker(adjacency)
	store := test.NewWalkStore()

	fmt.Println(" > db copied")
	fmt.Print(" > generating walks...\n")
	fmt.Print("---------------------------------\n\n")

	active := 0
	for i, id := range nodes {
		if i%1000 == 0 {
			// rewrite the progress line in place: cursor up one line, then erase below
			fmt.Print("\033[1A")
			fmt.Print("\033[J")
			fmt.Printf("progress %d/%d...\n", i+1, total)
		}

		node, err := db.NodeByID(ctx, id)
		if err != nil {
			t.Fatal(err)
		}

		// only active nodes get walks generated
		if node.Status != graph.StatusActive {
			continue
		}

		generated, err := walks.Generate(ctx, walker, id)
		if err != nil {
			t.Fatal(err)
		}

		store.AddWalks(generated)
		active++
	}

	fresh, err := pagerank.Global(ctx, store, nodes...)
	if err != nil {
		t.Fatal(err)
	}

	expected := expectedDistance(active, total)
	distance := test.Distance(stored, fresh)
	fmt.Printf("expected distance %f, got %f\n", expected, distance)

	if distance > expected {
		t.Fatal("distance is higher than expected!")
	}

	fmt.Println("passed!")
	fmt.Println("-----------------------------")
}

// expectedDistance estimates the distance expected between the real pagerank
// and the Monte-Carlo pagerank.
// Such distance goes as ~N/sqrt(R), where N is the number of nodes and R is the
// number of walks. Here N is totalNodes and R is taken as activeNodes * walks.N,
// scaled by an empirically derived constant.
//
// # REFERENCES:
// [1] K. Avrachenkov, N. Litvak, D. Nemirovsky, N. Osipova; "Monte Carlo methods in PageRank computation"
// URL: https://www-sop.inria.fr/members/Konstantin.Avratchenkov/pubs/mc.pdf
func expectedDistance(activeNodes, totalNodes int) float64 {
	const errorConstant = 0.00035 // empirically derived

	totalWalks := activeNodes * walks.N
	numerator := errorConstant * float64(totalNodes)
	return numerator / math.Sqrt(float64(totalWalks))
}