add wot_utils and adapted example notebook by @pippellia-btc
589  nostr_dvm/utils/wot_utils.py  Normal file
@@ -0,0 +1,589 @@
import asyncio
import os

import nostr_sdk
import networkx as nx

# General
import json
import datetime
import time
import numpy as np
import random
from scipy.sparse import lil_matrix, csr_matrix, isspmatrix_csr

from nostr_sdk import Options, Keys, NostrSigner, NostrDatabase, ClientBuilder, SecretKey, Kind, PublicKey

from nostr_dvm.utils.dvmconfig import DVMConfig
from nostr_dvm.utils.nostr_utils import check_and_set_private_key


async def get_following(pks, max_time_request=10, newer_than_time=None):
    '''
    OUTPUT: following; a networkx directed graph
    > each node has the timestamp of its latest retrievable kind 3 event attached

    > it's possible to limit the search to events newer than newer_than_time

    NOTE: do not call get_following with more than 1000 pks;
    instead, divide the request into batches of smaller size (500 recommended)
    '''

    # handling the case of a single key being passed
    if isinstance(pks, str):
        pks = [pks]

    # transforming into PublicKey object type
    list_pk = [nostr_sdk.PublicKey.parse(pk) for pk in pks]

    # if newer_than_time is provided, only fetch events that are newer
    if newer_than_time is None:
        filter = nostr_sdk.Filter().authors(list_pk).kind(Kind(3))

    else:
        newer_than_time = round(newer_than_time)
        ts = nostr_sdk.Timestamp().from_secs(newer_than_time)
        filter = nostr_sdk.Filter().authors(list_pk).kind(Kind(3)).since(ts)

    # fetching events
    opts = (Options().wait_for_send(False).send_timeout(datetime.timedelta(seconds=5)))
    keys = Keys.parse(check_and_set_private_key("test_client"))
    signer = NostrSigner.keys(keys)
    cli = ClientBuilder().signer(signer).opts(opts).build()

    for relay in DVMConfig.RECONCILE_DB_RELAY_LIST:
        await cli.add_relay(relay)

    await cli.connect()

    events = await cli.get_events_of([filter], datetime.timedelta(seconds=max_time_request))

    # initializing the graph structure
    following = nx.DiGraph()
    following.add_nodes_from(pks)

    if not events:
        return following

    for event in events:

        author = event.author().to_hex()

        # events are returned ordered by timestamp, so the first event seen
        # for each pubkey is the most recent one = the one we should use
        if event.verify() and author in following.nodes() and 'timestamp' not in following.nodes[author]:
            # updating the nodes and edges
            nodes = event.public_keys()

            # converting to hex and removing self-follows
            nodes = [pk.to_hex() for pk in nodes if pk.to_hex() != author]

            following.update(edges=[(author, pk) for pk in nodes], nodes=nodes)

            # updating the timestamp
            tp = event.created_at().as_secs()
            following.nodes[author]['timestamp'] = tp

    return following


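# A minimal usage sketch for get_following (illustrative only; it needs relay
# access, and the hex pubkey below is just the example already used in test()):
#
#     async def _demo():
#         pk = '3bf0c63fcb93463407af97a5e5ee64fa883d107ef9e558472c4eb9aaaefa459d'
#         following = await get_following(pk, max_time_request=10)
#         print(following.number_of_edges(), 'follow edges')
#
#     asyncio.run(_demo())
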
async def build_network_from(seed_pks, depth=2, max_batch=500, max_time_request=10):
    if not seed_pks:
        print('Error: seed_pks cannot be empty')
        return

    if depth < 1:
        print('Error: depth cannot be lower than 1')
        return

    # handling the case of a single key being passed
    if isinstance(seed_pks, str):
        seed_pks = [seed_pks]

    print('Step 1: fetching kind 3 events from relays & pre-processing')

    tic = time.time()

    # initialize the index_map
    index_map = {pk: i for i, pk in enumerate(seed_pks)}

    # initialize the graph
    seed_graph = nx.DiGraph()
    seed_graph.add_nodes_from(index_map.values())

    # build the network with the internal recursive function
    index_map, network_graph = await _build_network_from(index_map, seed_graph, set(), depth, max_batch,
                                                         max_time_request)

    toc = time.time()

    print(f'\nFinished in {toc - tic}')

    return index_map, network_graph


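# Usage sketch (same seed pubkey as test() below; depth=2 can take a while):
#
#     index_map, network_graph = asyncio.run(
#         build_network_from('3bf0c63fcb93463407af97a5e5ee64fa883d107ef9e558472c4eb9aaaefa459d',
#                            depth=2, max_batch=500, max_time_request=10))
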
async def _build_network_from(index_map, network_graph, visited_pk=None, depth=2, max_batch=500, max_time_request=10):
    '''
    OUTPUTS:

    1. index_map = {pk0: 0, pk1: 1, ...}
    > an index that translates pub keys in hex format into nodes in the graph

    2. network_graph; a networkx directed graph of who follows whom
    > each node has the timestamp of its latest retrievable kind 3 event attached
    > if the node is a leaf or doesn't follow anyone, it has no 'timestamp' attribute
    '''

    # pks to be visited next, split into batches
    if visited_pk is None:
        visited_pk = set()
    to_visit_pk = split_set(index_map.keys() - visited_pk, max_batch)

    for pks in to_visit_pk:
        # getting the followings as a graph
        following = await get_following(pks, max_time_request)

        # update the visited_pk
        visited_pk.update(pks)

        # add the new pub keys to the index_map
        index_map = _extend_index_map(index_map, following)

        # re-label nodes using the index_map to use storage more efficiently
        nx.relabel_nodes(following, index_map, copy=False)

        # extend the network graph
        network_graph.update(following)

        print('current network: ' + str(len(index_map)) + ' npubs', end='\r')

    if depth == 1:
        return index_map, network_graph

    else:
        # recursive call
        index_map, network_graph = await _build_network_from(index_map, network_graph, visited_pk, depth - 1, max_batch,
                                                             max_time_request)

        print('current network: ' + str(len(network_graph.nodes())) + ' npubs', end='\r')

        return index_map, network_graph


def _extend_index_map(index_map, following):
    '''
    index_map = {pk0: 0, pk1: 1, ...}
    '''

    # getting all new pubkeys
    new_pk = following.nodes() - index_map.keys()

    # start assigning new indices from the next available index
    start_index = len(index_map)

    # update the index_map with new keys and their assigned indices
    for i, pk in enumerate(new_pk, start=start_index):
        index_map[pk] = i

    return index_map


def split_set(my_set, max_batch):
    my_list = list(my_set)
    return [set(my_list[x: x + max_batch]) for x in range(0, len(my_set), max_batch)]


def save_network(index_map, network_graph, name=None):
    if name is None:
        # adding unix time to the file name to avoid replacing an existing file
        name = str(round(time.time()))

    #filename = os.path.join('/cache/', 'index_map_' + name + '.json')
    filename = 'index_map_' + name + '.json'

    # saving the index_map as a json file
    with open(filename, 'w') as f:
        json.dump(index_map, f, indent=4)

    # convert to node-link format suitable for JSON
    data = nx.node_link_data(network_graph)

    # saving the network_graph as a json file
    #filename = os.path.join('/cache/', 'network_graph_' + name + '.json')
    filename = 'network_graph_' + name + '.json'
    with open(filename, 'w') as f:
        json.dump(data, f)

    print(' > index_map_' + name + '.json')
    print(' > network_graph_' + name + '.json')


def load_network(name):
    if not isinstance(name, str):
        name = str(name)

    # loading the index_map
    with open('index_map_' + name + '.json', 'r') as f:
        index_map = json.load(f)

    # loading the JSON for the graph
    with open('network_graph_' + name + '.json', 'r') as f:
        data = json.load(f)

    # convert JSON back to graph
    network_graph = nx.node_link_graph(data)

    return index_map, network_graph


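# Round-trip sketch: save_network() writes 'index_map_<name>.json' and
# 'network_graph_<name>.json' in the working directory, and load_network()
# reads them back ('demo' is a hypothetical name):
#
#     save_network(index_map, network_graph, name='demo')
#     index_map, network_graph = load_network('demo')
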
def get_mc_pagerank(G, R, nodelist=None, alpha=0.85):
    '''
    Monte-Carlo complete path, stopping at dangling nodes

    INPUTS
    ------
    G: graph
        A directed Networkx graph. This function does not work on undirected graphs.

    R: int
        The number of random walks to be performed per node

    nodelist: list, optional
        the list of nodes in the G networkx graph.
        It is used to order the nodes in a specified way

    alpha: float, optional
        the damping factor of Pagerank. Default value is 0.85

    OUTPUTS
    -------
    walk_visited_count: CSR matrix
        a Compressed Sparse Row (CSR) matrix; element (i,j) is equal to
        the number of times v_j has been visited by a random walk started from v_i

    mc_pagerank: dict
        The dictionary {node: pg} of the pagerank value for each node in G

    References
    ----------
    [1] K. Avrachenkov, N. Litvak, D. Nemirovsky, N. Osipova,
    "Monte Carlo methods in PageRank computation: When one iteration is sufficient"
    https://www-sop.inria.fr/members/Konstantin.Avratchenkov/pubs/mc.pdf
    '''

    # validate all the inputs and initialize variables
    N, nodelist, inverse_nodelist = _validate_inputs_and_init_mc(G, R, nodelist, alpha)

    # initialize walk_visited_count as a sparse LIL matrix
    walk_visited_count = lil_matrix((N, N), dtype='int')

    progress_count = 0

    # perform R random walks for each node
    for node in nodelist:

        # print progress every 200 nodes
        progress_count += 1
        if progress_count % 200 == 0:
            print('progress = {:.2f}%'.format(100 * progress_count / N), end='\r')

        node_pos = inverse_nodelist[node]

        for _ in range(R):

            walk_visited_count[node_pos, node_pos] += 1

            current_node = node

            # each step continues with probability alpha and stops otherwise
            while random.uniform(0, 1) < alpha:

                successors = list(G.successors(current_node))
                if not successors:
                    break

                current_node = random.choice(successors)
                current_node_pos = inverse_nodelist[current_node]

                # add the current node to the walk_visited_count
                walk_visited_count[node_pos, current_node_pos] += 1

    # convert lil_matrix to csr_matrix for efficient storage and access
    walk_visited_count = walk_visited_count.tocsr()

    # sum all visits for each node into a numpy array
    total_visited_count = np.array(walk_visited_count.sum(axis=0)).flatten()

    # reciprocal of the number of total visits
    one_over_s = 1 / sum(total_visited_count)

    mc_pagerank = {nodelist[j]: total_visited_count[j] * one_over_s for j in range(N)}

    print('progress = 100% ', end='\r')
    print('\nTotal walks performed: ', N * R)

    return walk_visited_count, mc_pagerank


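# Usage sketch (mirrors the demo notebook: R = 10 walks per node over a
# directed graph G):
#
#     nodelist = list(G.nodes())
#     walk_visited_count, mc_pagerank = get_mc_pagerank(G, 10, nodelist)
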
def _validate_inputs_and_init_mc(G, R, nodelist, alpha):
    '''
    This function validates the inputs and initializes the following variables:

    N: int
        the number of nodes in the G Networkx graph

    nodelist: list
        the list of nodes in the G Networkx graph

    inverse_nodelist: dict
        a dictionary that maps each node in G to its position in nodelist
    '''

    N = len(G)
    if N == 0:
        raise ValueError("Graph G is empty")

    if not isinstance(R, int) or R <= 0:
        raise ValueError("R must be a positive integer")

    if not isinstance(alpha, float) or not (0 < alpha < 1):
        raise ValueError("alpha must be a float between 0 and 1")

    if nodelist is not None and set(nodelist) != set(G.nodes()):
        raise ValueError("nodelist does not match the nodes in G")

    elif nodelist is None:
        nodelist = list(G.nodes())

    # compute the inverse map of nodelist
    inverse_nodelist = {nodelist[j]: j for j in range(N)}

    return N, nodelist, inverse_nodelist


def get_subrank(S, G, walk_visited_count, nodelist, alpha=0.85):
    '''
    Subrank algorithm (stopping at dangling nodes);
    it approximates the Pagerank over S, a subgraph of G

    INPUTS
    ------
    S: graph
        A directed Networkx graph, induced subgraph of G

    G: graph
        A directed Networkx graph. This function does not work on undirected graphs.

    walk_visited_count: CSR matrix
        a Compressed Sparse Row (CSR) matrix; element (i,j) is equal to
        the number of times v_j has been visited by a random walk started from v_i

    nodelist: list
        the list of nodes in the G Networkx graph. It is used to decode walk_visited_count

    alpha: float, optional
        the damping factor of Pagerank. Default value is 0.85

    OUTPUTS
    -------
    subrank: dict
        The dictionary {node: pg} of the pagerank value for each node in S

    References
    ----------
    [1] Pippellia,
    "Pagerank on subgraphs—efficient Monte-Carlo estimation"
    https://pippellia.com/pippellia/Social+Graph/Pagerank+on+subgraphs%E2%80%94efficient+Monte-Carlo+estimation
    '''

    # validate inputs and initialize variables
    N, S_nodes, G_nodes, inverse_nodelist = _validate_inputs_and_init(S, G, walk_visited_count, nodelist, alpha)

    # compute visited count from walks that started from S
    visited_count_from_S = _get_visited_count_from_S(N, S_nodes, walk_visited_count, nodelist, inverse_nodelist)

    # compute positive and negative walks to do
    positive_walks, negative_walks = _get_walks_to_do(S_nodes, G_nodes, S, G, visited_count_from_S, alpha)

    print(f'walks performed = {sum(positive_walks.values()) + sum(negative_walks.values())}')

    # perform the walks and get the visited counts
    positive_count = _perform_walks(S_nodes, S, positive_walks, alpha)
    negative_count = _perform_walks(S_nodes, S, negative_walks, alpha)

    # add the effects of the random walks to the count of G
    new_visited_count = {node: visited_count_from_S[node] + positive_count[node] - negative_count[node]
                         for node in S_nodes}

    # compute the number of total visits
    total_visits = sum(new_visited_count.values())

    # compute the subrank
    subrank = {node: visits / total_visits
               for node, visits in new_visited_count.items()}

    return subrank


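# Usage sketch (mirrors the demo notebook): S must be an induced subgraph of G,
# and walk_visited_count / nodelist must come from get_mc_pagerank run on G:
#
#     S = G.subgraph(S_nodes).copy()
#     subrank = get_subrank(S, G, walk_visited_count, nodelist)
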
def _validate_inputs_and_init(S, G, walk_visited_count, nodelist, alpha):
    '''
    This function validates the inputs and initializes the following variables:

    N: int
        the number of nodes in the G Networkx graph

    S_nodes: set
        the set of nodes that belong to S

    G_nodes: set
        the set of nodes that belong to G

    inverse_nodelist: dict
        a dictionary that maps each node in G to its position in nodelist

    Note: S being a subgraph of G is NOT checked because it's computationally expensive.
    '''

    if len(S) == 0:
        raise ValueError("graph S is empty")

    N = len(G)
    if N == 0:
        raise ValueError("graph G is empty")

    if not isinstance(alpha, float) or not (0 < alpha < 1):
        raise ValueError("alpha must be a float between 0 and 1")

    if not isspmatrix_csr(walk_visited_count) or walk_visited_count.shape != (N, N):
        raise ValueError(f"walk_visited_count must be a {(N, N)} CSR matrix")

    S_nodes = set(S.nodes())
    G_nodes = set(G.nodes())

    if not nodelist or set(nodelist) != G_nodes:
        raise ValueError("nodelist does not match the nodes in G")

    # compute the inverse map of nodelist
    inverse_nodelist = {nodelist[j]: j for j in range(N)}

    return N, S_nodes, G_nodes, inverse_nodelist


def _get_visited_count_from_S(N, S_nodes, walk_visited_count, nodelist, inverse_nodelist):
    '''
    This function extracts the number of visits that come from walks that started from S
    '''

    # getting the indices of nodes in S
    S_indices = [inverse_nodelist[node] for node in S_nodes]

    # extract the rows
    S_matrix = walk_visited_count[S_indices, :]

    # sum the rows
    visited_count_from_S = np.array(S_matrix.sum(axis=0)).flatten()

    # convert to a dictionary
    visited_count_from_S = {nodelist[j]: visited_count_from_S[j] for j in range(N)}

    return visited_count_from_S


def _get_walks_to_do(S_nodes, G_nodes, S, G, visited_count_from_S, alpha):
    '''
    This function calculates the positive and negative walks to be done for each node.
    This step is necessary to account for the structure of S being different
    from that of G.
    '''

    # compute nodes in G-S
    external_nodes = G_nodes - S_nodes

    # compute nodes in S that point to G-S
    nodes_that_point_externally = {u for u, v in nx.edge_boundary(G, S_nodes, external_nodes)}

    walks_to_do = {node: 0 for node in S_nodes}

    # add positive random walks to walks_to_do
    for node in nodes_that_point_externally:

        successors = set(G.successors(node)) & S_nodes

        if successors:

            # compute the estimated visits
            visited_count = visited_count_from_S[node]
            degree_S = S.out_degree(node)
            degree_G = G.out_degree(node)
            estimate_visits = alpha * visited_count * (1 / degree_S - 1 / degree_G)

            for succ in successors:
                walks_to_do[succ] += estimate_visits

    # subtract the number of negative random walks
    for node in external_nodes:

        successors = set(G.successors(node)) & S_nodes

        if successors:

            # compute the estimated visits
            visited_count = visited_count_from_S[node]
            degree = G.out_degree(node)
            estimate_visits = alpha * visited_count / degree

            for succ in successors:
                walks_to_do[succ] -= estimate_visits

    # split the walks to do into positive and negative
    positive_walks_to_do = {node: round(value) for node, value in walks_to_do.items() if value > 0}
    negative_walks_to_do = {node: round(-value) for node, value in walks_to_do.items() if value < 0}

    return positive_walks_to_do, negative_walks_to_do


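# Worked example with illustrative numbers: suppose a node of S has out-degree
# 2 in S and 5 in G, and was visited 100 times by walks started from S. Each of
# its successors inside S then gets alpha * 100 * (1/2 - 1/5) =
# 0.85 * 100 * 0.3 = 25.5 positive walks, compensating for the edges that left S.
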
def _perform_walks(S_nodes, S, walks_to_do, alpha):
    '''
    This function performs the given number of random walks on S for each node;
    it then returns the visited count for each node in S.
    '''

    # initializing the visited count
    visited_count = {node: 0 for node in S_nodes}

    for starting_node, num in walks_to_do.items():

        # performing num random walks
        for _ in range(num):

            current_node = starting_node
            visited_count[current_node] += 1

            # performing one random walk
            while random.uniform(0, 1) < alpha:

                successors = list(S.successors(current_node))

                if not successors:
                    break

                current_node = random.choice(successors)

                # updating the visited count
                visited_count[current_node] += 1

    return visited_count


def test():
    # WARNING: depending on the depth, this can take a long time
    user = '3bf0c63fcb93463407af97a5e5ee64fa883d107ef9e558472c4eb9aaaefa459d'
    index_map, network_graph = asyncio.run(build_network_from(user, depth=2, max_batch=500, max_time_request=10))
    save_network(index_map, network_graph, user)
403  tests/pagerank/subrank_demo_notebook.ipynb  Normal file
@@ -0,0 +1,403 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "c1935f9d-02db-468c-ade3-87f164b6d4f6",
   "metadata": {},
   "source": [
    "# Pagerank on subgraphs—efficient Monte-Carlo estimation\n",
    "\n",
    "In this repo you can find the reference code for my novel Subrank algorithm, which efficiently computes the Pagerank distribution over $S$, a subgraph of $G$.\n",
    "For the reasoning behind the algorithm, the definition and the analysis, I invite the interested reader to [read the paper](https://pippellia.com/pippellia/Social+Graph/Pagerank+on+subgraphs%E2%80%94efficient+Monte-Carlo+estimation).\n",
    "\n",
    "To play with it, follow these steps:"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bcce6ba6-2539-4bd3-85c8-608eb83bfeb3",
   "metadata": {},
   "source": [
    "## Step 0: Build and store the Graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "outputs": [],
   "source": [
    "# Imports\n",
    "from nostr_dvm.utils.wot_utils import build_network_from, save_network, load_network, get_mc_pagerank, get_subrank\n",
    "import time\n",
    "import networkx as nx\n",
    "import random\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-07-30T10:08:53.401249Z",
     "start_time": "2024-07-30T10:08:53.392291Z"
    }
   },
   "id": "5ac404375ef61608"
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "outputs": [],
   "source": [
    "user = '99bb5591c9116600f845107d31f9b59e2f7c7e09a1ff802e84f1d43da557ca64'\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-07-30T10:08:55.064034Z",
     "start_time": "2024-07-30T10:08:55.058138Z"
    }
   },
   "id": "8d89d517de8b506e"
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Step 1: fetching kind 3 events from relays & pre-processing\n",
      "current network: 44029 npubs\r\n",
      "Finished in 39.349228858947754\n",
      " > index_map_99bb5591c9116600f845107d31f9b59e2f7c7e09a1ff802e84f1d43da557ca64.json\n",
      " > network_graph_99bb5591c9116600f845107d31f9b59e2f7c7e09a1ff802e84f1d43da557ca64.json\n"
     ]
    }
   ],
   "source": [
    "index_map, network_graph = await build_network_from(user, depth=2, max_batch=500, max_time_request=10)\n",
    "save_network(index_map, network_graph, user)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-07-30T10:09:36.764366Z",
     "start_time": "2024-07-30T10:08:56.071082Z"
    }
   },
   "id": "c5de7bf0bac361ff"
  },
  {
   "cell_type": "markdown",
   "id": "c68b3da4-39c0-4fb0-8687-f08aea70f25d",
   "metadata": {},
   "source": [
    "## Step 1: load the graph database\n",
    "\n",
    "First, load the networkx graph database into memory by running the following code."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "f5a48d91-80e1-4016-8030-9ac9ef0ab1af",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-07-30T10:09:48.642929Z",
     "start_time": "2024-07-30T10:09:47.534981Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "loading the database...\n",
      "finished in 1.1044838428497314 seconds\n"
     ]
    }
   ],
   "source": [
    "# loading the database\n",
    "print('loading the database...')\n",
    "tic = time.time()\n",
    "\n",
    "index_map, G = load_network(user)\n",
    "\n",
    "toc = time.time()\n",
    "print(f'finished in {toc-tic} seconds')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "33a1dc28-5af4-4097-9987-103520588f81",
   "metadata": {},
   "source": [
    "## Step 2: Compute Pagerank over $G$\n",
    "\n",
    "Compute the pagerank over $G$ using the built-in networkx pagerank function, which uses the power-iteration method.\n",
    "This vector will be treated as the real Pagerank vector and used to compute the errors of the Monte-Carlo algorithms."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "bc5a4f83-d5b9-4def-a7f7-403cd0beedfe",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-07-30T10:09:52.561819Z",
     "start_time": "2024-07-30T10:09:51.533522Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing global pagerank...\n",
      "finished in 1.036012887954712 seconds\n"
     ]
    }
   ],
   "source": [
    "# computing the pagerank\n",
    "print('computing global pagerank...')\n",
    "tic = time.time()\n",
    "\n",
    "p_G = nx.pagerank(G, tol=1e-12)\n",
    "\n",
    "toc = time.time()\n",
    "print(f'finished in {toc-tic} seconds')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "50552e62-d66d-4a02-b49f-33c299e8311b",
   "metadata": {},
   "source": [
    "## Step 3: Approximate Pagerank over $G$ using Monte-Carlo\n",
    "\n",
    "Compute the pagerank over $G$ using a simple Monte-Carlo implementation and compute the L1 error.\n",
    "This step is essential because it returns the CSR matrix `walk_visited_count`, which will be used later by the Subrank algorithm."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "68bbe6d2-c19a-4a84-8ba0-13ae2b6e13d6",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-07-30T10:09:57.331397Z",
     "start_time": "2024-07-30T10:09:56.433680Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "progress = 100% \r\n",
      "Total walks performed: 440290\n",
      "performed random walks in 1.0430760383605957 seconds\n",
      "error pagerank vs mc pagerank in G = 0.020183034486847724\n"
     ]
    }
   ],
   "source": [
    "# number of random walks per node\n",
    "R = 10\n",
    "\n",
    "# fix the order of the nodes\n",
    "nodelist = list(G.nodes())\n",
    "\n",
    "tic = time.time()\n",
    "\n",
    "# perform the random walks and get the monte-carlo pagerank\n",
    "walk_visited_count, mc_pagerank = get_mc_pagerank(G, R, nodelist)\n",
    "\n",
    "toc = time.time()\n",
    "print(f'performed random walks in {toc-tic} seconds')\n",
    "\n",
    "# computing the L1 error\n",
    "error_G_mc = sum(abs(p_G[node] - mc_pagerank[node])\n",
    "                 for node in G.nodes())\n",
    "\n",
    "print(f'error pagerank vs mc pagerank in G = {error_G_mc}')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a7a3bf52-7841-4842-a3f2-6f6d066e0c94",
   "metadata": {},
   "source": [
    "## Step 4: Select random subgraph $S$ and compute its Pagerank distribution\n",
    "\n",
    "Select a random subgraph $S$ (500 nodes in this demo run; set k=50000 for the full 50k-node experiment) and compute its Pagerank distribution."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "c00c005e-2064-4975-acb6-3bcf52eb8594",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-07-30T10:10:05.324793Z",
     "start_time": "2024-07-30T10:10:05.306277Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing local pagerank...\n",
      "finished in 0.0029449462890625 seconds\n"
     ]
    }
   ],
   "source": [
    "# selecting random subgraph S\n",
    "S_nodes = set(random.sample(list(G.nodes()), k=500))  # use k=50000 for the full experiment\n",
    "S = G.subgraph(S_nodes).copy()\n",
    "\n",
    "# computing pagerank over S\n",
    "print('computing local pagerank...')\n",
    "tic = time.time()\n",
    "\n",
    "p_S = nx.pagerank(S, tol=1e-12)\n",
    "\n",
    "toc = time.time()\n",
    "print(f'finished in {toc-tic} seconds')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8eb0ae95-87c1-4f5d-890f-42ff5892d881",
   "metadata": {},
   "source": [
    "## Step 5: Approximate Pagerank over $S$ using Subrank\n",
    "\n",
    "Run the Subrank algorithm to approximate the Pagerank over $S$, a subgraph of $G$. Then compute the L1 error."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "41637e2b-1998-43ff-9811-a7bbe658d742",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-07-30T10:10:07.197429Z",
     "start_time": "2024-07-30T10:10:07.109108Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing subrank over S...\n",
      "walks performed = 157\n",
      "performed random walks in 0.05398416519165039 seconds\n",
      "error pagerank vs subrank in S = 0.017882599344811713\n"
     ]
    }
   ],
   "source": [
    "# computing subrank\n",
    "print('computing subrank over S...')\n",
    "tic = time.time()\n",
    "\n",
    "subrank = get_subrank(S, G, walk_visited_count, nodelist)\n",
    "\n",
    "toc = time.time()\n",
    "print(f'performed random walks in {toc-tic} seconds')\n",
    "\n",
    "# computing the L1 error\n",
    "error_S_subrank = sum(abs(p_S[node] - subrank[node])\n",
    "                      for node in S_nodes)\n",
    "\n",
    "print(f'error pagerank vs subrank in S = {error_S_subrank}')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a929d6dc-37ef-4a4a-bc9c-c37bdafd012a",
   "metadata": {},
   "source": [
    "## Step 6: Approximate Pagerank over $S$ using naive Monte-Carlo recomputation\n",
    "\n",
    "Run the Monte-Carlo Pagerank algorithm on $S$ as a reference for the number of random walks required and the error achieved."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "14907960-116b-405d-9f73-85e625958050",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-07-30T10:10:09.835445Z",
     "start_time": "2024-07-30T10:10:09.828723Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing naive monte-carlo pagerank over S\n",
      "progress = 100% \r\n",
      "Total walks performed: 5000\n",
      "finished in 0.010932207107543945 seconds\n",
      "error pagerank vs mc pagerank in S = 0.020285385444477645\n"
     ]
    }
   ],
   "source": [
    "# computing the monte-carlo pagerank\n",
    "print('computing naive monte-carlo pagerank over S')\n",
    "tic = time.time()\n",
    "\n",
    "_, mc_pagerank_S_naive = get_mc_pagerank(S, R)\n",
    "\n",
    "toc = time.time()\n",
    "print(f'finished in {toc-tic} seconds')\n",
    "\n",
    "# computing the L1 error\n",
    "error_S_naive = sum(abs(p_S[node] - mc_pagerank_S_naive[node])\n",
    "                    for node in S.nodes())\n",
    "\n",
    "print(f'error pagerank vs mc pagerank in S = {error_S_naive}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false
   },
   "id": "9387d4e26e992252"
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}