"""Graph analyzer module - pure graph algorithms for dependency and call graphs.

This module provides ONLY non-interpretive graph algorithms:
- Cycle detection (DFS)
- Shortest path finding (BFS)
- Layer identification (topological sort)
- Impact analysis (graph traversal)
- Statistical summaries (counts and grouping)

For interpretive metrics like health scores, recommendations, and weighted
rankings, see the optional graph.insights module.
"""

from collections import defaultdict
from pathlib import Path
from typing import Any
|
class XGraphAnalyzer:
|
|
"""Analyze cross-project dependency and call graphs using pure algorithms."""
|
|
|
|
def detect_cycles(self, graph: dict[str, Any]) -> list[dict[str, Any]]:
|
|
"""
|
|
Detect cycles in the dependency graph using DFS.
|
|
|
|
This is a pure graph algorithm that returns raw cycle data
|
|
without any interpretation or scoring.
|
|
|
|
Args:
|
|
graph: Graph with 'nodes' and 'edges' keys
|
|
|
|
Returns:
|
|
List of cycles, each with nodes and size
|
|
"""
|
|
# Build adjacency list
|
|
adj = defaultdict(list)
|
|
for edge in graph.get("edges", []):
|
|
adj[edge["source"]].append(edge["target"])
|
|
|
|
# Track visited nodes and recursion stack
|
|
visited = set()
|
|
rec_stack = set()
|
|
cycles = []
|
|
|
|
def dfs(node: str, path: list[str]) -> None:
|
|
"""DFS to detect cycles."""
|
|
visited.add(node)
|
|
rec_stack.add(node)
|
|
path.append(node)
|
|
|
|
for neighbor in adj[node]:
|
|
if neighbor not in visited:
|
|
dfs(neighbor, path.copy())
|
|
elif neighbor in rec_stack:
|
|
# Found a cycle
|
|
cycle_start = path.index(neighbor)
|
|
cycle_nodes = path[cycle_start:] + [neighbor]
|
|
cycles.append({
|
|
"nodes": cycle_nodes,
|
|
"size": len(cycle_nodes) - 1, # Don't count repeated node
|
|
})
|
|
|
|
rec_stack.remove(node)
|
|
|
|
# Run DFS from all unvisited nodes
|
|
for node in graph.get("nodes", []):
|
|
node_id = node["id"]
|
|
if node_id not in visited:
|
|
dfs(node_id, [])
|
|
|
|
# Sort cycles by size (largest first)
|
|
cycles.sort(key=lambda c: c["size"], reverse=True)
|
|
|
|
return cycles
|
|
|
|
def impact_of_change(
|
|
self,
|
|
targets: list[str],
|
|
import_graph: dict[str, Any],
|
|
call_graph: dict[str, Any] | None = None,
|
|
max_depth: int = 3,
|
|
) -> dict[str, Any]:
|
|
"""
|
|
Calculate the impact of changing target files using graph traversal.
|
|
|
|
This is a pure graph algorithm that finds affected nodes
|
|
without interpreting or scoring the impact.
|
|
|
|
Args:
|
|
targets: List of file/module IDs that will change
|
|
import_graph: Import/dependency graph
|
|
call_graph: Optional call graph
|
|
max_depth: Maximum traversal depth
|
|
|
|
Returns:
|
|
Raw impact data with upstream and downstream effects
|
|
"""
|
|
# Build adjacency lists
|
|
upstream = defaultdict(list) # Who depends on X
|
|
downstream = defaultdict(list) # What X depends on
|
|
|
|
for edge in import_graph.get("edges", []):
|
|
downstream[edge["source"]].append(edge["target"])
|
|
upstream[edge["target"]].append(edge["source"])
|
|
|
|
if call_graph:
|
|
for edge in call_graph.get("edges", []):
|
|
downstream[edge["source"]].append(edge["target"])
|
|
upstream[edge["target"]].append(edge["source"])
|
|
|
|
# Find upstream impact (what depends on targets)
|
|
upstream_impact = set()
|
|
to_visit = [(t, 0) for t in targets]
|
|
visited = set()
|
|
|
|
while to_visit:
|
|
node, depth = to_visit.pop(0)
|
|
if node in visited or depth >= max_depth:
|
|
continue
|
|
visited.add(node)
|
|
|
|
for dependent in upstream[node]:
|
|
upstream_impact.add(dependent)
|
|
to_visit.append((dependent, depth + 1))
|
|
|
|
# Find downstream impact (what targets depend on)
|
|
downstream_impact = set()
|
|
to_visit = [(t, 0) for t in targets]
|
|
visited = set()
|
|
|
|
while to_visit:
|
|
node, depth = to_visit.pop(0)
|
|
if node in visited or depth >= max_depth:
|
|
continue
|
|
visited.add(node)
|
|
|
|
for dependency in downstream[node]:
|
|
downstream_impact.add(dependency)
|
|
to_visit.append((dependency, depth + 1))
|
|
|
|
# Return raw counts without ratios or interpretations
|
|
all_impacted = set(targets) | upstream_impact | downstream_impact
|
|
|
|
return {
|
|
"targets": targets,
|
|
"upstream": sorted(upstream_impact),
|
|
"downstream": sorted(downstream_impact),
|
|
"total_impacted": len(all_impacted),
|
|
"graph_nodes": len(import_graph.get("nodes", [])),
|
|
}
|
|
|
|
def find_shortest_path(
|
|
self,
|
|
source: str,
|
|
target: str,
|
|
graph: dict[str, Any]
|
|
) -> list[str] | None:
|
|
"""
|
|
Find shortest path between two nodes using BFS.
|
|
|
|
Pure pathfinding algorithm without interpretation.
|
|
|
|
Args:
|
|
source: Source node ID
|
|
target: Target node ID
|
|
graph: Graph with edges
|
|
|
|
Returns:
|
|
List of node IDs forming the path, or None if no path exists
|
|
"""
|
|
# Build adjacency list
|
|
adj = defaultdict(list)
|
|
for edge in graph.get("edges", []):
|
|
adj[edge["source"]].append(edge["target"])
|
|
|
|
# BFS
|
|
queue = [(source, [source])]
|
|
visited = {source}
|
|
|
|
while queue:
|
|
node, path = queue.pop(0)
|
|
|
|
if node == target:
|
|
return path
|
|
|
|
for neighbor in adj[node]:
|
|
if neighbor not in visited:
|
|
visited.add(neighbor)
|
|
queue.append((neighbor, path + [neighbor]))
|
|
|
|
return None
|
|
|
|
def identify_layers(self, graph: dict[str, Any]) -> dict[str, list[str]]:
|
|
"""
|
|
Identify architectural layers using topological sorting.
|
|
|
|
Pure graph layering algorithm without interpretation.
|
|
|
|
Args:
|
|
graph: Import/dependency graph
|
|
|
|
Returns:
|
|
Dict mapping layer number to list of node IDs
|
|
"""
|
|
# Calculate in-degrees
|
|
in_degree = defaultdict(int)
|
|
nodes = {node["id"] for node in graph.get("nodes", [])}
|
|
|
|
for edge in graph.get("edges", []):
|
|
in_degree[edge["target"]] += 1
|
|
|
|
# Find nodes with no dependencies (layer 0)
|
|
layers = {}
|
|
current_layer = []
|
|
|
|
for node_id in nodes:
|
|
if in_degree[node_id] == 0:
|
|
current_layer.append(node_id)
|
|
|
|
# Build layers using modified topological sort
|
|
layer_num = 0
|
|
adj = defaultdict(list)
|
|
|
|
for edge in graph.get("edges", []):
|
|
adj[edge["source"]].append(edge["target"])
|
|
|
|
while current_layer:
|
|
layers[layer_num] = current_layer
|
|
next_layer = []
|
|
|
|
for node in current_layer:
|
|
for neighbor in adj[node]:
|
|
in_degree[neighbor] -= 1
|
|
if in_degree[neighbor] == 0:
|
|
next_layer.append(neighbor)
|
|
|
|
current_layer = next_layer
|
|
layer_num += 1
|
|
|
|
return layers
|
|
|
|
def get_graph_summary(self, graph_data: dict[str, Any]) -> dict[str, Any]:
|
|
"""
|
|
Extract basic statistics from a graph without interpretation.
|
|
|
|
This method provides raw counts and statistics only,
|
|
no subjective metrics or labels.
|
|
|
|
Args:
|
|
graph_data: Large graph dict with 'nodes' and 'edges'
|
|
|
|
Returns:
|
|
Concise summary with raw statistics only
|
|
"""
|
|
# Basic statistics
|
|
nodes = graph_data.get("nodes", [])
|
|
edges = graph_data.get("edges", [])
|
|
|
|
# Calculate in/out degrees
|
|
in_degree = defaultdict(int)
|
|
out_degree = defaultdict(int)
|
|
for edge in edges:
|
|
out_degree[edge["source"]] += 1
|
|
in_degree[edge["target"]] += 1
|
|
|
|
# Find most connected nodes (raw data only)
|
|
connection_counts = []
|
|
for node in nodes: # Process all nodes
|
|
node_id = node["id"]
|
|
total = in_degree[node_id] + out_degree[node_id]
|
|
if total > 0:
|
|
connection_counts.append({
|
|
"id": node_id,
|
|
"in_degree": in_degree[node_id],
|
|
"out_degree": out_degree[node_id],
|
|
"total_connections": total
|
|
})
|
|
|
|
# Sort and get top 10
|
|
connection_counts.sort(key=lambda x: x["total_connections"], reverse=True)
|
|
top_connected = connection_counts[:10]
|
|
|
|
# Detect cycles (complete search)
|
|
cycles = self.detect_cycles({"nodes": nodes, "edges": edges})
|
|
|
|
# Calculate graph metrics
|
|
node_count = len(nodes)
|
|
edge_count = len(edges)
|
|
density = edge_count / (node_count * (node_count - 1)) if node_count > 1 else 0
|
|
|
|
# Find isolated nodes
|
|
connected_nodes = set()
|
|
for edge in edges:
|
|
connected_nodes.add(edge["source"])
|
|
connected_nodes.add(edge["target"])
|
|
isolated_count = len([n for n in nodes if n["id"] not in connected_nodes])
|
|
|
|
# Create summary with raw data only
|
|
summary = {
|
|
"statistics": {
|
|
"total_nodes": node_count,
|
|
"total_edges": edge_count,
|
|
"graph_density": round(density, 4),
|
|
"isolated_nodes": isolated_count,
|
|
"average_connections": round(edge_count / node_count, 2) if node_count > 0 else 0
|
|
},
|
|
"top_connected_nodes": top_connected,
|
|
"cycles_found": [
|
|
{
|
|
"size": cycle["size"],
|
|
"nodes": cycle["nodes"][:5] + (["..."] if len(cycle["nodes"]) > 5 else [])
|
|
}
|
|
for cycle in cycles[:5]
|
|
],
|
|
"file_types": self._count_file_types(nodes),
|
|
"connection_distribution": {
|
|
"nodes_with_20_plus_connections": len([c for c in connection_counts if c["total_connections"] > 20]),
|
|
"nodes_with_30_plus_inbound": len([c for c in connection_counts if c["in_degree"] > 30]),
|
|
"cycle_count": len(cycles) if len(nodes) < 500 else f"{len(cycles)}+ (limited search)",
|
|
}
|
|
}
|
|
|
|
return summary
|
|
|
|
def _count_file_types(self, nodes: list[dict]) -> dict[str, int]:
|
|
"""Count nodes by file extension - pure counting, no interpretation."""
|
|
ext_counts = defaultdict(int)
|
|
for node in nodes: # Process all nodes
|
|
if "file" in node:
|
|
ext = Path(node["file"]).suffix or "no_ext"
|
|
ext_counts[ext] += 1
|
|
# Return top 10 extensions
|
|
sorted_exts = sorted(ext_counts.items(), key=lambda x: x[1], reverse=True)
|
|
return dict(sorted_exts[:10])
|
|
|
|
def identify_hotspots(self, graph: dict[str, Any], top_n: int = 10) -> list[dict[str, Any]]:
|
|
"""
|
|
Identify hotspot nodes based on connectivity (in/out degree).
|
|
|
|
Pure graph algorithm that identifies most connected nodes
|
|
without interpretation or scoring.
|
|
|
|
Args:
|
|
graph: Graph with 'nodes' and 'edges'
|
|
top_n: Number of top hotspots to return
|
|
|
|
Returns:
|
|
List of hotspot nodes with their degree counts
|
|
"""
|
|
# Calculate in/out degrees
|
|
in_degree = defaultdict(int)
|
|
out_degree = defaultdict(int)
|
|
|
|
for edge in graph.get("edges", []):
|
|
out_degree[edge["source"]] += 1
|
|
in_degree[edge["target"]] += 1
|
|
|
|
# Calculate total connections for each node
|
|
hotspots = []
|
|
for node in graph.get("nodes", []):
|
|
node_id = node["id"]
|
|
in_deg = in_degree[node_id]
|
|
out_deg = out_degree[node_id]
|
|
total = in_deg + out_deg
|
|
|
|
if total > 0: # Only include connected nodes
|
|
hotspots.append({
|
|
"id": node_id,
|
|
"in_degree": in_deg,
|
|
"out_degree": out_deg,
|
|
"total_connections": total,
|
|
"file": node.get("file", node_id),
|
|
"lang": node.get("lang", "unknown")
|
|
})
|
|
|
|
# Sort by total connections and return top N
|
|
hotspots.sort(key=lambda x: x["total_connections"], reverse=True)
|
|
return hotspots[:top_n]
|
|
|
|
def calculate_node_degrees(self, graph: dict[str, Any]) -> dict[str, dict[str, int]]:
|
|
"""
|
|
Calculate in-degree and out-degree for all nodes.
|
|
|
|
Pure counting algorithm without interpretation.
|
|
|
|
Args:
|
|
graph: Graph with edges
|
|
|
|
Returns:
|
|
Dict mapping node IDs to degree counts
|
|
"""
|
|
degrees = defaultdict(lambda: {"in_degree": 0, "out_degree": 0})
|
|
|
|
for edge in graph.get("edges", []):
|
|
degrees[edge["source"]]["out_degree"] += 1
|
|
degrees[edge["target"]]["in_degree"] += 1
|
|
|
|
return dict(degrees)
|
|
|
|
def analyze_impact(self, graph: dict[str, Any], targets: list[str], max_depth: int = 3) -> dict[str, Any]:
|
|
"""
|
|
Analyze impact of changes to target nodes.
|
|
|
|
Wrapper method for impact_of_change to match expected API.
|
|
|
|
Args:
|
|
graph: Graph with 'nodes' and 'edges'
|
|
targets: List of target node IDs
|
|
max_depth: Maximum traversal depth
|
|
|
|
Returns:
|
|
Impact analysis results with upstream/downstream effects
|
|
"""
|
|
# Use existing impact_of_change method
|
|
result = self.impact_of_change(targets, graph, None, max_depth)
|
|
|
|
# Add all_impacted field for compatibility
|
|
all_impacted = set(targets) | set(result.get("upstream", [])) | set(result.get("downstream", []))
|
|
result["all_impacted"] = sorted(all_impacted)
|
|
|
|
return result |