# lnflow/analyze_data.py

#!/usr/bin/env python3
"""Analyze collected channel data to understand patterns"""

import json
import os
from pathlib import Path
import pandas as pd
import numpy as np
from typing import Dict, List, Any

def load_channel_data(data_dir: Path) -> List[Dict[str, Any]]:
    """Load every channel detail file found under ``data_dir/channels``.

    Files matching ``*_details.json`` are read in sorted path order so the
    result is deterministic across runs. A file that cannot be opened,
    decoded, or parsed as JSON is reported on stdout and skipped instead of
    aborting the whole load.

    Args:
        data_dir: Directory that contains a ``channels`` sub-directory.

    Returns:
        The parsed JSON content of each readable file, one entry per file.
        Empty when the directory is missing or holds no matching files.
    """
    channels = []

    for file in sorted(data_dir.glob("channels/*_details.json")):
        try:
            # open() lives inside the try so an unreadable entry (permissions,
            # a directory with a matching name, ...) is skipped like bad JSON.
            with open(file, 'r', encoding='utf-8') as f:
                channels.append(json.load(f))
        except (OSError, ValueError, RecursionError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
            # RecursionError is raised by json for pathologically deep nesting.
            print(f"Error loading {file}: {e}")

    return channels

# Column order of the frame returned by analyze_channels. Must list exactly
# the keys of the row dict built there (raw fields first, derived metrics last).
_CHANNEL_COLUMNS = [
    'channel_id', 'capacity', 'local_balance', 'remote_balance',
    'local_fee_rate', 'remote_fee_rate', 'earned_msat', 'sourced_msat',
    'total_sent_msat', 'total_received_msat', 'forwarded_sent_msat',
    'forwarded_received_msat', 'remote_alias', 'active', 'private',
    'open_initiator', 'num_updates', 'rating',
    'balance_ratio', 'total_flow_sats', 'net_flow_sats', 'total_fees_sats',
    'fee_per_flow',
]


def _nested(ch: Dict[str, Any], *keys: str) -> Dict[str, Any]:
    """Return the nested dict at ``keys``, or ``{}`` if any level is missing or not a dict."""
    node: Any = ch
    for key in keys:
        node = node.get(key) if isinstance(node, dict) else None
        if not isinstance(node, dict):
            return {}
    return node


def _to_int(value: Any, default: int = 0) -> int:
    """Convert ``value`` to int, mapping ``None`` (JSON null) to ``default``."""
    return default if value is None else int(value)


def analyze_channels(channels: List[Dict[str, Any]]) -> pd.DataFrame:
    """Flatten raw channel detail dicts into a DataFrame with derived metrics.

    Each input dict contributes one row. Nested sections (``balance``,
    ``policies``, ``feeReport``, ``flowReport``, ``status``, ``rating``) that
    are absent, ``None`` or not a dict are treated as empty, and integer
    fields that are absent or ``None`` fall back to 0, so a partially
    populated record does not abort the whole conversion.

    Derived columns:
        balance_ratio: local_balance / capacity (0.5 when capacity is 0).
        total_flow_sats: (sent + received) msat divided by 1000.
        net_flow_sats: (received - sent) msat divided by 1000.
        total_fees_sats: (earned + sourced) msat divided by 1000.
        fee_per_flow: total_fees_sats / total_flow_sats (0 when there is no flow).

    Args:
        channels: Parsed channel detail records.

    Returns:
        One row per channel. For an empty input the frame still carries all
        columns, so column lookups by downstream code do not raise KeyError.

    Raises:
        ValueError: If a numeric field holds a value ``int()`` cannot parse.
    """
    rows = []

    for ch in channels:
        balance = _nested(ch, 'balance')
        fee_report = _nested(ch, 'feeReport')
        flow_report = _nested(ch, 'flowReport')
        status = _nested(ch, 'status')

        row = {
            'channel_id': ch.get('channelIdCompact', ''),
            'capacity': _to_int(ch.get('capacitySat')),
            'local_balance': _to_int(balance.get('localBalanceSat')),
            'remote_balance': _to_int(balance.get('remoteBalanceSat')),
            'local_fee_rate': _nested(ch, 'policies', 'local').get('feeRatePpm', 0),
            'remote_fee_rate': _nested(ch, 'policies', 'remote').get('feeRatePpm', 0),
            'earned_msat': _to_int(fee_report.get('earnedMilliSat')),
            'sourced_msat': _to_int(fee_report.get('sourcedMilliSat')),
            'total_sent_msat': _to_int(flow_report.get('totalSentMilliSat')),
            'total_received_msat': _to_int(flow_report.get('totalReceivedMilliSat')),
            'forwarded_sent_msat': _to_int(flow_report.get('forwardedSentMilliSat')),
            'forwarded_received_msat': _to_int(flow_report.get('forwardedReceivedMilliSat')),
            'remote_alias': ch.get('remoteAlias', 'Unknown'),
            'active': status.get('active', False),
            'private': status.get('private', False),
            'open_initiator': ch.get('openInitiator', ''),
            'num_updates': _to_int(ch.get('numUpdates')),
            'rating': _nested(ch, 'rating').get('rating', -1),
        }

        # Calculate derived metrics
        # A zero-capacity channel has no meaningful ratio; 0.5 keeps it out of
        # both the "depleted" and the "full" buckets.
        row['balance_ratio'] = row['local_balance'] / row['capacity'] if row['capacity'] > 0 else 0.5
        row['total_flow_sats'] = (row['total_sent_msat'] + row['total_received_msat']) / 1000
        row['net_flow_sats'] = (row['total_received_msat'] - row['total_sent_msat']) / 1000
        row['total_fees_sats'] = (row['earned_msat'] + row['sourced_msat']) / 1000
        row['fee_per_flow'] = row['total_fees_sats'] / row['total_flow_sats'] if row['total_flow_sats'] > 0 else 0

        rows.append(row)

    # Passing columns explicitly keeps the schema stable even with zero rows.
    return pd.DataFrame(rows, columns=_CHANNEL_COLUMNS)

def print_analysis(df: pd.DataFrame) -> None:
    """Print a multi-section text report about the channels in ``df`` to stdout.

    Sections: overall totals, fee statistics, flow statistics, balance
    distribution, top 10 fee earners, top 10 channels by flow, a few pairwise
    correlations, and two lists of optimization candidates.

    An empty frame produces only the header and a short notice; the
    statistics below are undefined without rows (and would otherwise divide
    by zero or look up columns that do not exist).

    Args:
        df: Channel table with at least the columns ``channel_id``,
            ``remote_alias``, ``capacity``, ``local_balance``,
            ``remote_balance``, ``local_fee_rate``, ``balance_ratio``,
            ``total_flow_sats``, ``net_flow_sats`` and ``total_fees_sats``.
    """
    print("=== Channel Network Analysis ===\n")

    if df.empty:
        print("Total Channels: 0")
        print("No channel data available; skipping analysis.")
        return

    # Overall statistics
    print(f"Total Channels: {len(df)}")
    print(f"Total Capacity: {df['capacity'].sum():,} sats")
    print(f"Average Channel Size: {df['capacity'].mean():,.0f} sats")
    print(f"Total Local Balance: {df['local_balance'].sum():,} sats")
    print(f"Total Remote Balance: {df['remote_balance'].sum():,} sats")

    # Fee statistics
    print("\n=== Fee Statistics ===")
    print(f"Average Local Fee Rate: {df['local_fee_rate'].mean():.0f} ppm")
    print(f"Median Local Fee Rate: {df['local_fee_rate'].median():.0f} ppm")
    print(f"Fee Rate Range: {df['local_fee_rate'].min()} - {df['local_fee_rate'].max()} ppm")
    print(f"Total Fees Earned: {df['total_fees_sats'].sum():,.0f} sats")

    # Flow statistics
    print("\n=== Flow Statistics ===")
    # "Active" here means the channel moved any sats at all.
    active_channels = df[df['total_flow_sats'] > 0]
    active_share = len(active_channels) / len(df) * 100
    print(f"Active Channels: {len(active_channels)} ({active_share:.1f}%)")
    print(f"Total Flow: {df['total_flow_sats'].sum():,.0f} sats")
    # mean() of an empty selection is NaN; report 0 instead of "nan sats".
    avg_active_flow = active_channels['total_flow_sats'].mean() if len(active_channels) > 0 else 0.0
    print(f"Average Flow per Active Channel: {avg_active_flow:,.0f} sats")

    # Balance distribution
    print("\n=== Balance Distribution ===")
    balanced = df[(df['balance_ratio'] > 0.3) & (df['balance_ratio'] < 0.7)]
    depleted = df[df['balance_ratio'] < 0.1]
    full = df[df['balance_ratio'] > 0.9]
    print(f"Balanced (30-70%): {len(balanced)} channels")
    print(f"Depleted (<10%): {len(depleted)} channels")
    print(f"Full (>90%): {len(full)} channels")

    # Top performers
    print("\n=== Top 10 Fee Earners ===")
    top_earners = df.nlargest(10, 'total_fees_sats')[['channel_id', 'remote_alias', 'capacity', 'total_fees_sats', 'local_fee_rate', 'balance_ratio']]
    print(top_earners.to_string(index=False))

    # High flow channels
    print("\n=== Top 10 High Flow Channels ===")
    high_flow = df.nlargest(10, 'total_flow_sats')[['channel_id', 'remote_alias', 'total_flow_sats', 'total_fees_sats', 'local_fee_rate']]
    print(high_flow.to_string(index=False))

    # Correlation analysis
    print("\n=== Correlation Analysis ===")
    correlations = {
        'Fee Rate vs Earnings': df['local_fee_rate'].corr(df['total_fees_sats']),
        'Flow vs Earnings': df['total_flow_sats'].corr(df['total_fees_sats']),
        'Capacity vs Flow': df['capacity'].corr(df['total_flow_sats']),
        'Balance Ratio vs Flow': df['balance_ratio'].corr(df['total_flow_sats']),
    }
    for metric, corr in correlations.items():
        print(f"{metric}: {corr:.3f}")

    # Fee optimization opportunities
    print("\n=== Optimization Opportunities ===")

    # High flow, low fee channels: top-quartile flow with a below-median fee rate
    high_flow_low_fee = df[(df['total_flow_sats'] > df['total_flow_sats'].quantile(0.75)) &
                           (df['local_fee_rate'] < df['local_fee_rate'].median())]
    print(f"\nHigh Flow + Low Fees ({len(high_flow_low_fee)} channels):")
    if len(high_flow_low_fee) > 0:
        print(high_flow_low_fee[['channel_id', 'remote_alias', 'total_flow_sats', 'local_fee_rate', 'total_fees_sats']].head())

    # Imbalanced high-value channels: above-median capacity with the local
    # share below 20% or above 80%
    imbalanced = df[((df['balance_ratio'] < 0.2) | (df['balance_ratio'] > 0.8)) &
                    (df['capacity'] > df['capacity'].median())]
    print(f"\nImbalanced High-Capacity Channels ({len(imbalanced)} channels):")
    if len(imbalanced) > 0:
        print(imbalanced[['channel_id', 'remote_alias', 'capacity', 'balance_ratio', 'net_flow_sats']].head())

if __name__ == "__main__":
    # Channel detail files are read from ./data_samples/channels, relative to
    # the current working directory.
    data_dir = Path("data_samples")

    print("Loading channel data...")
    channels = load_channel_data(data_dir)

    print(f"Loaded {len(channels)} channels\n")

    # With nothing loaded there is nothing to analyze or save; stop with a
    # clear message (written to stderr, exit status 1) instead of a traceback.
    if not channels:
        raise SystemExit(f"No channel detail files found in {data_dir / 'channels'}")

    df = analyze_channels(channels)
    print_analysis(df)

    # Save processed data
    df.to_csv("channel_analysis.csv", index=False)
    print("\nAnalysis saved to channel_analysis.csv")