"""Extraction module - pure courier model for data chunking.
|
|
|
|
This module implements the courier model: takes raw tool output and chunks it
|
|
into manageable pieces for AI processing WITHOUT any filtering or interpretation.
|
|
|
|
Pure Courier Principles:
|
|
- NO filtering by severity or importance
|
|
- NO deduplication or sampling
|
|
- NO interpretation of findings
|
|
- ONLY chunks files if they exceed 65KB
|
|
- ALL data preserved exactly as generated
|
|
|
|
The AI consumer decides what's important, not TheAuditor.
|
|
"""
|
|
|
|
import json
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
from collections import defaultdict
|
|
from theauditor.config_runtime import load_runtime_config
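
# Directory layout, for orientation (an illustrative sketch, not authoritative; the
# concrete paths are the ones used by extract_all_to_readthis() below):
#
#   <root>/.pf/raw/        raw tool output written by earlier pipeline stages
#   <root>/.pf/readthis/   verbatim copies, or <name>_chunkNN.<ext> pieces for files
#                          larger than the configured max chunk size, plus
#                          extraction_summary.json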


# DELETED: All smart extraction functions removed
# Pure courier model - no filtering, only chunking if needed


def _chunk_large_file(raw_path: Path, max_chunk_size: Optional[int] = None) -> Optional[List[Tuple[Path, int]]]:
    """Split large files into chunks of configured max size."""
    # Load config if not provided
    if max_chunk_size is None:
        config = load_runtime_config()
        max_chunk_size = config["limits"]["max_chunk_size"]

    # Get max chunks per file from config
    config = load_runtime_config()
    max_chunks_per_file = config["limits"]["max_chunks_per_file"]

    chunks = []
    try:
        # Handle non-JSON files (like .dot, .txt, etc.)
        if raw_path.suffix != '.json':
            # Read as text and chunk if needed
            with open(raw_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()

            # Check if file needs chunking
            if len(content) <= max_chunk_size:
                # Small enough, copy as-is
                output_path = raw_path.parent.parent / 'readthis' / raw_path.name
                output_path.parent.mkdir(parents=True, exist_ok=True)
                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(content)
                size = output_path.stat().st_size
                print(f" [COPIED] {raw_path.name} → {output_path.name} ({size:,} bytes)")
                return [(output_path, size)]
            else:
                # Need to chunk text file
                base_name = raw_path.stem
                ext = raw_path.suffix
                chunk_num = 0
                position = 0

                while position < len(content) and chunk_num < max_chunks_per_file:
                    chunk_num += 1
                    chunk_end = min(position + max_chunk_size, len(content))
                    chunk_content = content[position:chunk_end]

                    output_path = raw_path.parent.parent / 'readthis' / f"{base_name}_chunk{chunk_num:02d}{ext}"
                    output_path.parent.mkdir(parents=True, exist_ok=True)
                    with open(output_path, 'w', encoding='utf-8') as f:
                        f.write(chunk_content)
                    size = output_path.stat().st_size
                    chunks.append((output_path, size))
                    print(f" [CHUNKED] {raw_path.name} → {output_path.name} ({size:,} bytes)")

                    position = chunk_end

                if position < len(content):
                    print(f" [TRUNCATED] {raw_path.name} - stopped at {max_chunks_per_file} chunks")

                return chunks

        # Handle JSON files
        with open(raw_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Check if file needs chunking
        full_json = json.dumps(data, indent=2)
        if len(full_json) <= max_chunk_size:
            # Small enough, copy as-is
            output_path = raw_path.parent.parent / 'readthis' / raw_path.name
            output_path.parent.mkdir(parents=True, exist_ok=True)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(full_json)
            size = output_path.stat().st_size
            print(f" [COPIED] {raw_path.name} → {output_path.name} ({size:,} bytes)")
            return [(output_path, size)]

        # File needs chunking
        base_name = raw_path.stem
        ext = raw_path.suffix

        # Handle different data structures
        if isinstance(data, list):
            # For lists, chunk by items
            chunk_num = 0
            current_chunk = []
            current_size = 100  # Account for JSON structure overhead

            for item in data:
                item_json = json.dumps(item, indent=2)
                item_size = len(item_json)

                if current_size + item_size > max_chunk_size and current_chunk:
                    # Check chunk limit
                    if chunk_num >= max_chunks_per_file:
                        print(f" [TRUNCATED] {raw_path.name} - stopped at {max_chunks_per_file} chunks (would have created more)")
                        break

                    # Write current chunk
                    chunk_num += 1
                    output_path = raw_path.parent.parent / 'readthis' / f"{base_name}_chunk{chunk_num:02d}{ext}"
                    output_path.parent.mkdir(parents=True, exist_ok=True)
                    with open(output_path, 'w', encoding='utf-8') as f:
                        json.dump(current_chunk, f, indent=2)
                    size = output_path.stat().st_size
                    chunks.append((output_path, size))
                    print(f" [CHUNKED] {raw_path.name} → {output_path.name} ({size:,} bytes)")

                    # Start new chunk
                    current_chunk = [item]
                    current_size = item_size + 100
                else:
                    current_chunk.append(item)
                    current_size += item_size

            # Write final chunk (only if under limit)
            if current_chunk and chunk_num < max_chunks_per_file:
                chunk_num += 1
                output_path = raw_path.parent.parent / 'readthis' / f"{base_name}_chunk{chunk_num:02d}{ext}"
                with open(output_path, 'w', encoding='utf-8') as f:
                    json.dump(current_chunk, f, indent=2)
                size = output_path.stat().st_size
                chunks.append((output_path, size))
                print(f" [CHUNKED] {raw_path.name} → {output_path.name} ({size:,} bytes)")

        elif isinstance(data, dict):
            # For dicts with lists (like findings, paths), chunk the lists
            # Determine the correct key to chunk on
            if base_name == 'taint_analysis':
                # For taint analysis, we need to merge ALL findings into one list
                # because they're split across multiple keys
                if 'taint_paths' in data or 'all_rule_findings' in data:
                    # Merge all findings into a single list for chunking
                    all_taint_items = []

                    # Add taint paths
                    if 'taint_paths' in data:
                        for item in data['taint_paths']:
                            item['finding_type'] = 'taint_path'
                            all_taint_items.append(item)

                    # Add all rule findings
                    if 'all_rule_findings' in data:
                        for item in data['all_rule_findings']:
                            item['finding_type'] = 'rule_finding'
                            all_taint_items.append(item)

                    # Add infrastructure issues only if they're different from all_rule_findings
                    # (to avoid duplicates when they're the same list)
                    if 'infrastructure_issues' in data:
                        # Check if they're different objects (not the same list)
                        if data['infrastructure_issues'] is not data.get('all_rule_findings'):
                            # Only add if they're actually different content
                            infra_set = {json.dumps(item, sort_keys=True) for item in data['infrastructure_issues']}
                            rules_set = {json.dumps(item, sort_keys=True) for item in data.get('all_rule_findings', [])}
                            if infra_set != rules_set:
                                for item in data['infrastructure_issues']:
                                    item['finding_type'] = 'infrastructure'
                                    all_taint_items.append(item)

                    # Add paths (data flow paths) - these are often duplicates of taint_paths but may have extra info
                    if 'paths' in data:
                        # Check if different from taint_paths
                        paths_set = {json.dumps(item, sort_keys=True) for item in data['paths']}
                        taint_set = {json.dumps(item, sort_keys=True) for item in data.get('taint_paths', [])}
                        if paths_set != taint_set:
                            for item in data['paths']:
                                item['finding_type'] = 'path'
                                all_taint_items.append(item)

                    # Add vulnerabilities - these are the final analyzed vulnerabilities
                    if 'vulnerabilities' in data:
                        for item in data['vulnerabilities']:
                            item['finding_type'] = 'vulnerability'
                            all_taint_items.append(item)

                    # Create a new data structure with merged findings
                    data = {
                        'success': data.get('success', True),
                        'summary': data.get('summary', {}),
                        'total_vulnerabilities': data.get('total_vulnerabilities', len(all_taint_items)),
                        'sources_found': data.get('sources_found', 0),
                        'sinks_found': data.get('sinks_found', 0),
                        'merged_findings': all_taint_items
                    }
                    list_key = 'merged_findings'
                else:
                    list_key = 'paths'
            elif 'all_findings' in data:
                # CRITICAL: FCE findings are pre-sorted by severity via finding_priority.py
                # The order MUST be preserved during chunking to ensure critical issues
                # appear in chunk01. DO NOT sort or shuffle these findings!
                list_key = 'all_findings'

                # Log for verification
                if data.get(list_key):
                    first_items = data[list_key][:3] if len(data[list_key]) >= 3 else data[list_key]
                    severities = [item.get('severity', 'unknown') for item in first_items]
                    print(f"[EXTRACTION] Processing FCE with {len(data[list_key])} pre-sorted findings")
                    print(f"[EXTRACTION] First 3 severities: {severities}")
            elif 'findings' in data:
                list_key = 'findings'
            elif 'vulnerabilities' in data:
                list_key = 'vulnerabilities'
            elif 'issues' in data:
                list_key = 'issues'
            elif 'edges' in data:
                list_key = 'edges'  # For call_graph.json and import_graph.json
            elif 'nodes' in data:
                list_key = 'nodes'  # For graph files with nodes
            elif 'taint_paths' in data:
                list_key = 'taint_paths'
            elif 'paths' in data:
                list_key = 'paths'
            elif 'dependencies' in data:
                list_key = 'dependencies'  # For deps.json
            elif 'files' in data:
                list_key = 'files'  # For file lists
            elif 'results' in data:
                list_key = 'results'  # For analysis results
            else:
                list_key = None

            if list_key:
                items = data.get(list_key, [])

                # Extract minimal metadata (don't duplicate everything)
                metadata = {}
                for key in ['success', 'summary', 'total_vulnerabilities', 'chunk_info']:
                    if key in data:
                        metadata[key] = data[key]

                # Calculate actual metadata size
                metadata_json = json.dumps(metadata, indent=2)
                metadata_size = len(metadata_json)

                chunk_num = 0
                chunk_items = []
                current_size = metadata_size + 200  # Actual metadata size + bracket overhead

                for item in items:
                    item_json = json.dumps(item, indent=2)
                    item_size = len(item_json)

                    if current_size + item_size > max_chunk_size and chunk_items:
                        # Check chunk limit
                        if chunk_num >= max_chunks_per_file:
                            print(f" [TRUNCATED] {raw_path.name} - stopped at {max_chunks_per_file} chunks (would have created more)")
                            break

                        # Write current chunk
                        chunk_num += 1
                        chunk_data = metadata.copy()
                        chunk_data[list_key] = chunk_items
                        chunk_data['chunk_info'] = {
                            'chunk_number': chunk_num,
                            'total_items_in_chunk': len(chunk_items),
                            'original_total_items': len(items),
                            'list_key': list_key,
                            'truncated': chunk_num >= max_chunks_per_file  # Mark if this is the last allowed chunk
                        }

                        output_path = raw_path.parent.parent / 'readthis' / f"{base_name}_chunk{chunk_num:02d}{ext}"
                        output_path.parent.mkdir(parents=True, exist_ok=True)
                        with open(output_path, 'w', encoding='utf-8') as f:
                            json.dump(chunk_data, f, indent=2)
                        size = output_path.stat().st_size
                        chunks.append((output_path, size))
                        print(f" [CHUNKED] {raw_path.name} → {output_path.name} ({len(chunk_items)} items, {size:,} bytes)")

                        # Start new chunk
                        chunk_items = [item]
                        current_size = metadata_size + item_size + 200
                    else:
                        chunk_items.append(item)
                        current_size += item_size

                # Write final chunk (only if under limit)
                if chunk_items and chunk_num < max_chunks_per_file:
                    chunk_num += 1
                    chunk_data = metadata.copy()
                    chunk_data[list_key] = chunk_items
                    chunk_data['chunk_info'] = {
                        'chunk_number': chunk_num,
                        'total_items_in_chunk': len(chunk_items),
                        'original_total_items': len(items),
                        'list_key': list_key,
                        'truncated': False  # This is the final chunk within limit
                    }

                    output_path = raw_path.parent.parent / 'readthis' / f"{base_name}_chunk{chunk_num:02d}{ext}"
                    with open(output_path, 'w', encoding='utf-8') as f:
                        json.dump(chunk_data, f, indent=2)
                    size = output_path.stat().st_size
                    chunks.append((output_path, size))
                    print(f" [CHUNKED] {raw_path.name} → {output_path.name} ({len(chunk_items)} items, {size:,} bytes)")
            else:
                # No recognized list key - shouldn't happen now with expanded list
                # Log warning and copy as-is
                print(f" [WARNING] No chunkable list found in {raw_path.name}, copying as-is")
                output_path = raw_path.parent.parent / 'readthis' / raw_path.name
                output_path.parent.mkdir(parents=True, exist_ok=True)
                with open(output_path, 'w', encoding='utf-8') as f:
                    json.dump(data, f, indent=2)
                size = output_path.stat().st_size
                chunks.append((output_path, size))
                print(f" [COPIED] {raw_path.name} → {output_path.name} ({size:,} bytes)")

        return chunks

    except Exception as e:
        print(f" [ERROR] Failed to chunk {raw_path.name}: {e}")
        return None  # Return None to signal failure, not empty list
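
# Sketch of the per-chunk structure written by _chunk_large_file() for dict-shaped
# JSON inputs (assembled from the code above; the item counts are made-up examples,
# and only 'chunk_info' is added by this module - every other key comes from the
# source file):
#
#   {
#       "success": true,
#       "summary": {...},
#       "all_findings": [...],          # or whichever list_key was detected
#       "chunk_info": {
#           "chunk_number": 1,
#           "total_items_in_chunk": 250,
#           "original_total_items": 1800,
#           "list_key": "all_findings",
#           "truncated": false
#       }
#   }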


def _copy_as_is(raw_path: Path) -> Tuple[Optional[Path], int]:
    """Copy small files as-is or chunk if >65KB."""
    chunks = _chunk_large_file(raw_path)
    if chunks is None:
        # Chunking failed
        return None, -1  # Signal error with -1
    elif chunks:
        # Return the first chunk info for compatibility
        return chunks[0] if len(chunks) == 1 else (None, sum(s for _, s in chunks))
    return None, 0
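
# Return convention of _copy_as_is(), summarized for callers (derived from the code
# above; the caller below is a hypothetical sketch, not part of this module):
#
#   (path, size)   exactly one output file was written
#   (None, total)  multiple chunks were written; total is their combined size
#   (None, -1)     chunking failed
#   (None, 0)      nothing was written
#
#   path, size = _copy_as_is(Path(".pf/raw/example.json"))
#   if size < 0:
#       ...  # handle the chunking failure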


def extract_all_to_readthis(root_path_str: str, budget_kb: int = 1500) -> bool:
    """Main function for extracting readthis chunks from raw data.

    Pure courier model: every file in .pf/raw is copied into .pf/readthis as-is,
    or chunked if it exceeds the configured max chunk size. Nothing is filtered,
    so all findings - including critical security findings - are preserved. The
    budget is reported for visibility but is not used to drop data.

    Args:
        root_path_str: Root directory path as string
        budget_kb: Reporting budget in KB for all readthis files (default 1500KB)

    Returns:
        True if extraction completed successfully, False otherwise
    """
    root_path = Path(root_path_str)
    raw_dir = root_path / ".pf" / "raw"
    readthis_dir = root_path / ".pf" / "readthis"

    print("\n" + "="*60)
    print("[EXTRACTION] Pure courier extraction - chunking only, no filtering")
    print("="*60)

    # Check if raw directory exists
    if not raw_dir.exists():
        print(f"[WARNING] Raw directory does not exist: {raw_dir}")
        print("[INFO] No raw data to extract - skipping extraction phase")
        return True

    # Ensure readthis directory exists
    try:
        readthis_dir.mkdir(parents=True, exist_ok=True)
        print(f"[OK] Readthis directory ready: {readthis_dir}")
    except Exception as e:
        print(f"[ERROR] Failed to create readthis directory: {e}")
        return False

    # Discover ALL files in raw directory dynamically (courier model)
    raw_files = []
    for file_path in raw_dir.iterdir():
        if file_path.is_file():
            raw_files.append(file_path.name)

    print(f"[DISCOVERED] Found {len(raw_files)} files in raw directory")

    # Pure courier model - no smart extraction, just chunking if needed
    # Build extraction strategy dynamically
    extraction_strategy = []
    for filename in sorted(raw_files):
        # All files get same treatment: chunk if needed
        extraction_strategy.append((filename, 100, _copy_as_is))

    total_budget = budget_kb * 1024  # Convert to bytes
    total_used = 0
    extracted_files = []
    skipped_files = []
    failed_files = []  # Track failures

    print(f"[BUDGET] Total budget: {budget_kb}KB ({total_budget:,} bytes)")
    print(f"[STRATEGY] Pure courier model - no filtering\n")

    for filename, file_budget_kb, extractor in extraction_strategy:
        raw_path = raw_dir / filename

        if not raw_path.exists():
            continue

        print(f"[PROCESSING] {filename}")

        # Just chunk everything - ignore budget for chunking
        # The whole point is to break large files into manageable pieces
        chunks = _chunk_large_file(raw_path)

        if chunks is None:
            # Chunking failed for this file
            print(f" [FAILED] {filename} - chunking error")
            failed_files.append(filename)
            continue

        if chunks:
            for chunk_path, chunk_size in chunks:
                # Optionally check budget per chunk (or ignore completely)
                if total_used + chunk_size > total_budget:
                    # Could skip remaining chunks or just ignore budget
                    # For now, let's just ignore budget and extract everything
                    pass

                total_used += chunk_size
                extracted_files.append((chunk_path.name, chunk_size))

    # Create extraction summary
    summary = {
        'extraction_timestamp': str(Path(root_path_str).stat().st_mtime),
        'budget_kb': budget_kb,
        'total_used_bytes': total_used,
        'total_used_kb': total_used // 1024,
        'utilization_percent': (total_used / total_budget) * 100,
        'files_extracted': len(extracted_files),
        'files_skipped': len(skipped_files),
        'files_failed': len(failed_files),
        'extracted': [{'file': f, 'size': s} for f, s in extracted_files],
        'skipped': skipped_files,
        'failed': failed_files,
        'strategy': 'Pure courier model - chunk if needed, no filtering'
    }

    summary_path = readthis_dir / 'extraction_summary.json'
    with open(summary_path, 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2)

    # Summary report
    print("\n" + "="*60)
    print("[EXTRACTION COMPLETE]")
    print(f" Files extracted: {len(extracted_files)}")
    print(f" Files skipped: {len(skipped_files)}")
    print(f" Files failed: {len(failed_files)}")
    print(f" Total size: {total_used:,} bytes ({total_used//1024}KB)")
    print(f" Budget used: {(total_used/total_budget)*100:.1f}%")
    print(f" Summary saved: {summary_path}")

    # List what was extracted
    print("\n[EXTRACTED FILES]")
    for filename, size in extracted_files:
        print(f" {filename:30} {size:8,} bytes ({size//1024:4}KB)")

    if skipped_files:
        print("\n[SKIPPED FILES]")
        for filename in skipped_files:
            print(f" {filename}")

    if failed_files:
        print("\n[FAILED FILES]")
        for filename in failed_files:
            print(f" {filename}")

    print("\n[KEY INSIGHTS]")
    print(" ✓ All findings preserved - no filtering")
    print(" ✓ Pure courier model - no interpretation")
    print(" ✓ Files chunked only if >65KB")
    print(" ✓ Complete data for AI consumption")
    print("="*60)

    # Return False if any files failed, True only if all succeeded
    if failed_files:
        print(f"\n[ERROR] Extraction failed for {len(failed_files)} files")
        return False
    return True
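

# Illustrative usage sketch (an assumption, not part of the original module's API):
# the pipeline normally imports extract_all_to_readthis(); this guard only shows how
# it can be invoked directly on a project root and has no effect on import.
if __name__ == "__main__":
    import sys

    # Assumed CLI: optional first argument is the project root (defaults to ".").
    target_root = sys.argv[1] if len(sys.argv) > 1 else "."
    ok = extract_all_to_readthis(target_root, budget_kb=1500)
    sys.exit(0 if ok else 1)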