Refactor extract.py: add utility functions, simplify plugin detection, and improve code organization

This commit is contained in:
Gigi
2025-04-01 15:22:34 +01:00
parent ae803f13a0
commit a9509ded80

View File

@@ -7,6 +7,20 @@ from pathlib import Path
import time
from typing import List
def get_base_name(plugin_name: str) -> str:
"""Get the base name of a plugin without any suffixes."""
return plugin_name.split('.')[0]
def read_file(file_path: Path) -> str:
"""Read text from a file."""
with open(file_path, 'r', encoding='utf-8') as f:
return f.read()
def write_file(file_path: Path, content: str) -> None:
"""Write text to a file."""
with open(file_path, 'w', encoding='utf-8') as f:
f.write(content)
def load_plugins() -> dict[str, str]:
"""Load all plugins from the plugins directory."""
plugin_dir = Path("plugins")
@@ -16,46 +30,39 @@ def load_plugins() -> dict[str, str]:
for plugin_file in plugin_dir.glob("*.md"):
plugin_name = plugin_file.stem
if plugin_name != "summary": # Skip the summary plugin as it's used differently
# Keep the full plugin name including all suffixes
plugins[plugin_name] = plugin_file.read_text(encoding='utf-8')
plugins[plugin_name] = read_file(plugin_file)
return plugins
def determine_active_plugins(text: str, available_plugins: List[str]) -> List[str]:
"""Determine which plugins should be run on this transcript."""
active_plugins = []
active_plugins = set() # Use a set to avoid duplicates
# First, add all .all plugins
for plugin_name in available_plugins:
if '.all' in plugin_name:
# Remove all suffixes for the base name
base_name = plugin_name.split('.')[0]
active_plugins.append(base_name)
# Then check for each plugin's content type
for plugin_name in available_plugins:
# Convert plugin name to search pattern (e.g., "blog_post" -> "blog post")
search_pattern = plugin_name.replace('_', ' ')
base_name = get_base_name(plugin_name)
# Check if this is an "or" plugin
# Always include plugins with .all suffix
if '.all' in plugin_name:
active_plugins.add(base_name)
continue
# For .or plugins, check if any word matches
if '.or' in plugin_name:
# Get base name without suffixes
base_name = plugin_name.split('.')[0]
# For or plugins, check if any of the words match
words = base_name.split('_')
if any(re.search(r'\b' + word + r'\b', text.lower()) for word in words):
active_plugins.append(base_name)
active_plugins.add(base_name)
# For regular plugins, check for the exact pattern
else:
# For regular plugins, check for the exact pattern
search_pattern = plugin_name.replace('_', ' ')
if re.search(r'\b' + search_pattern + r'\b', text.lower()):
active_plugins.append(plugin_name)
active_plugins.add(base_name)
return active_plugins
return list(active_plugins)
def generate_additional_content(content_type: str, transcript_text: str, summary_text: str, plugins: dict[str, str]) -> str:
"""Generate additional content based on the content type."""
# Find the plugin key that matches the content type (with or without .or suffix)
plugin_key = next(key for key in plugins.keys() if key.startswith(content_type))
def generate_additional_content(plugin_base_name: str, transcript_text: str, summary_text: str, plugins: dict[str, str]) -> str:
"""Generate additional content using the specified plugin."""
# Find the plugin key that matches the base name
plugin_key = next(key for key in plugins.keys() if get_base_name(key) == plugin_base_name)
prompt_template = plugins[plugin_key]
prompt = prompt_template.format(transcript=transcript_text, summary=summary_text)
@@ -89,8 +96,7 @@ def main():
# Create output directories for each plugin
for plugin_name in plugins.keys():
# Get base name without any suffixes
base_name = plugin_name.split('.')[0]
base_name = get_base_name(plugin_name)
output_dir = voice_memo_dir / f"{base_name}s" # Pluralize the plugin name
output_dir.mkdir(parents=True, exist_ok=True)
# Store with base name as key
@@ -100,30 +106,27 @@ def main():
print("Extracting content...")
# Read transcript
with open(input_file, 'r', encoding='utf-8') as f:
transcript_text = f.read()
transcript_text = read_file(input_file)
# Read summary if it exists
summary_file = input_file.parent / f"{input_file.stem}_summary.txt"
summary_text = ""
if summary_file.exists():
with open(summary_file, 'r', encoding='utf-8') as f:
summary_text = f.read()
summary_text = read_file(summary_file)
# Determine which plugins to run
active_plugins = determine_active_plugins(transcript_text, list(plugins.keys()))
if active_plugins:
for plugin_name in active_plugins:
print(f" Running {plugin_name} plugin...")
additional_content = generate_additional_content(plugin_name, transcript_text, summary_text, plugins)
additional_content = generate_additional_content(get_base_name(plugin_name), transcript_text, summary_text, plugins)
# Save to appropriate directory with timestamp
timestamp = time.strftime("%Y%m%d_%H%M%S")
filename = input_file.stem
output_file = output_dirs[plugin_name] / f"{filename}_{timestamp}.md"
output_file = output_dirs[get_base_name(plugin_name)] / f"{filename}_{timestamp}.md"
with open(output_file, 'w', encoding='utf-8') as f:
f.write(additional_content)
write_file(output_file, additional_content)
print(f" Content saved to: {output_file}")
else:
print(" No matching plugins found for this transcript")