Files
turso/testing/gen-bigass-database.py
Jussi Saurio cc46bfb6d8 add gen-bigass-database.py
ability to create an equivalent big schema so that the issues
reported by Preston using that larger schema can be reproduced
and fixed.
2025-09-08 13:05:33 -04:00

408 lines
14 KiB
Python

#!/usr/bin/env python3
import random
import sqlite3
from faker import Faker
conn = sqlite3.connect("testing/testing-bigass.db")
cursor = conn.cursor()
fake = Faker()
cursor.execute("""
CREATE TABLE IF NOT EXISTS users (
id INTEGER PRIMARY KEY,
first_name TEXT,
last_name TEXT,
email TEXT,
phone_number TEXT,
address TEXT,
city TEXT,
state TEXT,
zipcode TEXT,
age INTEGER,
created_at TIMESTAMP,
updated_at TIMESTAMP
)
""")
cursor.execute(
"""
CREATE TABLE products (
id INTEGER PRIMARY KEY,
name TEXT,
price REAL
);
""",
[],
)
# specific products we already test for
cursor.execute("INSERT INTO products VALUES(1,'hat',79.0);")
cursor.execute("INSERT INTO products VALUES(2,'cap',82.0);")
cursor.execute("INSERT INTO products VALUES(3,'shirt',18.0);")
cursor.execute("INSERT INTO products VALUES(4,'sweater',25.0);")
cursor.execute("INSERT INTO products VALUES(5,'sweatshirt',74.0);")
cursor.execute("INSERT INTO products VALUES(6,'shorts',70.0);")
cursor.execute("INSERT INTO products VALUES(7,'jeans',78.0);")
cursor.execute("INSERT INTO products VALUES(8,'sneakers',82.0);")
cursor.execute("INSERT INTO products VALUES(9,'boots',1.0);")
cursor.execute("INSERT INTO products VALUES(10,'coat',33.0);")
cursor.execute("INSERT INTO products VALUES(11,'accessories',81.0);")
for i in range(12, 12001):
name = fake.word().title()
price = round(random.uniform(5.0, 999.99), 2)
cursor.execute("INSERT INTO products (id, name, price) VALUES (?, ?, ?)", [i, name, price])
cursor.execute("""
CREATE TABLE IF NOT EXISTS orders (
id INTEGER PRIMARY KEY,
user_id INTEGER,
order_date TIMESTAMP,
total_amount REAL,
status TEXT,
shipping_address TEXT,
shipping_city TEXT,
shipping_state TEXT,
shipping_zip TEXT,
payment_method TEXT,
tracking_number TEXT,
notes TEXT,
FOREIGN KEY (user_id) REFERENCES users(id)
)
""")
cursor.execute("""
CREATE TABLE IF NOT EXISTS order_items (
id INTEGER PRIMARY KEY,
order_id INTEGER,
product_id INTEGER,
quantity INTEGER,
unit_price REAL,
discount REAL,
tax REAL,
total_price REAL,
FOREIGN KEY (order_id) REFERENCES orders(id),
FOREIGN KEY (product_id) REFERENCES products(id)
)
""")
cursor.execute("""
CREATE TABLE IF NOT EXISTS reviews (
id INTEGER PRIMARY KEY,
product_id INTEGER,
user_id INTEGER,
rating INTEGER,
title TEXT,
comment TEXT,
helpful_count INTEGER,
verified_purchase BOOLEAN,
review_date TIMESTAMP,
FOREIGN KEY (product_id) REFERENCES products(id),
FOREIGN KEY (user_id) REFERENCES users(id)
)
""")
cursor.execute("""
CREATE TABLE IF NOT EXISTS inventory_transactions (
id INTEGER PRIMARY KEY,
product_id INTEGER,
transaction_type TEXT,
quantity INTEGER,
previous_quantity INTEGER,
new_quantity INTEGER,
transaction_date TIMESTAMP,
reference_type TEXT,
reference_id INTEGER,
notes TEXT,
performed_by TEXT,
FOREIGN KEY (product_id) REFERENCES products(id)
)
""")
cursor.execute("""
CREATE TABLE IF NOT EXISTS customer_support_tickets (
id INTEGER PRIMARY KEY,
user_id INTEGER,
order_id INTEGER,
ticket_number TEXT,
category TEXT,
priority TEXT,
status TEXT,
subject TEXT,
description TEXT,
created_at TIMESTAMP,
updated_at TIMESTAMP,
resolved_at TIMESTAMP,
assigned_to TEXT,
resolution_notes TEXT,
FOREIGN KEY (user_id) REFERENCES users(id),
FOREIGN KEY (order_id) REFERENCES orders(id)
)
""")
print("Generating users...")
users_data = []
for i in range(15000):
if i % 1000 == 0:
print(f" Generated {i} users...")
first_name = fake.first_name()
last_name = fake.last_name()
email = fake.email()
phone_number = fake.phone_number()
address = fake.street_address()
city = fake.city()
state = fake.state_abbr()
zipcode = fake.zipcode()
age = fake.random_int(min=18, max=85)
created_at = fake.date_time_between(start_date="-3y", end_date="now")
updated_at = fake.date_time_between(start_date=created_at, end_date="now")
users_data.append(
(first_name, last_name, email, phone_number, address, city, state, zipcode, age, created_at, updated_at)
)
cursor.executemany(
"""
INSERT INTO users (first_name, last_name, email, phone_number, address,
city, state, zipcode, age, created_at, updated_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""",
users_data,
)
print("Generating orders...")
order_statuses = ["pending", "processing", "shipped", "delivered", "cancelled", "refunded"]
payment_methods = ["credit_card", "debit_card", "paypal", "apple_pay", "google_pay", "bank_transfer"]
orders_data = []
for i in range(20000):
if i % 2000 == 0:
print(f" Generated {i} orders...")
user_id = random.randint(1, 15000)
order_date = fake.date_time_between(start_date="-1y", end_date="now")
total_amount = round(random.uniform(10.0, 5000.0), 2)
status = random.choice(order_statuses)
shipping_address = fake.street_address()
shipping_city = fake.city()
shipping_state = fake.state_abbr()
shipping_zip = fake.zipcode()
payment_method = random.choice(payment_methods)
tracking_number = fake.ean13() if status in ["shipped", "delivered"] else None
notes = fake.text(max_nb_chars=100) if random.random() < 0.3 else None
orders_data.append(
(
user_id,
order_date,
total_amount,
status,
shipping_address,
shipping_city,
shipping_state,
shipping_zip,
payment_method,
tracking_number,
notes,
)
)
cursor.executemany(
"""
INSERT INTO orders (user_id, order_date, total_amount, status, shipping_address,
shipping_city, shipping_state, shipping_zip, payment_method,
tracking_number, notes)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""",
orders_data,
)
print("Generating order items...")
order_items_data = []
for order_id in range(1, 20001):
if order_id % 2000 == 0:
print(f" Generated items for {order_id} orders...")
num_items = random.randint(1, 8)
for _ in range(num_items):
product_id = random.randint(1, 12000)
quantity = random.randint(1, 5)
unit_price = round(random.uniform(0.99, 999.99), 2)
discount = round(random.uniform(0, 0.3) * unit_price, 2) if random.random() < 0.2 else 0
tax = round((unit_price - discount) * quantity * 0.08, 2)
total_price = round((unit_price - discount) * quantity + tax, 2)
order_items_data.append((order_id, product_id, quantity, unit_price, discount, tax, total_price))
cursor.executemany(
"""
INSERT INTO order_items (order_id, product_id, quantity, unit_price,
discount, tax, total_price)
VALUES (?, ?, ?, ?, ?, ?, ?)
""",
order_items_data,
)
print("Generating reviews...")
reviews_data = []
for i in range(25000):
if i % 2500 == 0:
print(f" Generated {i} reviews...")
product_id = random.randint(1, 12000)
user_id = random.randint(1, 15000)
rating = random.choices([1, 2, 3, 4, 5], weights=[5, 10, 15, 30, 40])[0]
title = fake.catch_phrase()
comment = fake.text(max_nb_chars=500)
helpful_count = random.randint(0, 100)
verified_purchase = random.choice([0, 1])
review_date = fake.date_time_between(start_date="-1y", end_date="now")
reviews_data.append((product_id, user_id, rating, title, comment, helpful_count, verified_purchase, review_date))
cursor.executemany(
"""
INSERT INTO reviews (product_id, user_id, rating, title, comment,
helpful_count, verified_purchase, review_date)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
""",
reviews_data,
)
print("Generating inventory transactions...")
transaction_types = ["purchase", "sale", "return", "adjustment", "transfer", "damage"]
reference_types = ["order", "return", "adjustment", "transfer", "manual"]
inventory_data = []
for i in range(18000):
if i % 2000 == 0:
print(f" Generated {i} inventory transactions...")
product_id = random.randint(1, 12000)
transaction_type = random.choice(transaction_types)
quantity = random.randint(1, 100)
previous_quantity = random.randint(0, 1000)
new_quantity = (
previous_quantity + quantity if transaction_type in ["purchase", "return"] else previous_quantity - quantity
)
new_quantity = max(0, new_quantity)
transaction_date = fake.date_time_between(start_date="-6m", end_date="now")
reference_type = random.choice(reference_types)
reference_id = random.randint(1, 20000) if reference_type == "order" else random.randint(1, 1000)
notes = fake.text(max_nb_chars=100) if random.random() < 0.3 else None
performed_by = fake.name()
inventory_data.append(
(
product_id,
transaction_type,
quantity,
previous_quantity,
new_quantity,
transaction_date,
reference_type,
reference_id,
notes,
performed_by,
)
)
cursor.executemany(
"""
INSERT INTO inventory_transactions (product_id, transaction_type, quantity, previous_quantity,
new_quantity, transaction_date, reference_type, reference_id,
notes, performed_by)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""",
inventory_data,
)
print("Generating customer support tickets...")
ticket_categories = ["shipping", "product", "payment", "account", "return", "technical", "other"]
priorities = ["low", "medium", "high", "urgent"]
ticket_statuses = ["open", "in_progress", "waiting_customer", "resolved", "closed"]
tickets_data = []
for i in range(10000):
if i % 1000 == 0:
print(f" Generated {i} support tickets...")
user_id = random.randint(1, 15000)
order_id = random.randint(1, 20000) if random.random() < 0.7 else None
ticket_number = f"TICKET-{fake.random_int(min=100000, max=999999)}"
category = random.choice(ticket_categories)
priority = random.choice(priorities)
status = random.choice(ticket_statuses)
subject = fake.catch_phrase()
description = fake.text(max_nb_chars=1000)
created_at = fake.date_time_between(start_date="-6m", end_date="now")
updated_at = fake.date_time_between(start_date=created_at, end_date="now")
resolved_at = (
fake.date_time_between(start_date=updated_at, end_date="now") if status in ["resolved", "closed"] else None
)
assigned_to = fake.name() if status != "open" else None
resolution_notes = fake.text(max_nb_chars=500) if status in ["resolved", "closed"] else None
tickets_data.append(
(
user_id,
order_id,
ticket_number,
category,
priority,
status,
subject,
description,
created_at,
updated_at,
resolved_at,
assigned_to,
resolution_notes,
)
)
cursor.executemany(
"""
INSERT INTO customer_support_tickets (user_id, order_id, ticket_number, category, priority,
status, subject, description, created_at, updated_at,
resolved_at, assigned_to, resolution_notes)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""",
tickets_data,
)
print("Creating indexes...")
cursor.execute("CREATE INDEX age_idx on users (age)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_orders_user_id ON orders(user_id)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_orders_status ON orders(status)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_order_items_order_id ON order_items(order_id)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_order_items_product_id ON order_items(product_id)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_reviews_product_id ON reviews(product_id)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_reviews_user_id ON reviews(user_id)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_inventory_product_id ON inventory_transactions(product_id)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_tickets_user_id ON customer_support_tickets(user_id)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_tickets_status ON customer_support_tickets(status)")
conn.commit()
# Print summary statistics
print("\n=== Database Generation Complete ===")
print(f"Users: {cursor.execute('SELECT COUNT(*) FROM users').fetchone()[0]:,}")
print(f"Products: {cursor.execute('SELECT COUNT(*) FROM products').fetchone()[0]:,}")
print(f"Orders: {cursor.execute('SELECT COUNT(*) FROM orders').fetchone()[0]:,}")
print(f"Order Items: {cursor.execute('SELECT COUNT(*) FROM order_items').fetchone()[0]:,}")
print(f"Reviews: {cursor.execute('SELECT COUNT(*) FROM reviews').fetchone()[0]:,}")
print(f"Inventory Transactions: {cursor.execute('SELECT COUNT(*) FROM inventory_transactions').fetchone()[0]:,}")
print(f"Support Tickets: {cursor.execute('SELECT COUNT(*) FROM customer_support_tickets').fetchone()[0]:,}")
# Calculate approximate database size
cursor.execute("SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()")
size_bytes = cursor.fetchone()[0]
print(f"\nApproximate database size: {size_bytes / (1024 * 1024):.2f} MB")
conn.close()
print("\nDatabase created successfully!")