#!/usr/bin/env python3
"""Ingest all validated Reddit signals into SQLite."""
import sqlite3, os

DB = os.path.join("/root/workspace", "saas_opportunities.db")
conn = sqlite3.connect(DB)
conn.row_factory = sqlite3.Row

# --- RAW SIGNALS from Reddit research (15 validated pain points) ---
signals = [
    ("reddit", "https://reddit.com/r/smallbusiness/comments/1lnevis/",
     "r/smallbusiness - Invoice chasing pain",
     "Anyone else spend way too much time chasing invoices? I'm sitting here on a Sunday writing follow-up emails for overdue invoices. Just spent 2 hours this week chasing payments.",
     "SMB owners waste 2+ hrs/week chasing unpaid invoices manually",
     "SMB Owner / Freelancer",
     "Manual email follow-ups, spreadsheets", 4),

    ("reddit", "https://reddit.com/r/Bookkeeping/comments/1kh220h/",
     "r/Bookkeeping - Client categorization pain",
     "The pain of going back and forth with clients to categorize transactions every month. Clients ignore emails and lose spreadsheets.",
     "Bookkeepers spend 2 weeks/month chasing clients to categorize transactions",
     "Bookkeeper",
     "Email back-and-forth, spreadsheets", 5),

    ("reddit", "https://reddit.com/r/smallbusiness/",
     "r/smallbusiness - Trades forgetting to follow up invoices",
     "Plumbers, HVAC, electricians send invoices but forget follow-up. Cash flow suffers even when fully booked. (3,536 upvotes, 314 comments)",
     "Trades businesses lose cash flow because they forget to send invoice payment reminders",
     "Trades business owner (plumber, HVAC, electrician)",
     "Manual phone calls, lost invoices", 5),

    ("reddit", "https://reddit.com/r/PPC/comments/1s1ppxb/",
     "r/PPC - Client reporting getting worse",
     "Client reporting is the only part that's gotten worse. Agencies spend 5-15 hrs/client/month pulling data from Google Ads, Meta, GA4.",
     "Agencies waste 5-15 hrs/client/month on manual reporting across ad platforms",
     "Agency owner / Marketing manager",
     "Looker Studio, Supermetrics, manual Excel", 4),

    ("reddit", "https://reddit.com/r/FulfillmentByAmazon/comments/1qz6nu4/",
     "r/FBA - Amazon overcharging fees",
     "Sick of Amazon's Dimension Inflation - $3-5 per unit overcharges. Sellers lose $3k-$15k/year.",
     "Amazon sellers lose $3k-$15k/year in overcharged fees they cannot easily audit",
     "Amazon FBA Seller",
     "Manual audit, existing tools charge 25% of recovered", 4),

    ("reddit", "https://www.reddit.com/r/Accounting/comments/1rfh0wn/",
     "r/Accounting - Stripe imports into QBO",
     "How are you handling Stripe imports into QBO? Bookkeepers spend 1-3 hrs/month reformatting Stripe CSV.",
     "Bookkeepers waste 1-3 hrs/month per client reformatting Stripe exports for QBO",
     "Bookkeeper / Accountant",
     "Manual CSV reformatting", 4),

    ("reddit", "https://reddit.com/r/smallbusiness/",
     "r/smallbusiness - Sales tax forgotten",
     "Forgot to collect sales tax for 18 months and their state sent them a letter.",
     "SMB owners risk fines from uncollected multi-state sales tax",
     "SMB E-commerce owner",
     "Expensive tools (Avalara), ignore problem", 5),

    ("reddit", "https://reddit.com/r/logistics/comments/1ry6g0a/",
     "r/logistics - 15 hours a week tracking shipments",
     "Spending 15 hours a week manually tracking shipments across 8-10 carrier portals.",
     "Logistics ops teams waste 15+ hrs/week checking multiple carrier portals",
     "Logistics operations manager",
     "Manual portal checking, spreadsheets", 4),

    ("reddit", "https://reddit.com/r/PropertyManagement/",
     "r/PropertyManagement - Small landlord tools",
     "Existing software designed for 100+ unit managers. Small landlords use Google Sheets, email, texts.",
     "Small landlords (2-20 units) have no affordable property management tool",
     "Small landlord",
     "Google Sheets, email, text messages", 4),

    ("reddit", "https://reddit.com/r/digital_marketing/comments/1s439ue/",
     "r/digital_marketing - Brands not visible in AI answers",
     "Why brands rank #1 in Google but get zero mentions in AI answers. No monitoring tool exists.",
     "Brands cannot measure their visibility in ChatGPT, Perplexity, AI Overviews",
     "Digital marketing manager",
     "Manual checking of AI tools, no existing solution", 4),

    ("reddit", "https://reddit.com/r/accounting/",
     "r/accounting - Construction WIP manual",
     "Every month end I am exporting data into Excel and manually calculating percent complete for each job.",
     "Construction accountants waste 8-12 hrs/client/month on manual WIP calculations",
     "Construction accountant",
     "Manual Excel calculations", 4),

    ("reddit", "https://reddit.com/r/SaaS/comments/1q9kb9h/",
     "r/SaaS - Is churn a real problem",
     "Is churn rate a painful problem that they would require someone to help them solve or is it a nice to have?",
     "SaaS founders struggle to predict and reduce churn without dedicated tooling",
     "SaaS Founder ($200k-$2M ARR)",
     "Spreadsheets, manual cohort analysis", 4),

    ("reddit", "https://reddit.com/r/smallbusiness/",
     "r/smallbusiness - Task management too expensive",
     "Managing employee tasks without spending thousands on software feels completely impossible. (47 upvotes, 43 comments)",
     "Small businesses need affordable task management that does not cost thousands",
     "Small business owner",
     "Paper, free tools with limited features", 3),

    ("reddit", "https://reddit.com/r/smallbusiness/",
     "r/smallbusiness - Stripe chargeback fees",
     "Stripe charges you $15 for every chargeback - even if you win. (47 upvotes, 28 comments)",
     "SMBs pay $15/chargeback fee to Stripe even when they win the dispute",
     "SMB owner (e-commerce)",
     "Manual dispute response, eat the cost", 3),
]

for s in signals:
    conn.execute(
        "INSERT INTO raw_signals (source, url, title, text, detected_pain_point, persona, workaround, severity) VALUES (?,?,?,?,?,?,?,?)",
        s
    )

# --- PROMOTE TO OPPORTUNITIES ---
rows = conn.execute("SELECT id, detected_pain_point FROM raw_signals ORDER BY id").fetchall()

signal_text_to_id = {}
for r in rows:
    signal_text_to_id[r["detected_pain_point"]] = r["id"]

# Match opportunities to signals by keyword matching
# (vertical, value_prop, complexity, potential, distribution, monetization, keywords_to_match)
opportunities = [
    ("Finance/Compliance", "Invoice Chaser AI - automated payment reminders with SMB branding",
     5, 4, 4, 5, ["invoice", "chasing", "overdue"]),
    ("Finance/Compliance", "Bookkeeper Transaction Categorization Portal - Tinder-style swipe categorization",
     4, 5, 4, 4, ["categorize", "bookkeeper", "categorization"]),
    ("Trades/Contractors", "Trades Invoice Reminder - SMS+email auto-followup for plumbers/HVAC",
     5, 5, 5, 5, ["trades", "plumber", "forget"]),
    ("Marketing/Agency", "Agency Client Report Generator - narrative PDF reports from ad platform data",
     4, 3, 4, 4, ["agenc", "reporting", "hours"]),
    ("E-commerce", "Amazon FBA Fee Auditor - flat-fee reimbursement claim generator",
     3, 4, 4, 4, ["amazon", "overcharg", "fba"]),
    ("Finance/Compliance", "Stripe-QBO Reconciliator - CSV auto-mapper for QuickBooks import",
     3, 5, 4, 4, ["stripe", "qbo", "reformat"]),
    ("E-commerce/Compliance", "SMB Sales Tax Auto-Pilot - multi-state calculation, collection, filing",
     5, 3, 4, 5, ["sales tax", "tax", "state"]),
    ("Logistics/Supply Chain", "Freight Tracker Dashboard - unified carrier tracking with delay alerts",
     4, 3, 3, 4, ["tracking", "carrier", "portal"]),
    ("Real Estate", "Small Landlord OS - rent, maintenance, leases for 2-20 units",
     3, 4, 4, 4, ["landlord", "property", "sheets"]),
    ("Marketing/AI", "AI Brand Visibility Monitor - track mentions in ChatGPT/Perplexity/AI Overviews",
     4, 3, 4, 5, ["ai", "visibility", "mention"]),
    ("Construction/Compliance", "Construction WIP Auto-Reports - GAAP-compliant schedules from QBO",
     4, 4, 3, 4, ["construction", "wip", "percent"]),
    ("SaaS/Tools", "SaaS Churn Radar - churn prediction + retention playbook from support tickets",
     3, 3, 4, 4, ["churn", "saas", "retention"]),
    ("SMB/Tools", "SMB Task Manager Lite - affordable employee task management ($19/mo)",
     3, 4, 4, 3, ["task", "affordable", "thousands"]),
    ("E-commerce/Finance", "Chargeback Defender - automated dispute response with evidence collection",
     3, 4, 3, 3, ["chargeback", "stripe", "dispute"]),
]

imported = 0
for opp in opportunities:
    vertical, value_prop, complexity, potential, distribution, monetization, keywords = opp
    sig_id = None
    for pain_text, sid in signal_text_to_id.items():
        if any(kw.lower() in pain_text.lower() for kw in keywords):
            sig_id = sid
            break
    if sig_id:
        conn.execute(
            "INSERT INTO saas_opportunities (signal_id, vertical, value_proposition, complexity_score, potential_score, distribution_score, monetization_score, status) VALUES (?,?,?,?,?,?,?,'validated')",
            (sig_id, vertical, value_prop, complexity, potential, distribution, monetization)
        )
        conn.execute("UPDATE raw_signals SET processed=1 WHERE id=?", (sig_id,))
        imported += 1

conn.commit()

count_signals = conn.execute("SELECT COUNT(*) FROM raw_signals").fetchone()[0]
count_opps = conn.execute("SELECT COUNT(*) FROM saas_opportunities").fetchone()[0]
print(f"OK {count_signals} raw_signals imported")
print(f"OK {count_opps} opportunities promoted")
print(f"OK {imported} matched")
conn.close()
