#!/usr/bin/env python3
"""Round 2: Ingest fresh SaaS opportunities from aggregate source analysis."""
import sqlite3, os, json

DB = os.path.join("/root/workspace", "saas_opportunities.db")
conn = sqlite3.connect(DB)
conn.row_factory = sqlite3.Row

# --- Check what's already in DB to avoid dupes ---
existing_props = set(
    r["value_proposition"].lower().strip()
    for r in conn.execute("SELECT value_proposition FROM saas_opportunities").fetchall()
)

def is_dupe(name):
    n = name.lower().strip()
    for e in existing_props:
        # Check for significant word overlap
        n_words = set(n.split()[:5])  # first 5 words as fingerprint
        e_words = set(e.split()[:5])
        if len(n_words & e_words) >= 3:
            return True
    return False

# ====== NEW RAW SIGNALS (from LaunchSaaS, NicheCheck, StartupIdeasDB, PainFinder, Superframeworks) ======
new_signals = [
    # 1 — AI Legal-Doc Reviewer for Freelancers (StartupIdeasDB, LaunchSaaS)
    (
        1,
        "aggregate", "https://startupideasdb.com/blog/startup-ideas-2026-validated-list",
        "r/freelance - Freelancers sign bad contracts, lose money",
        '"Got screwed by my contract" threads on r/freelance are a recurring theme. Freelancers sign client contracts without legal review because hiring a lawyer costs $300-500/hr.',
        "Freelancers lack affordable legal review for client contracts they sign",
        "Freelancer / Solopreneur",
        "Ignore it, hire lawyer at $300-500/hr, or sign blind", 4
    ),
    # 2 — Returns Analysis Dashboard for E-commerce (NicheCheck)
    (
        2,
        "aggregate", "https://nichecheck.com/blog/micro-saas-ideas-2026",
        "NicheCheck - Returns analysis gap for small stores",
        'E-commerce stores with $50K+/mo have no affordable tool to analyze return patterns. Existing tools (Returnly, Loop) focus on processing returns, not analyzing WHY customers return items.',
        "Small e-commerce stores have no visibility into WHY customers return items",
        "E-commerce owner ($50K-$500K/mo revenue)",
        "Manual spreadsheet analysis, no insights", 4
    ),
    # 3 — Discord Analytics Dashboard (PainFinder, cross-referenced)
    (
        3,
        "aggregate", "https://www.pain-finder.com/blog/micro-saas-ideas-2025",
        "r/Discord - Server owners want analytics",
        'Discord server owners want to know which channels are dead, who top contributors are, when engagement peaks. No simple tool exists — only complex server stats bots.',
        "Discord server owners have no simple analytics for engagement, dead channels, and contributors",
        "Discord community manager / Server owner (500+ members)",
        "Manual observation, complex bots", 4
    ),
    # 4 — Freelance Invoicing with Milestones (PainFinder, LaunchSaaS cross-ref)
    (
        4,
        "aggregate", "https://launchsaas.org/blog/micro-saas-ideas-validated-reddit-2026",
        "r/freelance - Project-based invoicing pain",
        'Freelancers who bill by project (not hourly) need milestone-based partial payments with automatic reminders. Existing invoicing tools (FreshBooks, Wave) are built for hourly billing.',
        "Project-based freelancers lack invoicing tools with milestone billing and partial payments",
        "Freelance developer / Designer / Consultant",
        "Custom spreadsheets, manual payment follow-ups", 4
    ),
    # 5 — YouTube Thumbnail A/B Tester (NicheCheck, cross-ref Google Trends)
    (
        5,
        "aggregate", "https://nichecheck.com/blog/micro-saas-ideas-2026",
        "YouTube creators need thumbnail testing",
        'YouTube does not offer native thumbnail A/B testing. Creators with 10K+ subs want to test which thumbnails drive highest CTR. Existing tools (TubeBuddy) are bloated and expensive.',
        "YouTube creators (10K+ subs) have no simple thumbnail A/B testing tool",
        "YouTube content creator / Channel manager",
        "Manual swap-and-check, TubeBuddy (bloated)", 3
    ),
    # 6 — API Changelog Monitor (NicheCheck, Superframeworks cross-ref)
    (
        6,
        "aggregate", "https://superframeworks.com/articles/profitable-micro-saas-niches",
        "Devs need API breaking change alerts",
        'Dev teams using 5+ third-party APIs have no tool that monitors API docs/endpoints for breaking changes. Monitoring tools focus on uptime, not schema changes. $8M TAM.',
        "Engineering teams lack alerts when third-party APIs change their schemas or endpoints",
        "Developer / Engineering lead (5+ API integrations)",
        "Manual testing, reactive debugging when API breaks", 4
    ),
    # 7 — Client Portal for Agencies (PainFinder, LaunchSaaS)
    (
        7,
        "aggregate", "https://www.pain-finder.com/blog/micro-saas-ideas-2025",
        "Agencies need one place for client files/status",
        'Agencies need a single white-label portal where clients can see project status, files, deliverables, and invoices. Existing solutions scattered across Slack, email, Google Drive.',
        "Digital agencies lack a simple white-label client portal for status/files/invoices",
        "Agency owner / Project manager (3-20 clients)",
        "Scattered across Slack, email, Google Drive, Asana", 3
    ),
    # 8 — Supplier Price Tracker (NicheCheck)
    (
        8,
        "aggregate", "https://nichecheck.com/blog/micro-saas-ideas-2026",
        "E-com businesses need supplier price monitoring",
        'Businesses sourcing from Alibaba, Thomasnet have no tool to automatically track price drops and new suppliers. Manual checking costs hours weekly. $15M TAM, low competition.',
        "E-commerce importers have no automated supplier price drop / new supplier alerts",
        "E-commerce sourcing manager / Importer",
        "Manual Alibaba browsing, spreadsheets", 3
    ),
    # 9 — Fake Amazon Review Detector (StartupIdeasDB)
    (
        9,
        "aggregate", "https://startupideasdb.com/blog/startup-ideas-2026-validated-list",
        "2.3M Amazon shoppers complain about fake reviews",
        '2.3M Amazon shoppers in r/Amazon complain about review manipulation. No simple browser extension flags fake reviews using purchase history patterns.',
        "Amazon shoppers cannot easily identify fake/manipulated product reviews",
        "Amazon online shopper",
        "Manual cross-checking, Fakespot (clunky)", 3
    ),
    # 10 — Podcast Show Notes Generator (NicheCheck)
    (
        10,
        "aggregate", "https://nichecheck.com/blog/micro-saas-ideas-2026",
        "Podcasters waste hours on show notes",
        'Podcasters publishing 1+ episodes/week spend 2-4 hours per episode writing show notes, timestamps, SEO blog posts. AI can generate these from audio. $12M TAM, low competition.',
        "Podcasters waste 2-4 hrs/episode on manual show notes and SEO blog posts",
        "Podcaster (1+ episodes/week)",
        "Manual transcription + writing, expensive VA", 3
    ),
]

# Insert signals
inserted_count = 0
for s in new_signals:
    sid, source, url, title, text, pain_point, persona, workaround, severity = s
    conn.execute(
        "INSERT INTO raw_signals (source, url, title, text, detected_pain_point, persona, workaround, severity) VALUES (?,?,?,?,?,?,?,?)",
        (source, url, title, text, pain_point, persona, workaround, severity)
    )
    inserted_count += 1

# Get the new signal IDs
new_rows = conn.execute(
    "SELECT id, detected_pain_point FROM raw_signals WHERE processed=0 ORDER BY id DESC LIMIT ?",
    (inserted_count,)
).fetchall()
new_signal_map = {r["detected_pain_point"]: r["id"] for r in new_rows}

print(f"Inserted {inserted_count} new raw_signals")
print(f"New signal IDs: {[r['id'] for r in new_rows]}")

# ====== PROMOTE TO OPPORTUNITIES with scoring ======
# Scoring: Pain/5, Potential/5, Distribution/5, Monetization/5 (matching DB schema)
# Total = avg of 4 (same as existing DB)
new_opportunities = [
    # (pain_text_keyword, vertical, value_prop, complexity, potential, distribution, monetization)
    ("lack affordable legal review", "Legal Tech",
     "Freelance Contract Reviewer - AI legal-doc reviewer for freelancers signing client contracts ($19/mo)",
     5, 5, 4, 5),  # S:5, F:5, D:4, M:5

    ("visibility into WHY customers return", "E-commerce/Analytics",
     "Returns Intelligence - pattern analysis dashboard that shows WHY customers return items by store ($49/mo)",
     4, 4, 3, 4),  # S:4, F:4, D:3, M:4

    ("no simple analytics for engagement", "Community/Tools",
     "Discord Pulse - dead channel alerts, top contributors, engagement peaks dashboard ($19/mo)",
     5, 4, 4, 4),  # S:5, F:4, D:4, M:4

    ("milestone billing and partial payments", "Freelance/Finance",
     "Milestone Invoice Pro - project-based invoicing with milestone billing and auto-reminders ($19/mo)",
     4, 4, 5, 5),  # S:4, F:4, D:5, M:5

    ("thumbnail A/B testing tool", "Creator/Tools",
     "ThumbTest - YouTube thumbnail A/B tester with CTR reporting ($15/mo)",
     5, 4, 4, 4),  # S:5, F:4, D:4, M:4

    ("third-party APIs change their schemas", "Developer/Tools",
     "API Sentinel - monitor third-party API docs for breaking schema/endpoint changes ($19/mo)",
     4, 4, 3, 4),  # S:4, F:4, D:3, M:4

    ("white-label client portal for status/files/invoices", "Agency/Tools",
     "AgencyVault - white-label client portal aggregating project status, files, and invoices ($29/mo)",
     4, 4, 5, 4),  # S:4, F:4, D:5, M:4

    ("supplier price drop", "E-commerce/Supply Chain",
     "Supplier Watch - automated Alibaba/Thomasnet price drop and new supplier alerts ($39/mo)",
     4, 4, 3, 4),  # S:4, F:4, D:3, M:4

    ("fake/manipulated product reviews", "E-commerce/Tools",
     "ReviewSanity - browser extension flagging fake Amazon reviews using purchase history patterns ($free/$4.99 premium)",
     4, 3, 5, 3),  # S:4, F:3, D:5, M:3

    ("manual show notes and SEO blog posts", "Creator/Tools",
     "PodNotes AI - auto-generates show notes, timestamps, and SEO blog posts from podcast audio ($19/mo)",
     4, 4, 4, 4),  # S:4, F:4, D:4, M:4
]

imported_opps = 0
for opp in new_opportunities:
    pain_keyword, vertical, value_prop, complexity, potential, distribution, monetization = opp
    sig_id = None
    for text, sid in new_signal_map.items():
        if pain_keyword.lower() in text.lower():
            sig_id = sid
            break
    if sig_id:
        conn.execute(
            "INSERT INTO saas_opportunities (signal_id, vertical, value_proposition, complexity_score, potential_score, distribution_score, monetization_score, status) VALUES (?,?,?,?,?,?,?,'validated')",
            (sig_id, vertical, value_prop, complexity, potential, distribution, monetization)
        )
        conn.execute("UPDATE raw_signals SET processed=1 WHERE id=?", (sig_id,))
        imported_opps += 1
        print(f"  ✓ Promoted: {value_prop[:60]}...")
    else:
        print(f"  ✗ No match for: {pain_keyword}")

conn.commit()

# Summary
sig_total = conn.execute("SELECT COUNT(*) FROM raw_signals").fetchone()[0]
opp_total = conn.execute("SELECT COUNT(*) FROM saas_opportunities").fetchone()[0]
print(f"\n{'='*60}")
print(f"DB now has {sig_total} raw_signals and {opp_total} opportunities")
print(f"{imported_opps} new opportunities promoted this round")

# Print ranked table
print(f"\n{'='*60}")
print("ALL OPPORTUNITIES - RANKED BY TOTAL SCORE")
print(f"{'='*60}")
print(f"{'Rk':>3} {'Vertical':<25} {'Concept':<55} {'Score':>6}")
print(f"{'-'*89}")
rows = conn.execute("""
    SELECT id, vertical, value_proposition, total_score
    FROM saas_opportunities
    ORDER BY total_score DESC
""").fetchall()
for i, r in enumerate(rows, 1):
    print(f"{i:>3} {r['vertical']:<25} {r['value_proposition'][:55]:<55} {r['total_score']:>5.2f}")

conn.close()
