# MIT License
# Copyright (c) 2025 aiquniq
# See LICENSE file in the project root for full license text.

import os, html, zipfile, io, csv, json, shutil, datetime as dt
import streamlit as st

from utils.text import slugify, timestamp
from storage import db as dbmod

# Optional (for "Recent collected" panel)
try:
    import pandas as pd
except Exception:
    pd = None

# Page configuration and header
st.set_page_config(page_title="ROTmap", page_icon="🍇", layout="wide")
st.title("aiquniq ROTmap")
st.caption("Low friction data collector for research")

# Per-session defaults. The dict literal is rebuilt on every script run, so
# each session gets its own fresh list objects; keys already present in the
# session are left untouched by setdefault().
_SESSION_DEFAULTS = {
    "jobs_queue": [],        # queued job dicts
    "stop_queue": False,     # set by the Stop button
    "log_lines": [],         # messages shown in the log box
    "staged": [],            # last staged batch (any source)
    "staged_source": None,   # source name for staged batch
    "last_label": None,      # label for staged batch
    "ads_token": "",         # keep token across reruns
}
for _state_key, _state_default in _SESSION_DEFAULTS.items():
    st.session_state.setdefault(_state_key, _state_default)

# Hide Streamlit's top toolbar, main menu and footer
_HIDE_CHROME_CSS = """
<style>
div[data-testid="stToolbar"] { display: none !important; }
#MainMenu { visibility: hidden; }   /* hide "⋮" menu */
footer { visibility: hidden; }      /* optional: hide footer */
</style>
"""
st.markdown(_HIDE_CHROME_CSS, unsafe_allow_html=True)

# Logging helpers (small, visible)
# Placeholder element; _render_log() rewrites its content via log_box.markdown()
# each time it is called.
log_box = st.empty()

def _render_log():
    """Redraw the log box with the six most recent log lines.

    The lines are read from ``st.session_state["log_lines"]``, joined with
    newlines and HTML-escaped before being embedded in the markup, because the
    box is rendered with ``unsafe_allow_html=True``.
    """
    recent_lines = st.session_state["log_lines"][-6:]
    escaped = html.escape("\n".join(recent_lines))
    markup = f"""
        <div style="height:120px; overflow:auto; background:#0E1117; color:#EAEAEA;
                    border:1px solid #424242; border-radius:10px; padding:12px;">
          <pre style="margin:0; white-space:pre-wrap;
                      font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, 'Liberation Mono','Courier New', monospace;">{escaped}</pre>
        </div>
        """
    log_box.markdown(markup, unsafe_allow_html=True)

def push(msg: str):
    """Append ``msg`` (coerced to ``str``) to the session log and redraw the log box."""
    session_log = st.session_state["log_lines"]
    session_log.append(str(msg))
    _render_log()

# Draw the log box once at script start, before any message is pushed in this run.
_render_log()

# Export helper
def do_export(staged_rows, source_label, ua_string, label_for_name):
    """Download the PDFs of staged records and bundle them into a zip under ``exports/``.

    Each record with a ``pdf_url`` is fetched with ``stream_download`` into a
    temporary directory below ``data/tmp_export``; a ``manifest.json`` (always)
    and a ``manifest.csv`` (only when at least one PDF was saved) are written
    next to the files, everything is zipped, and the temporary directory is
    removed again.

    Args:
        staged_rows: iterable of dict-like records (read via ``.get``); records
            without ``pdf_url`` are skipped.
        source_label: source name used as archive-name prefix (lower-cased,
            spaces replaced by ``-``); falls back to ``"source"``.
        ua_string: value sent as the ``User-Agent`` header for downloads.
        label_for_name: human label slugified into the archive name; falls
            back to ``"job"``.

    Returns:
        ``(saved, export_zip)`` - number of PDFs saved and the archive path,
        or ``(0, None)`` when ``staged_rows`` is empty.
    """
    if not staged_rows:
        st.warning("No staged items to export.")
        return 0, None

    # Imported lazily so the app can start without these being importable.
    import requests
    from storage.files import stream_download

    export_root = "exports"
    os.makedirs(export_root, exist_ok=True)
    base_label = slugify(label_for_name or "job")
    source_slug = (source_label or "source").lower().replace(" ", "-")
    export_name = f"{source_slug}_{base_label}_{timestamp()}"
    export_zip = os.path.join(export_root, f"{export_name}.zip")

    tmp_dir = os.path.join("data", "tmp_export", export_name)
    files_dir = os.path.join(tmp_dir, "files")
    os.makedirs(files_dir, exist_ok=True)

    saved = 0
    manifest_rows = []

    try:
        with st.status("Downloading PDFs for export…", expanded=True) as status:
            # The session is only needed for the downloads; the context manager
            # closes its connections as soon as the loop is done (or fails).
            with requests.Session() as sess:
                sess.headers.update({"User-Agent": ua_string})
                for i, rec in enumerate(staged_rows, 1):
                    pdf_url = rec.get("pdf_url")
                    if not pdf_url:
                        continue
                    # str() guards against a non-string id being used as the title.
                    title = str(rec.get("title") or rec.get("id") or f"paper_{i}")[:120]
                    # A title made only of characters the slugifier drops would
                    # otherwise yield an empty file name.
                    base = slugify(title) or f"paper_{i}"
                    path = stream_download(pdf_url, files_dir, base, sess, require_pdf=True)
                    if not path:
                        continue
                    saved += 1
                    manifest_rows.append({
                        "source": rec.get("source"),
                        "id": rec.get("id"),
                        "title": rec.get("title"),
                        "authors": rec.get("authors"),
                        "year": rec.get("publication_year"),
                        "pdf_url": rec.get("pdf_url"),
                        "html_url": rec.get("html_url"),
                        "file_path": os.path.relpath(path, start=tmp_dir),
                        "search_keyword": rec.get("search_keyword"),
                    })
                    if saved % 25 == 0:
                        status.update(label=f"Downloaded {saved} PDFs…")

            with open(os.path.join(tmp_dir, "manifest.json"), "w", encoding="utf-8") as f:
                json.dump(manifest_rows, f, ensure_ascii=False, indent=2)
            if manifest_rows:
                with open(os.path.join(tmp_dir, "manifest.csv"), "w", encoding="utf-8", newline="") as f:
                    w = csv.DictWriter(f, fieldnames=list(manifest_rows[0].keys()))
                    w.writeheader()
                    w.writerows(manifest_rows)

            # Archive paths are relative to tmp_dir's parent, so every entry is
            # stored under a top-level "<export_name>/" folder.
            archive_root = os.path.dirname(tmp_dir)
            with zipfile.ZipFile(export_zip, "w", compression=zipfile.ZIP_DEFLATED) as zf:
                for root, _, files in os.walk(tmp_dir):
                    for fn in files:
                        full = os.path.join(root, fn)
                        zf.write(full, arcname=os.path.relpath(full, start=archive_root))

            status.update(label=f"Export created: {export_zip}", state="complete")
    finally:
        # Always drop the scratch directory, including when a download or the
        # zip step raised.
        shutil.rmtree(tmp_dir, ignore_errors=True)

    st.success(f"Exported {saved} PDFs to `{export_zip}`")
    return saved, export_zip

# sidebar
# Builds the settings sidebar. The widget values assigned here (data_source,
# user_agent, contact_email, ua, pause, ads_token, run_mode, search_keyword,
# topic_query, crawl_depth, files_per_run, year_from, year_to and the ads_*
# options) are read by the runner and result panels further down the script.
with st.sidebar:
    st.header("Settings")

    data_source = st.selectbox(
        "Data source",
        ["OpenAlex", "NASA ADS", "arXiv"],
        index=0
    )

    user_agent = st.text_input("User-Agent", value="aiquniq/ROTmap_open/v1.1")
    # Normalise once: a whitespace-only entry must count as "no email" in every
    # later `if not contact_email` check, not only when building the UA string.
    contact_email = st.text_input("Contact Email", value="", placeholder="you@example.com").strip()
    ua = f"{user_agent.strip()} ({contact_email})" if contact_email else user_agent.strip()

    # Polite pacing
    if data_source == "arXiv":
        pause = st.number_input("Request pause", min_value=0.3, max_value=1.0, value=0.3, step=0.1)
    else:
        pause = st.number_input("Request pause", min_value=0.2, max_value=1.5, value=0.2, step=0.1)

    # ADS token
    if data_source == "NASA ADS":
        ads_token = st.text_input(
            "ADS API token",
            value=st.session_state.ads_token,
            type="password",
            help="Paste your NASA/SAO ADS API token here. Stored only in session memory."
        )
        st.session_state.ads_token = ads_token
    else:
        ads_token = None

    st.markdown("---")
    st.subheader("Presets")
    try:
        from utils.presets import PRESETS
    except Exception:
        # Built-in presets used when the project's preset module cannot be imported.
        PRESETS = {
            "Quick":    {"crawl_depth": 1, "files_per_run": 100},
            "Standard": {"crawl_depth": 1, "files_per_run": 500},
            "Deep":     {"crawl_depth": 2, "files_per_run": 2500},
        }
    preset_choice = st.selectbox("Crawler preset", options=["Custom", "Quick", "Standard", "Deep"], index=1)

    if preset_choice != "Custom":
        crawl_depth_default = PRESETS[preset_choice]["crawl_depth"]
        files_per_run_default = PRESETS[preset_choice]["files_per_run"]
    else:
        crawl_depth_default = 1
        files_per_run_default = 250

    run_mode = st.radio(
        "Run mode",
        ["Stage only", "Save now"],
        # Every selectable data source defaults to "Stage only".
        index=0,
        help="Stage only: collect metadata with links now and export later. Save now: download PDFs and write DB/files during the run."
    )

    st.markdown("---")
    st.subheader("Run parameters")
    search_keyword = st.text_input("Keyword search (title/abstract/etc.)", value="machine learning")
    topic_query = st.text_input("Topic (concept lookup)", value="artificial intelligence")

    # ADS parameters (only defined while NASA ADS is the selected source)
    if data_source == "NASA ADS":
        ads_databases = st.multiselect("Database(s)", options=["astronomy", "physics"], default=["astronomy"])
        ads_doctypes  = st.multiselect("Doctype", options=["article","eprint","inproceedings"], default=["article","eprint"])
        ads_bibstem   = st.text_input("Journal bibstem(s)", value="", help="Comma-separated, e.g., ApJ,MNRAS,AJ")
        ads_extra_q   = st.text_input("Advanced ADS query (optional)", value="")
        colad1, colad2 = st.columns(2)
        with colad1:
            ads_refereed = st.checkbox("Refereed only", value=True)
        with colad2:
            ads_openaccess = st.checkbox("Open access", value=False)

    crawl_depth  = st.number_input("Crawl depth", min_value=1, max_value=4, value=crawl_depth_default, step=1)
    files_per_run = st.number_input("Range", min_value=1, max_value=50000, value=files_per_run_default, step=10)

    coly1, coly2 = st.columns(2)
    with coly1:
        year_from = st.number_input("From year", min_value=1800, max_value=2100, value=2015, step=1)
    with coly2:
        year_to = st.number_input("To year", min_value=1800, max_value=2100, value=2025, step=1)

    # Snapshot of the current settings; appended to the queue by "Add to Queue".
    base_job = {
        "source": data_source,
        "label": (search_keyword or topic_query or "job").strip(),
        "keyword": (search_keyword or "").strip(),
        "topic": (topic_query or "").strip(),
        "depth": int(crawl_depth),
        "files": int(files_per_run),
        "year_from": int(year_from) if year_from else None,
        "year_to": int(year_to) if year_to else None,
        "pause": float(pause),
        "run_mode": run_mode,
    }

    if data_source == "NASA ADS":
        base_job.update({
            "ads_token": st.session_state.ads_token.strip(),
            "ads_databases": ads_databases,
            "ads_refereed": ads_refereed,
            "ads_openaccess": ads_openaccess,
            "ads_bibstem": [s.strip() for s in ads_bibstem.split(',') if s.strip()] if ads_bibstem else None,
            "ads_doctypes": ads_doctypes,
            "ads_extra_q": (ads_extra_q.strip() or None),
        })

    if st.button("Add to Queue", use_container_width=True):
        if base_job["source"] == "OpenAlex" and not contact_email:
            st.error("OpenAlex requires a contact email.")
        elif base_job["source"] == "NASA ADS" and not base_job.get("ads_token"):
            st.error("NASA ADS requires an API token.")
        elif base_job["source"] == "arXiv" and not contact_email:
            st.error("Please provide a contact email.")
        else:
            st.session_state["jobs_queue"].append(base_job)
            st.success(f"Queued: [{base_job['source']}] {base_job['label']}")

    st.markdown("---")
    if st.session_state["jobs_queue"]:
        # The buttons below wipe or export the DB, so they are hidden while jobs are queued.
        st.text("Database controls are disabled.")
    else:
        st.subheader("Database controls")

        if st.button("💤 Research done", use_container_width=True, help="Deletes the main environment data."):
            dbmod.wipe_database()
            st.success("Main database and files wiped.")


        if st.button("💿 Export data", use_container_width=True, help="Exports all downloaded data then wipes the main branch."):
            dbmod.ensure_dirs()
            # Fall back to the topic (or "job") so an empty keyword field does
            # not produce an archive name that starts with "_".
            export_name = f"{slugify(search_keyword or topic_query or 'job')}_{timestamp()}"
            export_zip = os.path.join("exports", f"{export_name}.zip")
            with zipfile.ZipFile(export_zip, "w", compression=zipfile.ZIP_DEFLATED) as zf:
                if os.path.exists("data/main.db"):
                    zf.write("data/main.db", arcname=f"{export_name}/main.db")
                files_dir = "data/files"
                if os.path.isdir(files_dir):
                    for root, dirs, files in os.walk(files_dir):
                        for f in files:
                            p = os.path.join(root, f)
                            rel = os.path.relpath(p, start="data")
                            zf.write(p, arcname=f"{export_name}/{rel}")
            # Only reached when the archive was written without raising.
            dbmod.wipe_database()
            st.success(f"Exported and wiped. Archive created at: `{export_zip}`")


        if st.button("💿 Download staged Items", use_container_width=True, help="Downloads and exports all staged data."):
            staged = st.session_state.get("staged") or []
            source = (st.session_state.get("staged_source") or "source")
            label = st.session_state.get("last_label") or (search_keyword or topic_query)
            if not staged:
                st.warning("No staged items to export. Run Collect first.")
            else:
                do_export(staged, source, ua, label)

# Queue controls
def _export_db_and_wipe(export_base):
    """Zip the main DB and downloaded files, then wipe them.

    Writes ``exports/<export_base>.zip`` containing ``data/main.db`` (if it
    exists) and everything below ``data/files`` (if it exists), all stored
    under a top-level ``<export_base>/`` folder, then calls
    ``dbmod.wipe_database()``. Returns the archive path.
    """
    dbmod.ensure_dirs()
    archive_path = os.path.join("exports", f"{export_base}.zip")
    with zipfile.ZipFile(archive_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        if os.path.exists("data/main.db"):
            zf.write("data/main.db", arcname=f"{export_base}/main.db")
        downloaded_dir = "data/files"
        if os.path.isdir(downloaded_dir):
            for root, _dirs, names in os.walk(downloaded_dir):
                for name in names:
                    full_path = os.path.join(root, name)
                    rel_path = os.path.relpath(full_path, start="data")
                    zf.write(full_path, arcname=f"{export_base}/{rel_path}")
    # Only reached when the archive was written without raising.
    dbmod.wipe_database()
    return archive_path


def _publish_staged(staged_rows, source_name, label):
    """Store a staged batch in session state (read by the staged panel and the export button)."""
    st.session_state.staged = staged_rows
    st.session_state.staged_source = source_name
    st.session_state.last_label = label


st.markdown("---")
st.subheader("Runner")

if st.session_state["jobs_queue"]:
    # Render every row first and remove afterwards: popping from the list while
    # enumerating it would skip the row that follows the removed job.
    remove_at = None
    for i, job in enumerate(st.session_state["jobs_queue"], start=1):
        qc1, qc2, qc3, qc4, qc5 = st.columns([3,4,1,2,2])
        qc1.markdown(f"**{i}. [{job['source']}] {job['label']}**")
        yc = f"{job['year_from'] or '—'}–{job['year_to'] or '—'}"
        qc2.caption(f"KW: {job['keyword'] or '—'} | Topic: {job['topic'] or '—'} | Years: {yc} | Mode: {job['run_mode']}")
        qc3.caption(f"Depth: {job['depth']}")
        qc4.caption(f"Files/run: {job['files']}")
        if qc5.button("🗑 Remove", key=f"rm_job_{i}"):
            remove_at = i - 1
    if remove_at is not None:
        st.session_state["jobs_queue"].pop(remove_at)
        # Redraw so the removed row disappears immediately. The lookup keeps
        # Streamlit builds without st.rerun working (they just skip the redraw).
        _rerun = getattr(st, "rerun", None) or getattr(st, "experimental_rerun", None)
        if _rerun is not None:
            _rerun()

    run_cols = st.columns([1,1,1])
    with run_cols[0]:
        run_queue = st.button("▶ Run Queue", use_container_width=True)
    with run_cols[1]:
        clear_queue = st.button("Clear Queue", use_container_width=True)
    with run_cols[2]:
        if st.button("Stop", use_container_width=True):
            st.session_state["stop_queue"] = True
            st.info("Will stop after the current job.")
    if clear_queue:
        st.session_state["jobs_queue"] = []
        st.success("Queue cleared.")

    if run_queue and st.session_state["jobs_queue"]:

        from connect.alex.client import OpenAlexClient
        from connect.alex.crawler_index import crawl_openalex_index_only
        from connect.alex.crawler import crawl_and_collect as crawl_openalex_save

        from connect.ads.client import ADSClient
        from connect.ads.crawler import crawl_ads_index_only, crawl_and_collect_ads

        from connect.arxiv.client import ArxivClient
        from connect.arxiv.crawler import crawl_arxiv_index_only
        from connect.arxiv.crawler_save import crawl_and_collect_arxiv_save

        total = len(st.session_state["jobs_queue"])
        # Iterate over a copy: finished jobs are popped from the live queue below.
        for idx, job in enumerate(list(st.session_state["jobs_queue"])):
            push(f"Job {idx+1}/{total} [{job['source']}] — {job['label']}")

            try:
                save_now = job["run_mode"].startswith("Save now")
                # Keyword arguments passed to every crawler, whatever the source.
                common = dict(
                    search_keyword=job["keyword"] or None,
                    crawl_depth=int(job["depth"]),
                    files_per_run=int(job["files"]),
                    year_from=job["year_from"],
                    year_to=job["year_to"],
                    progress=push,
                )

                if job["source"] == "OpenAlex":
                    if not contact_email:
                        st.error("OpenAlex job missing contact email.")
                        # Leaves this job (and the rest) in the queue.
                        break
                    client = OpenAlexClient(user_agent=user_agent, contact_email=contact_email, pause=job["pause"])
                    topic = job["topic"] or None

                    if save_now:
                        saved = crawl_openalex_save(client=client, topic_query=topic, require_pdf=True, **common)
                        push(f"OpenAlex saved {saved} PDFs — exporting DB/files + wipe")
                        export_zip = _export_db_and_wipe(f"openalex_{slugify(job['label'])}_{timestamp()}")
                        st.success(f"Exported job {idx+1}/{total}: {export_zip} and wiped DB/files.")
                    else:
                        staged = crawl_openalex_index_only(client=client, topic_query=topic, **common)
                        _publish_staged(staged, "OpenAlex", job["label"])
                        push(f"OpenAlex staged {len(staged)} — exporting staged zip")
                        do_export(staged, "OpenAlex", ua, job["label"])

                elif job["source"] == "NASA ADS":
                    token = job.get("ads_token")
                    if not token:
                        st.error("ADS job missing token.")
                        break
                    client = ADSClient(api_token=token, user_agent=user_agent, pause=job["pause"])
                    # ADS-specific filters captured when the job was queued.
                    ads_filters = dict(
                        databases=job.get("ads_databases"),
                        refereed_only=job.get("ads_refereed", False),
                        open_access_only=job.get("ads_openaccess", False),
                        bibstems=job.get("ads_bibstem"),
                        doctypes=job.get("ads_doctypes"),
                        extra_query=job.get("ads_extra_q"),
                    )

                    if save_now:
                        saved = crawl_and_collect_ads(client=client, **common, **ads_filters)
                        push(f"ADS saved {saved} PDFs — exporting DB/files + wipe")
                        export_zip = _export_db_and_wipe(f"nasa-ads_{slugify(job['label'])}_{timestamp()}")
                        st.success(f"Exported job {idx+1}/{total}: {export_zip} and wiped DB/files.")
                    else:
                        staged = crawl_ads_index_only(client=client, **common, **ads_filters)
                        _publish_staged(staged, "NASA ADS", job["label"])
                        push(f"ADS staged {len(staged)} — exporting staged zip")
                        do_export(staged, "NASA ADS", ua, job["label"])

                else:
                    # Any other source is treated as arXiv (the only remaining option).
                    if not contact_email:
                        st.error("arXiv job missing contact email.")
                        break

                    client = ArxivClient(user_agent=ua, pause=job["pause"])
                    categories = job["topic"] or None

                    if save_now:
                        saved = crawl_and_collect_arxiv_save(client=client, categories=categories, **common)
                        push(f"arXiv saved {saved} PDFs — exporting DB/files + wipe")
                        export_zip = _export_db_and_wipe(f"arxiv_{slugify(job['label'])}_{timestamp()}")
                        st.success(f"Exported job {idx+1}/{total}: {export_zip} and wiped DB/files.")
                    else:
                        staged = crawl_arxiv_index_only(client=client, categories=categories, **common)
                        _publish_staged(staged, "arXiv", job["label"])
                        push(f"arXiv staged {len(staged)} — exporting staged zip")
                        do_export(staged, "arXiv", ua, job["label"])

            except Exception as e:
                # One failing job must not abort the rest of the queue.
                st.error(f"Job failed ({job['label']}): {e}")

            # The job just handled (successfully or not) is the head of the live queue.
            if st.session_state["jobs_queue"]:
                st.session_state["jobs_queue"].pop(0)

            if st.session_state.get("stop_queue"):
                push("Stop requested — ending queue after this job.")
                st.session_state["stop_queue"] = False
                break

        push("Queue finished.")
        st.toast("Queue complete.", icon="✅")
else:
    # Empty queue: run a single collection straight from the sidebar settings.
    start_clicked = st.button("🚀 Start collection", use_container_width=True)
    if start_clicked:
        dbmod.ensure_dirs()

        save_now = run_mode.startswith("Save now")
        run_label = (search_keyword or topic_query or "job").strip()
        # Keyword arguments passed to every crawler, whatever the source.
        common = dict(
            search_keyword=search_keyword.strip() or None,
            crawl_depth=int(crawl_depth),
            files_per_run=int(files_per_run),
            year_from=int(year_from) if year_from else None,
            year_to=int(year_to) if year_to else None,
            progress=push,
        )

        if data_source == "OpenAlex":
            if not contact_email:
                st.error("Please provide a contact email to comply with OpenAlex policy.")
            else:
                from connect.alex.client import OpenAlexClient
                from connect.alex.crawler_index import crawl_openalex_index_only
                from connect.alex.crawler import crawl_and_collect as crawl_openalex_save

                client = OpenAlexClient(user_agent=user_agent, contact_email=contact_email, pause=pause)
                topic = topic_query.strip() or None

                if save_now:
                    with st.status("Collecting full-text PDFs (OpenAlex save-now)…", expanded=True) as status:
                        saved = crawl_openalex_save(client=client, topic_query=topic, require_pdf=True, **common)
                        status.update(label=f"Done. Saved {saved} PDFs (OpenAlex).", state="complete")
                    st.toast(f"Saved {saved} PDFs (OpenAlex).", icon="✅")
                else:
                    with st.status("Collecting (staging only) from OpenAlex…", expanded=True) as status:
                        staged = crawl_openalex_index_only(client=client, topic_query=topic, **common)
                        _publish_staged(staged, "OpenAlex", run_label)
                        status.update(label=f"Staged {len(staged)} OpenAlex items. Ready to export.", state="complete")
                    st.toast(f"Staged {len(staged)} items (OpenAlex).", icon="📦")

        elif data_source == "NASA ADS":
            if not ads_token:
                st.error("Please provide an ADS API token to use the NASA ADS connector.")
            else:
                from connect.ads.client import ADSClient
                from connect.ads.crawler import crawl_ads_index_only, crawl_and_collect_ads

                ads_client = ADSClient(api_token=ads_token, user_agent=user_agent, pause=pause)
                # The ads_* widgets are created by the sidebar whenever NASA ADS
                # is the selected source, so they are always defined in this branch.
                ads_filters = dict(
                    databases=ads_databases,
                    refereed_only=ads_refereed,
                    open_access_only=ads_openaccess,
                    bibstems=[s.strip() for s in ads_bibstem.split(',') if s.strip()] if ads_bibstem else None,
                    doctypes=ads_doctypes,
                    extra_query=(ads_extra_q.strip() or None),
                )

                if save_now:
                    with st.status("Collecting full-text PDFs (ADS save-now)…", expanded=True) as status:
                        saved = crawl_and_collect_ads(client=ads_client, **common, **ads_filters)
                        status.update(label=f"Done. Saved {saved} PDFs (ADS).", state="complete")
                    st.toast(f"Saved {saved} PDFs (ADS).", icon="✅")
                else:
                    with st.status("Collecting (staging only) from NASA ADS…", expanded=True) as status:
                        staged = crawl_ads_index_only(client=ads_client, **common, **ads_filters)
                        _publish_staged(staged, "NASA ADS", run_label)
                        status.update(label=f"Staged {len(staged)} ADS items. Ready to export.", state="complete")
                    st.toast(f"Staged {len(staged)} items (ADS).", icon="📦")

        else:
            if not contact_email:
                st.error("Please provide a contact email")
            else:
                from connect.arxiv.client import ArxivClient
                from connect.arxiv.crawler import crawl_arxiv_index_only
                from connect.arxiv.crawler_save import crawl_and_collect_arxiv_save

                arxiv_client = ArxivClient(user_agent=ua, pause=pause)
                categories = topic_query.strip() or None

                if save_now:
                    with st.status("Collecting full-text PDFs (arXiv save-now)…", expanded=True) as status:
                        saved = crawl_and_collect_arxiv_save(client=arxiv_client, categories=categories, **common)
                        status.update(label=f"Done. Saved {saved} PDFs (arXiv).", state="complete")
                    st.toast(f"Saved {saved} PDFs (arXiv).", icon="✅")
                else:
                    with st.status("Collecting (staging only) from arXiv…", expanded=True) as status:
                        staged = crawl_arxiv_index_only(client=arxiv_client, categories=categories, **common)
                        _publish_staged(staged, "arXiv", run_label)
                        status.update(label=f"Staged {len(staged)} arXiv items. Ready to export.", state="complete")
                    st.toast(f"Staged {len(staged)} items (arXiv).", icon="📦")

# Results panel: recent DB rows in "Save now" mode, otherwise the staged batch.
st.markdown("---")
if run_mode.startswith("Save now"):
    try:
        conn = dbmod.connect()
        try:
            rows = list(dbmod.iter_recent(conn, limit=200))
        finally:
            # Release the connection even when the query raises.
            conn.close()
        if rows:
            hc1, hc2, hc3 = st.columns([8,2,2])
            hc1.markdown("**Title**")
            hc2.markdown("**Year**")
            hc3.markdown("**Open**")
            for r in rows:
                c1, c2, c3 = st.columns([8,2,2])
                title = r.get("title") or "(untitled)"
                year = r.get("publication_year") or "—"
                pdf_url = r.get("pdf_url")
                html_url = r.get("html_url")
                openalex_id = r.get("id")
                c1.markdown(f"{title}")
                c2.markdown(f"{year}")
                # The record id is only used as a link when it is itself an https URL.
                url = pdf_url or html_url or (openalex_id if isinstance(openalex_id, str) and openalex_id.startswith("https://") else None)
                if url:
                    try:
                        c3.link_button("Open", url)
                    except Exception:
                        # Fall back to a plain markdown link if link_button fails.
                        c3.markdown(f"[Open]({url})")
                else:
                    c3.write("—")
        else:
            st.info("No data yet. Run a collection first.")
    except Exception as e:
        st.error(f"Error reading database: {e}")
else:
    _staged = st.session_state.get("staged") or []
    if _staged:
        src = st.session_state.get("staged_source") or "—"
        lbl = st.session_state.get("last_label") or "job"
        st.caption(f"{len(_staged)} item(s) staged — [{src}] {lbl}")

        sc1, sc2, sc3, sc4, sc5 = st.columns([3,1,5,0.5,2])
        sc1.markdown("**Title**")
        sc2.markdown("**Year**")
        sc3.markdown("**Will save as**")
        sc4.markdown("**PDF?**")
        sc5.markdown("**Open**")

        for i, r in enumerate(_staged, 1):
            c1, c2, c3, c4, c5 = st.columns([3,1,5,0.5,2])

            title = (r.get("title") or r.get("id") or f"paper_{i}").strip()
            year = r.get("publication_year") or "—"
            pdf_url = r.get("pdf_url")
            html_url = r.get("html_url")

            base = slugify(title)[:120] or f"paper_{i}"
            will_name = (base + ".pdf") if pdf_url else "—"

            c1.markdown(title)
            c2.markdown(str(year))
            c3.code(will_name)
            c4.markdown("✅" if pdf_url else "—")

            # As in the save-now panel, fall back to the record id only when it
            # is a URL; a bare identifier would render as a broken link.
            record_id = r.get("id")
            id_url = record_id if isinstance(record_id, str) and record_id.startswith(("http://", "https://")) else None
            url = pdf_url or html_url or id_url
            if isinstance(url, str) and url:
                try:
                    c5.link_button("Open", url)
                except Exception:
                    # Fall back to a plain markdown link if link_button fails.
                    c5.markdown(f"[Open]({url})")
            else:
                c5.write("—")
    else:
        st.info("Nothing to show yet. Run a collection first.")

st.markdown("---")
st.caption("Global run mode controls whether we stage only or save now. Queue respects per-job mode and pause.")