#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os, re, hashlib, queue
import requests, pymysql
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup

# ================== CONFIG ==================
DB = dict(host="127.0.0.1", user="root", password="152535ff", database="facial", charset="utf8mb4")
SEAPE_BASE     = "https://seape.df.gov.br"
SEAPE_START    = "https://seape.df.gov.br/foragidos/"
SEAPE_DIR      = "/var/www/html/facial/fugitivos/seape_df"

INSTA_PROFILE  = "portal.dos.procurados"
INSTA_DIR      = "/var/www/html/facial/fugitivos/instagram"
ENABLE_INSTAGRAM = True     # set to False to skip the Instagram collector

UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0 Safari/537.36"
TIMEOUT = 25
MAX_PAGES = 2000           # cap on how many pages the BFS crawler will visit
# ============================================
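
# The destination tables (fugitivos_fontes, fugitivos_pessoas) are assumed to exist
# already. A minimal schema sketch that matches the columns this script reads and
# writes (an assumption, not necessarily the production DDL):
#
#   CREATE TABLE fugitivos_fontes (
#       id   INT AUTO_INCREMENT PRIMARY KEY,
#       slug VARCHAR(64) UNIQUE,
#       nome VARCHAR(255),
#       url  VARCHAR(512)
#   );
#
#   CREATE TABLE fugitivos_pessoas (
#       id          INT AUTO_INCREMENT PRIMARY KEY,
#       fonte_id    INT NOT NULL,
#       nome        VARCHAR(255),
#       crimes      TEXT,
#       url_origem  VARCHAR(512),
#       image_url   VARCHAR(512),
#       image_local VARCHAR(512),
#       image_md5   CHAR(32),
#       status      VARCHAR(32) DEFAULT 'novo',
#       updated_at  DATETIME NULL
#   );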

def db():
    # New MySQL connection with dict rows; commits are done explicitly by the callers.
    return pymysql.connect(**DB, cursorclass=pymysql.cursors.DictCursor, autocommit=False)

def md5_bytes(b): return hashlib.md5(b).hexdigest()

def norm_txt(t):
    # Collapse runs of whitespace and trim; tolerates None.
    return re.sub(r"\s+", " ", (t or "").strip())

def http_get(url, sess):
    return sess.get(url, timeout=TIMEOUT)
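
# Optional hardening, shown as a hedged sketch (not wired into the collectors below):
# a Session with urllib3 retries so transient 429/5xx responses are retried with
# exponential backoff. The helper name make_retry_session() is illustrative only.
def make_retry_session(total=3, backoff=0.5):
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry
    s = requests.Session()
    s.headers.update({"User-Agent": UA})
    retry = Retry(total=total, backoff_factor=backoff,
                  status_forcelist=[429, 500, 502, 503, 504])
    s.mount("https://", HTTPAdapter(max_retries=retry))
    s.mount("http://", HTTPAdapter(max_retries=retry))
    return s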

def ensure_source(conn, slug, name, url):
    # Return the id of the source row identified by slug, inserting it if missing.
    with conn.cursor() as cur:
        cur.execute("SELECT id FROM fugitivos_fontes WHERE slug=%s", (slug,))
        r = cur.fetchone()
        if r: return r["id"]
        cur.execute("INSERT INTO fugitivos_fontes (slug,nome,url) VALUES (%s,%s,%s)", (slug,name,url))
        conn.commit()
        return cur.lastrowid

def save_image_bytes(base_dir, url, b):
    # Store the image under a content-hash filename; return (md5, local path).
    os.makedirs(base_dir, exist_ok=True)
    h = md5_bytes(b)
    path = urlparse(url).path
    ext = os.path.splitext(path)[1].lower()
    if not ext or len(ext) > 5: ext = ".jpg"
    fn = f"{h}{ext}"
    full = os.path.join(base_dir, fn)
    if not os.path.exists(full):
        with open(full, "wb") as f: f.write(b)
    return h, full

def upsert_person(conn, fonte_id, nome, crimes, url_origem, image_url, image_local, image_md5):
    # Insert or update a person keyed by (fonte_id, nome, image_md5).
    nome = norm_txt(nome or "")
    crimes = norm_txt(crimes or "")
    with conn.cursor() as cur:
        # <=> is MySQL's NULL-safe equality, so a NULL image_md5 still matches.
        cur.execute("""SELECT id FROM fugitivos_pessoas
                       WHERE fonte_id=%s AND nome=%s AND (image_md5 <=> %s)""",
                    (fonte_id, nome, image_md5))
        r = cur.fetchone()
        if r:
            cur.execute("""UPDATE fugitivos_pessoas SET crimes=%s, url_origem=%s, image_url=%s, image_local=%s, updated_at=NOW()
                           WHERE id=%s""", (crimes, url_origem, image_url, image_local, r["id"]))
            conn.commit()
            return r["id"], False
        cur.execute("""INSERT INTO fugitivos_pessoas
          (fonte_id,nome,crimes,url_origem,image_url,image_local,image_md5,status)
          VALUES (%s,%s,%s,%s,%s,%s,%s,'novo')""",
          (fonte_id, nome, crimes, url_origem, image_url, image_local, image_md5))
        conn.commit()
        return cur.lastrowid, True

# -------- SEAPE: BFS crawler; collects ALL images with name heuristics ------------
def seape_collect_all(conn):
    sess = requests.Session(); sess.headers.update({"User-Agent": UA})
    fonte_id = ensure_source(conn, "seape_df", "SEAPE-DF Foragidos", SEAPE_START)

    seen = set()
    seen_imgs = set()          # image URLs already downloaded in this run
    q = queue.Queue()
    q.put(SEAPE_START); seen.add(SEAPE_START)
    pages = 0
    new_cnt = upd_cnt = img_cnt = 0

    while not q.empty() and pages < MAX_PAGES:
        url = q.get(); pages += 1
        try:
            r = http_get(url, sess); r.raise_for_status()
        except Exception:
            continue
        soup = BeautifulSoup(r.text, "lxml")

        # 1) collect img src / srcset (including lazy-load attributes)
        def all_img_urls(tag):
            urls = []
            src = tag.get("src") or tag.get("data-src") or tag.get("data-lazy-src")
            if src: urls.append(src)
            srcset = tag.get("srcset")
            if srcset:
                for part in srcset.split(","):
                    u = part.strip().split(" ")[0]
                    if u: urls.append(u)
            return [u for u in urls if u]

        # heuristic: pick a likely name near the image
        def extract_name_near(img_tag):
            # priority 1: alt attribute
            alt = norm_txt(img_tag.get("alt") or "")
            if len(alt.split()) >= 2:  # probably a full name
                return alt.title()
            # priority 2: nearby figcaption
            fig = img_tag.find_parent("figure")
            if fig:
                cap = fig.find("figcaption")
                if cap:
                    t = norm_txt(cap.get_text(" "))
                    if 6 <= len(t) <= 120: return t.title()
            # priority 3: walk up the DOM and look for a <p> starting with an uppercase name
            parent = img_tag
            for _ in range(3):
                parent = parent.parent
                if not parent: break
                for p in parent.find_all("p"):
                    txt = norm_txt(p.get_text(" "))
                    m = re.match(r"([A-ZÁÂÃÉÊÍÓÔÕÚÇ ]{6,})", txt.upper())
                    if m:
                        nm = norm_txt(m.group(1)).title()
                        return nm
            return "Desconhecido"

        for img in soup.select("img"):
            urls = all_img_urls(img)
            for u in urls:
                if u.startswith("data:"): continue
                absu = urljoin(url, u)
                if absu in seen_imgs: continue   # skip images already handled this run
                seen_imgs.add(absu)
                # download the image
                try:
                    br = http_get(absu, sess); br.raise_for_status()
                    image_md5, local = save_image_bytes(SEAPE_DIR, absu, br.content)
                    nome = extract_name_near(img)
                    crimes = ""  # if a nearby paragraph carried crime info, it could be concatenated here
                    _id, is_new = upsert_person(conn, fonte_id, nome, crimes, url, absu, local, image_md5)
                    new_cnt += 1 if is_new else 0
                    upd_cnt += 0 if is_new else 1
                    img_cnt += 1
                except Exception:
                    continue

        # 2) enqueue internal links under /foragidos/
        for a in soup.select("a[href]"):
            href = a.get("href")
            if not href: continue
            abs_link = urljoin(url, href)
            pr = urlparse(abs_link)
            if pr.netloc.endswith("seape.df.gov.br") and "/foragidos" in pr.path:
                if abs_link not in seen:
                    seen.add(abs_link); q.put(abs_link)

    print(f"SEAPE: paginas={pages}, imgs={img_cnt}, novos={new_cnt}, atualizados={upd_cnt}")

# -------- Instagram: download ALL posts via instaloader ---------------------------
def instagram_collect_all(conn):
    from instaloader import Instaloader, Profile
    fonte_id = ensure_source(conn, "instagram_portal", "Portal dos Procurados (Instagram)", f"https://www.instagram.com/{INSTA_PROFILE}/")

    # "{target}" is expanded by instaloader, so files land in INSTA_DIR/<profile>.
    # (With a bare INSTA_DIR as dirname_pattern they would land in INSTA_DIR itself
    # and the scan below would miss them.)
    L = Instaloader(dirname_pattern=os.path.join(INSTA_DIR, "{target}"),
                    filename_pattern="{date_utc}_post",
                    download_videos=False, download_video_thumbnails=False,
                    save_metadata=False, post_metadata_txt_pattern="")
    # If you have credentials, uncomment:
    # L.login(os.environ.get("IG_USERNAME",""), os.environ.get("IG_PASSWORD",""))

    profile = Profile.from_username(L.context, INSTA_PROFILE)
    count = new_cnt = upd_cnt = 0
    for post in profile.get_posts():
        count += 1
        # download the post (photos/thumbnails)
        L.download_post(post, target=INSTA_PROFILE)

    # scan the downloaded .jpg files once, after all posts were fetched
    folder = os.path.join(INSTA_DIR, INSTA_PROFILE)
    if os.path.isdir(folder):
        for fn in os.listdir(folder):
            if not fn.lower().endswith(".jpg"): continue
            full = os.path.join(folder, fn)
            try:
                with open(full, "rb") as f: b = f.read()
                h = md5_bytes(b)
                nome_guess = "Desconhecido"
                # optional heuristic: read the caption (JSON sidecar) to guess a name;
                # here we simply keep the nome_guess placeholder
                _id, is_new = upsert_person(conn, fonte_id, nome_guess, "", f"https://www.instagram.com/{INSTA_PROFILE}/", full, full, h)
                new_cnt += 1 if is_new else 0
                upd_cnt += 0 if is_new else 1
            except Exception:
                continue
    print(f"INSTAGRAM: posts={count}, new={new_cnt}, updated={upd_cnt}")

def main():
    conn = db()
    # make sure the output directories exist
    os.makedirs(SEAPE_DIR, exist_ok=True)
    os.makedirs(INSTA_DIR, exist_ok=True)

    seape_collect_all(conn)
    if ENABLE_INSTAGRAM:
        try:
            instagram_collect_all(conn)
        except Exception as e:
            print("Instagram coletor falhou (ignorado):", e)
    conn.close()

if __name__ == "__main__":
    main()
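
# Scheduling note (an assumption about deployment, not part of the original script):
# the collector could be run nightly from cron; the script path below is hypothetical.
#   0 3 * * *  /usr/bin/python3 /var/www/html/facial/fugitivos/coletor.py >> /var/log/fugitivos_coletor.log 2>&1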
