#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os, re, hashlib, time, json
import requests
import pymysql
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

DB = dict(host="127.0.0.1", user="root", password="152535ff", database="facial", charset="utf8mb4")

BASE_DIR = "/var/www/html/facial/fugitivos/seape_df"
SOURCE_SLUG = "seape_df"
SEAPE_URL = "https://seape.df.gov.br/foragidos/"
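
# Minimal schema sketch, inferred from the queries below; the column types and
# lengths shown here are assumptions, not taken from the actual database:
#
#   CREATE TABLE fugitivos_fontes (
#       id   INT AUTO_INCREMENT PRIMARY KEY,
#       slug VARCHAR(64) UNIQUE,
#       nome VARCHAR(255),
#       url  VARCHAR(512)
#   );
#
#   CREATE TABLE fugitivos_pessoas (
#       id          INT AUTO_INCREMENT PRIMARY KEY,
#       fonte_id    INT,
#       nome        VARCHAR(255),
#       crimes      TEXT,
#       url_origem  VARCHAR(512),
#       image_url   VARCHAR(512),
#       image_local VARCHAR(512),
#       image_md5   CHAR(32),
#       status      VARCHAR(32),
#       updated_at  DATETIME
#   );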

def db():
    return pymysql.connect(**DB, cursorclass=pymysql.cursors.DictCursor, autocommit=False)

def get_or_create_source(conn):
    with conn.cursor() as cur:
        cur.execute("SELECT id FROM fugitivos_fontes WHERE slug=%s", (SOURCE_SLUG,))
        r = cur.fetchone()
        if r: return r["id"]
        cur.execute("INSERT INTO fugitivos_fontes (slug,nome,url) VALUES (%s,%s,%s)",
                    (SOURCE_SLUG,"SEAPE-DF Foragidos",SEAPE_URL))
        conn.commit()
        return cur.lastrowid

def norm_txt(t):
    """Collapse whitespace runs and trim; tolerate None."""
    return re.sub(r'\s+', ' ', (t or '').strip())

def md5_bytes(b): return hashlib.md5(b).hexdigest()

def fetch_seape_html():
    r = requests.get(SEAPE_URL, timeout=20)
    r.raise_for_status()
    return r.text

def parse_cards(html):
    """
    The page shows cards with a name, sometimes with 'Prontuário/Cidade/ocorrência'
    details, and images.
    Robust strategy: grab every meaningful <img> plus the text near it.
    """
    soup = BeautifulSoup(html, "lxml")
    cards = []

    # grab blocks with figures (wp-block-image) and paragraph blocks
    imgs = soup.select("img")
    for img in imgs:
        src = img.get("src") or img.get("data-src") or ""
        alt = norm_txt(img.get("alt") or "")
        if not src or "wp-content/uploads" not in src:
            continue
        # heuristic: an all-caps alt text is usually the person's name
        nome = alt if len(alt.split())>=2 else None

        # walk up the DOM trying to capture nearby text
        txt = []
        parent = img
        for _ in range(3):
            parent = parent.parent
            if not parent: break
            txt.extend([norm_txt(t.get_text(" ")) for t in parent.find_all(["p","figcaption"])])
        blob = norm_txt(" | ".join([t for t in txt if t]))

        cards.append({"nome":nome, "blob":blob, "image_url":src})

    # fallback: scan <p> tags matching the "NOME\nProntuário:" pattern
    paras = soup.select("p")
    for p in paras:
        txt = norm_txt(p.get_text(" "))
        if "Prontuário" in txt or "SAIDÃO" in txt.upper():
            # try to extract the NAME at the beginning
            m = re.match(r"([A-ZÁÂÃÉÊÍÓÔÕÚÇ ]{6,})", txt)
            if m:
                nome = norm_txt(m.group(1)).title()
                cards.append({"nome":nome, "blob":txt, "image_url":None})

    # normalize names to Title Case
    final = []
    for c in cards:
        nome = (c["nome"] or "").strip()
        if nome and nome.upper()==nome:
            nome = nome.title()
        c["nome"] = nome
        final.append(c)
    return final

def save_image(url):
    os.makedirs(BASE_DIR, exist_ok=True)
    r = requests.get(url, timeout=25)
    r.raise_for_status()  # avoid persisting an HTML error page as an image
    b = r.content
    h = md5_bytes(b)
    ext = os.path.splitext(urlparse(url).path)[1] or ".jpg"
    fn = f"{h}{ext.lower()}"
    full = os.path.join(BASE_DIR, fn)
    with open(full, "wb") as f:
        f.write(b)
    return h, full

def upsert_fugitivo(conn, fonte_id, item):
    nome = norm_txt(item.get("nome") or "")
    crimes = norm_txt(item.get("blob") or "")
    img_url = item.get("image_url")
    img_md5, img_local = None, None
    if img_url:
        try:
            img_md5, img_local = save_image(img_url)
        except Exception:
            img_md5, img_local = None, None

    with conn.cursor() as cur:
        cur.execute("""
            SELECT id FROM fugitivos_pessoas 
            WHERE fonte_id=%s AND nome=%s AND (image_md5<=>%s)
        """, (fonte_id, nome, img_md5))
        r = cur.fetchone()
        if r:
            cur.execute("""UPDATE fugitivos_pessoas 
                           SET crimes=%s, url_origem=%s, image_url=%s, image_local=%s, updated_at=NOW()
                           WHERE id=%s""", (crimes, SEAPE_URL, img_url, img_local, r["id"]))
            return r["id"], False
        cur.execute("""INSERT INTO fugitivos_pessoas
            (fonte_id,nome,crimes,url_origem,image_url,image_local,image_md5,status)
            VALUES (%s,%s,%s,%s,%s,%s,%s,'novo')""",
            (fonte_id, nome, crimes, SEAPE_URL, img_url, img_local, img_md5))
        conn.commit()
        return cur.lastrowid, True

def main():
    conn = db()
    fonte_id = get_or_create_source(conn)
    html = fetch_seape_html()
    items = parse_cards(html)
    created = updated = 0
    for it in items:
        try:
            _id, is_new = upsert_fugitivo(conn, fonte_id, it)
            if is_new:
                created += 1
            else:
                updated += 1
        except Exception as e:
            print("ERR item:", it.get("nome"), e)
            continue
    conn.commit()
    print(f"SEAPE: {created} novos, {updated} atualizados")
    conn.close()

if __name__ == "__main__":
    main()
