#!/usr/bin/env python3
#
# @File: download-web-candidates.py
# @Date: 2026-06-02
#
# Downloads diverse outdoor car images from Bing image search (no API key needed).
# Designed to produce a dataset with:
#   - Exterior full-car shots (not interior, not close-ups)
#   - Natural outdoor backgrounds
#   - Background visible through windows/windshield
#
# After this, run filter-candidates.py to score and select the best images.
#
# Usage:
#   python3 tools/internal/download-web-candidates.py
#   python3 tools/internal/download-web-candidates.py --out candidates/web --per-query 40

import argparse
import shutil
import hashlib
from pathlib import Path
from icrawler.builtin import BingImageCrawler

# Default Bing search phrases, one per car model. main() falls back to this
# list when --queries is not given on the command line.
#
# Each phrase names a single make/model plus "parked"/"outdoor"-style keywords.
# The intent is to bias results toward single-car exterior shots rather than
# parking-lot aerials or multi-car lot images; the search engine gives no
# guarantee, which is why filter-candidates.py is run afterwards.
QUERIES = [
    "Toyota Camry parked dealership lot outdoor",
    "Honda Accord used car parked outdoor",
    "Toyota RAV4 parked street outdoor daylight",
    "Honda CR-V used parked outdoor lot",
    "Ford F-150 pickup parked outdoor driveway",
    "Chevrolet Silverado parked outdoor daylight",
    "Toyota Corolla parked outdoor lot daylight",
    "Honda Civic parked outdoor street",
    "Nissan Altima parked outdoor lot",
    "Hyundai Elantra parked outdoor daylight",
    "Jeep Grand Cherokee parked outdoor",
    "Ford Explorer parked outdoor lot",
    "BMW 3 series parked outdoor street",
    "Mercedes C-class parked outdoor daylight",
    "Chevrolet Equinox parked outdoor lot",
    "Toyota Highlander parked outdoor street",
    "Subaru Outback parked outdoor mountain",
    "Volkswagen Jetta parked outdoor lot",
    "Kia Sportage parked outdoor daylight",
    "Mazda CX-5 parked outdoor lot",
    "Dodge Charger parked outdoor lot",
    "Audi A4 parked outdoor street",
    "Jeep Wrangler parked outdoor",
    "Ford Mustang parked outdoor lot",
    "GMC Sierra parked outdoor daylight",
    "Hyundai Tucson parked outdoor lot",
    "Subaru Forester parked outdoor daylight",
    "Nissan Rogue parked outdoor lot",
    "Chevrolet Malibu parked outdoor",
    "Dodge Ram parked outdoor daylight",
]


def dedup_and_rename(src_dir: Path, dest_dir: Path, start_idx: int) -> int:
    """
    Copy images from src_dir into dest_dir with sequential names, skipping duplicates.

    Files are visited in sorted name order. Only regular files with a
    .jpg/.jpeg/.png/.webp suffix (case-insensitive) are considered. A file is a
    duplicate when the MD5 of its bytes matches a "*.jpg" file already in
    dest_dir or a file seen earlier in this call.

    JPEG sources (.jpg and .jpeg) are copied unchanged; other formats are
    re-encoded to JPEG with Pillow. A file that cannot be converted is reported
    and skipped without consuming an index.

    Args:
        src_dir:   Folder holding freshly downloaded images.
        dest_dir:  Existing output folder; files are written as "NNNN.jpg".
        start_idx: First index to try. Indices whose file already exists in
                   dest_dir are skipped, so existing images are never overwritten.

    Returns:
        The index following the last file written (start_idx if nothing was written).
    """
    jpeg_suffixes = {".jpg", ".jpeg"}
    convertible_suffixes = {".png", ".webp"}
    accepted_suffixes = jpeg_suffixes | convertible_suffixes

    # MD5 is used purely as a content fingerprint here, not for security.
    seen_hashes = {
        hashlib.md5(existing.read_bytes()).hexdigest()
        for existing in dest_dir.glob("*.jpg")
    }

    idx = start_idx
    for img_path in sorted(src_dir.glob("*")):
        suffix = img_path.suffix.lower()
        if suffix not in accepted_suffixes or not img_path.is_file():
            continue

        digest = hashlib.md5(img_path.read_bytes()).hexdigest()
        if digest in seen_hashes:
            continue
        seen_hashes.add(digest)

        # Never clobber an existing output file: numbering in dest_dir may
        # have gaps, so start_idx is not guaranteed to be free.
        dest = dest_dir / f"{idx:04d}.jpg"
        while dest.exists():
            idx += 1
            dest = dest_dir / f"{idx:04d}.jpg"

        if suffix in jpeg_suffixes:
            # Already JPEG: copy the bytes as-is to avoid a lossy re-encode.
            shutil.copy2(img_path, dest)
        else:
            try:
                # Imported lazily so Pillow is only needed when a non-JPEG shows up.
                from PIL import Image
                with Image.open(img_path) as image:
                    image.convert("RGB").save(dest, "JPEG", quality=92)
            except Exception as exc:
                # Best effort: drop any partially written output and move on.
                if dest.exists():
                    dest.unlink()
                print(f"    Skipping {img_path.name}: could not convert to JPEG ({exc})")
                continue
        idx += 1

    return idx


def main(argv=None):
    """
    Command-line entry point: crawl Bing for each query and collect unique images.

    For every query, images are downloaded into a private temporary folder,
    then merged into the output folder by dedup_and_rename(), which drops
    duplicates and assigns sequential "NNNN.jpg" names. Re-running against a
    non-empty output folder resumes numbering after the highest existing index.

    Args:
        argv: Optional list of argument strings; when None, argparse reads
              sys.argv[1:] (the normal command-line behavior).
    """
    # Local import keeps this function self-contained; only main() needs it.
    import tempfile

    parser = argparse.ArgumentParser(description="Download diverse outdoor car images from web")
    parser.add_argument("--out",       default="candidates/web", help="Output folder (default: candidates/web)")
    parser.add_argument("--per-query", type=int, default=40,    help="Images to download per search query (default: 40)")
    parser.add_argument("--queries",   nargs="+",               help="Override default search queries")
    args = parser.parse_args(argv)

    if args.per_query < 1:
        parser.error("--per-query must be a positive integer")

    out_dir = Path(args.out)
    out_dir.mkdir(parents=True, exist_ok=True)

    queries = args.queries if args.queries else QUERIES

    existing_files = list(out_dir.glob("*.jpg"))
    existing = len(existing_files)
    if existing:
        print(f"Resuming — {existing} images already in {out_dir}/\n")

    # Continue after the highest existing number rather than after the file
    # count: if the numbering has gaps (e.g. images were deleted), count + 1
    # would point at a file that still exists and overwrite it.
    used_indices = [int(p.stem) for p in existing_files if p.stem.isdecimal()]
    next_idx = max(used_indices, default=0) + 1
    total_so_far = existing

    # A fresh, uniquely named temp root avoids picking up stale downloads left
    # behind by an interrupted run and works on platforms without /tmp. It is
    # removed on exit even if the loop raises.
    with tempfile.TemporaryDirectory(prefix="_web_crawl_tmp_") as tmp_root:
        tmp_base = Path(tmp_root)

        for i, query in enumerate(queries, 1):
            tmp_dir = tmp_base / f"q{i:02d}"
            tmp_dir.mkdir(parents=True, exist_ok=True)

            print(f"[{i:2d}/{len(queries)}] '{query}' → downloading up to {args.per_query} images ...")
            try:
                try:
                    crawler = BingImageCrawler(
                        storage={"root_dir": str(tmp_dir)},
                        feeder_threads=2,
                        parser_threads=2,
                        downloader_threads=4,
                    )
                    crawler.crawl(
                        keyword=query,
                        max_num=args.per_query,
                        min_size=(400, 300),
                        file_idx_offset=0,
                    )
                except Exception as e:
                    # One failing query must not abort the whole run.
                    print(f"    Crawl error: {e}")
                    continue

                before = next_idx
                next_idx = dedup_and_rename(tmp_dir, out_dir, next_idx)
                added = next_idx - before
                total_so_far += added
                print(f"    Added {added} new images  (total so far: {total_so_far})")
            finally:
                # Free this query's downloads right away, on success and on
                # failure, so disk usage stays bounded to one query.
                shutil.rmtree(tmp_dir, ignore_errors=True)

    total = len(list(out_dir.glob("*.jpg")))
    print(f"\n{'='*55}")
    print(f"Downloaded : {total} unique images")
    print(f"Location   : {out_dir}/")
    print("\nNext step — filter for exterior outdoor shots with visible windows:")
    print(f"  python3 tools/internal/filter-candidates.py --source {out_dir} --threshold 0.65")
    print("=" * 55)


if __name__ == "__main__":
    main()
