import logging
import re
import urllib.parse
import urllib.request
from pathlib import Path

logging.basicConfig(level=logging.INFO)


def _fetch_hf_html(repo_id: str, folder_path: str) -> str:
    """Fetch HTML from HuggingFace tree page."""
    url = f"https://huggingface.co/datasets/{repo_id}/tree/main/{folder_path}"
    with urllib.request.urlopen(url) as response:
        return response.read().decode("utf-8")


def list_hf_subfolders(repo_id: str, folder_path: str) -> list[str]:
    """List subfolders in a HuggingFace dataset folder."""
    try:
        html = _fetch_hf_html(repo_id, folder_path)
        pattern = rf'/datasets/{repo_id}/tree/main/({folder_path}/[^"/?]+)'
        return sorted(set(re.findall(pattern, html)))
    except Exception as e:
        logging.error(f"Failed to list subfolders in {folder_path}: {e}")
        return []


def list_hf_files(
    repo_id: str,
    folder_path: str,
    extensions: tuple = (".jpg", ".jpeg", ".png", ".webp"),
) -> list[str]:
    """List image files in a HuggingFace dataset folder."""
    try:
        html = _fetch_hf_html(repo_id, folder_path)
        pattern = rf'/datasets/{repo_id}/blob/main/({folder_path}/[^"]+?({"|".join(e for e in extensions)}))'
        return [urllib.parse.unquote(match[0]) for match in re.findall(pattern, html)]
    except Exception as e:
        logging.error(f"Failed to list files in {folder_path}: {e}")
        return []


def download_test_images(save_dir: Path, repo_folder: str, repo_id: str) -> Path:
    """Download the test_images/ folder from the HF test dataset repo"""
    # Discover all subfolders and collect files
    subfolders = list_hf_subfolders(repo_id, repo_folder)
    if not subfolders:
        logging.warning(f"No subfolders found in {repo_folder}")
        return save_dir

    all_files = [f for folder in subfolders for f in list_hf_files(repo_id, folder)]
    if not all_files:
        logging.warning(f"No image files found in {repo_folder}")
        return save_dir

    logging.info(f"Found {len(all_files)} files from {len(subfolders)} folders")
    # Download files, preserving folder structure
    save_dir_path = Path(save_dir)
    downloaded = 0
    skipped = 0
    for file_path in all_files:
        relative_path = Path(file_path).relative_to(repo_folder)
        save_path = save_dir_path / relative_path
        if save_path.exists():
            logging.info(f"Skipping {relative_path} (already exists)")
            skipped += 1
            continue

        save_path.parent.mkdir(parents=True, exist_ok=True)
        url = f"https://huggingface.co/datasets/{repo_id}/resolve/main/{file_path}"
        logging.info(f"Downloading {relative_path}...")
        urllib.request.urlretrieve(url, save_path)
        downloaded += 1

    logging.info(f"Downloaded {downloaded} files, skipped {skipped} existing files")
    return save_dir_path


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    download_test_images(
        repo_id="ssitu/ultimatesdupscale_test",
        save_dir=Path("./test/test_images/"),
        repo_folder="test_images",
    )