import io
import os
import traceback
from typing import Dict, List, Union

import numpy as np
import torch
from PIL import Image, ImageDraw
from loguru import logger
from ultralytics import YOLO

# Try to import transformers for Grounding DINO
try:
    from transformers import AutoModelForZeroShotObjectDetection, AutoProcessor

    TRANSFORMERS_AVAILABLE = True
except ImportError:
    TRANSFORMERS_AVAILABLE = False
    logger.warning("transformers not available, Grounding DINO method will not work")


class FaceDetector:
    """
    Face detection using multiple methods

    Supports two detection methods:

    1. YOLO World (method='yolo'):
       - Open-vocabulary detection
       - Supports various face types: human, animal, anime, sketch
       - More flexible but slower
       - Can detect custom classes via text description

    2. Grounding DINO (method='grounding'):
       - Open-vocabulary object detection
       - Supports various face types via text prompts
       - Requires transformers library
       - Good balance between accuracy and flexibility
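
    Example (a minimal sketch, assuming a local image "photo.jpg";
    default weights are downloaded on first use):

        detector = FaceDetector(method="yolo", custom_classes=["human face"])
        result = detector.detect_faces("photo.jpg")
        print(f"found {len(result['faces'])} face(s)")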
    """

    def __init__(
        self,
        method: str = "yolo",
        model_path: str = None,
        conf_threshold: float = None,
        device: str = None,
        custom_classes: List[str] = None,
        cascade_path: str = None,
    ):
        """
        Initialize face detector

        Args:
            method: Detection method. Options:
                - "yolo": Use YOLO World (supports various face types)
                - "grounding": Use Grounding DINO (requires transformers)
                Default: "yolo"
            model_path: Model path (YOLO weights) or HuggingFace model id (Grounding DINO)
                If None, uses the default model for the chosen method
            conf_threshold: Confidence threshold (0.0-1.0)
                If None, uses an adaptive threshold based on the classes
            device: Device for inference ('cpu', 'cuda', '0', '1', etc.), None for auto
            custom_classes: List of class names used as detection prompts.
                Default: ["human face", "animal face", "anime face", "sketch face"]
                Examples: ["face"], ["animal face"], ["human face", "animal face"]
            cascade_path: Unused; retained for backward compatibility
        """

        self.method = method.lower()
        self.device = device

        if self.method == "yolo":
            # Initialize YOLO World detector
            # Set custom classes (defaults to the standard face prompts)
            if custom_classes is None:
                custom_classes = ["human face", "animal face", "anime face", "sketch face"]
            self.custom_classes = custom_classes

            # Adaptive confidence threshold based on class specificity
            if conf_threshold is None:
                if len(custom_classes) > 1:
                    # Multiple classes: use a lower threshold to catch all detections
                    conf_threshold = 0.1
                elif len(custom_classes) == 1:
                    # Single class, generic ("face") or specific ("animal face"):
                    # a moderate threshold works well
                    conf_threshold = 0.15
                else:
                    conf_threshold = 0.25
            self.conf_threshold = conf_threshold

            if model_path is None:
                # Use YOLO World model for open-vocabulary detection
                logger.info("Loading YOLO World model for face detection")
                try:
                    # Try to load YOLO World small model first (lighter and faster)
                    self.model = YOLO("yolov8s-world.pt")
                except Exception as e:
                    logger.warning(f"Failed to load yolov8s-world.pt, trying yolov8m-world.pt: {e}")
                    try:
                        self.model = YOLO("yolov8m-world.pt")
                    except Exception as e2:
                        logger.warning(f"Failed to load yolov8m-world.pt, trying yolov8l-world.pt: {e2}")
                        self.model = YOLO("yolov8l-world.pt")
                # Set custom classes for YOLO World
                # YOLO World can detect any object described in natural language
                self.model.set_classes(self.custom_classes)
            else:
                logger.info(f"Loading YOLO World model from {model_path}")
                self.model = YOLO(model_path)

            logger.info(f"Face detector initialized with YOLO World, custom classes: {self.custom_classes}, confidence threshold: {self.conf_threshold}")
            self.face_cascade = None

        elif self.method == "grounding":
            # Initialize Grounding DINO detector
            if not TRANSFORMERS_AVAILABLE:
                raise ImportError("transformers library is required for Grounding DINO. Install it with: pip install transformers torch")

            # Set up proxy for HuggingFace model download
            # Check if proxy is already set, if not try to use common proxy settings
            if not os.environ.get("HTTP_PROXY") and not os.environ.get("http_proxy"):
                # Try to use HTTPS_PROXY for HTTP requests as well if available
                https_proxy = os.environ.get("HTTPS_PROXY") or os.environ.get("https_proxy")
                if https_proxy:
                    os.environ["HTTP_PROXY"] = https_proxy
                    os.environ["http_proxy"] = https_proxy
                    logger.info(f"Using proxy from HTTPS_PROXY: {https_proxy}")

            # Log proxy settings
            http_proxy = os.environ.get("HTTP_PROXY") or os.environ.get("http_proxy")
            https_proxy = os.environ.get("HTTPS_PROXY") or os.environ.get("https_proxy")
            if http_proxy or https_proxy:
                logger.info(f"Using proxy - HTTP: {http_proxy}, HTTPS: {https_proxy}")

            # Set custom classes (defaults to the standard face prompts)
            if custom_classes is None:
                custom_classes = ["human face", "animal face", "anime face", "sketch face"]
            self.custom_classes = custom_classes

            # Adaptive confidence threshold
            if conf_threshold is None:
                if len(custom_classes) > 1:
                    conf_threshold = 0.1
                else:
                    conf_threshold = 0.3  # Grounding DINO typically needs higher threshold
            self.conf_threshold = conf_threshold

            # Load Grounding DINO model
            model_id = "IDEA-Research/grounding-dino-base"  # or "grounding-dino-tiny" for faster inference
            if model_path is not None:
                model_id = model_path
            logger.info(f"Loading Grounding DINO model: {model_id}")
            try:
                # Grounding DINO requires trust_remote_code=True
                self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
                self.model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id, trust_remote_code=True)
                if device:
                    self.model = self.model.to(device)
                logger.info(f"Face detector initialized with Grounding DINO, custom classes: {self.custom_classes}, confidence threshold: {self.conf_threshold}")
            except Exception as e:
                error_msg = str(e)
                if "connection" in error_msg.lower() or "proxy" in error_msg.lower() or "network" in error_msg.lower():
                    logger.error(f"Failed to download model. Please check your network connection and proxy settings.")
                    logger.error(f"Current proxy settings - HTTP_PROXY: {http_proxy}, HTTPS_PROXY: {https_proxy}")
                    logger.error("You can set proxy with: export http_proxy=... && export https_proxy=...")
                raise
            self.face_cascade = None

        else:
            raise ValueError(f"Unknown method: {method}. Must be 'yolo', or 'grounding'")

    def detect_faces(
        self,
        image: Union[str, Image.Image, bytes, np.ndarray],
        return_image: bool = False,
    ) -> Dict:
        """
        Detect faces in image

        Args:
            image: Input image, can be path, PIL Image, bytes or numpy array
            return_image: Whether to return annotated image with detection boxes

        Returns:
            Dict containing:
                - faces: List of face detection results, each containing:
                    - bbox: [x1, y1, x2, y2] bounding box coordinates (absolute pixel coordinates)
                    - confidence: Confidence score (0.0-1.0)
                    - class_id: Class ID
                    - class_name: Class name
                    - face_type: Type of face detected
                - image (optional): PIL Image with detection boxes drawn (if return_image=True)
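
        Example (a sketch, assuming a local image "photo.jpg"):
            result = detector.detect_faces("photo.jpg", return_image=True)
            for face in result["faces"]:
                print(face["face_type"], face["bbox"], face["confidence"])
            result["image"].save("annotated.png")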
        """
        try:
            if self.method == "grounding":
                return self._detect_faces_grounding(image, return_image)
            elif self.method == "yolo":
                return self._detect_faces_yolo(image, return_image)
        except Exception as e:
            logger.error(f"Face detection failed: {traceback.format_exc()}")
            raise RuntimeError(f"Face detection error: {e}") from e

    def _detect_faces_yolo(
        self,
        image: Union[str, Image.Image, bytes, np.ndarray],
        return_image: bool = False,
    ) -> Dict:
        """Detect faces using YOLO World"""
        # Load image
        if isinstance(image, str):
            img = Image.open(image).convert("RGB")
        elif isinstance(image, bytes):
            img = Image.open(io.BytesIO(image)).convert("RGB")
        elif isinstance(image, np.ndarray):
            img = Image.fromarray(image).convert("RGB")
        elif isinstance(image, Image.Image):
            img = image.convert("RGB")
        else:
            raise ValueError(f"Unsupported image type: {type(image)}")

        # Use YOLO World for open-vocabulary detection
        # YOLO World detects objects based on the custom classes set via set_classes()
        results = self.model.predict(
            source=img,
            conf=self.conf_threshold,
            device=self.device,
            verbose=False,
        )

        faces = []
        annotated_img = img.copy() if return_image else None

        if len(results) > 0:
            result = results[0]
            boxes = result.boxes

            if boxes is not None and len(boxes) > 0:
                for i in range(len(boxes)):
                    # Get bounding box coordinates (xyxy format)
                    bbox = boxes.xyxy[i].cpu().numpy().tolist()
                    confidence = float(boxes.conf[i].cpu().numpy())
                    class_id = int(boxes.cls[i].cpu().numpy())

                    # Get class name from custom classes list
                    # YOLO World returns class_id that corresponds to index in custom_classes
                    if class_id < len(self.custom_classes):
                        class_name = self.custom_classes[class_id]
                    else:
                        class_name = result.names.get(class_id, "unknown")

                    # Determine face type based on class name
                    # For "face" class, it can detect all types of faces
                    if class_name.lower() == "face":
                        face_type = "face"  # Generic face type (can be human, animal, anime, etc.)
                    elif any(keyword in class_name.lower() for keyword in ["human", "person"]):
                        face_type = "human"
                    elif any(keyword in class_name.lower() for keyword in ["animal", "cat", "dog", "bird", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe"]):
                        face_type = "animal"
                    elif any(keyword in class_name.lower() for keyword in ["anime", "cartoon", "manga"]):
                        face_type = "anime"
                    elif any(keyword in class_name.lower() for keyword in ["sketch", "line", "drawing"]):
                        face_type = "sketch"
                    else:
                        logger.debug(f"Dropping detection with unrecognized class: {class_name}")
                        face_type = None

                    face_info = {
                        "bbox": bbox,  # [x1, y1, x2, y2] - absolute pixel coordinates
                        "confidence": confidence,
                        "class_id": class_id,
                        "class_name": class_name,
                        "face_type": face_type,
                    }
                    if face_type is not None:
                        faces.append(face_info)

                    # Draw annotations on image if needed
                    if return_image and annotated_img is not None:
                        draw = ImageDraw.Draw(annotated_img)
                        x1, y1, x2, y2 = bbox
                        # Draw bounding box
                        draw.rectangle(
                            [x1, y1, x2, y2],
                            outline="red",
                            width=2,
                        )
                        # Draw label
                        label = f"{class_name} {confidence:.2f}"
                        draw.text((x1, y1 - 15), label, fill="red")

        result_dict = {"faces": faces}

        if return_image and annotated_img is not None:
            result_dict["image"] = annotated_img

        logger.info(f"Detected {len(faces)} faces using YOLO World")
        return result_dict

    def _calculate_iou(self, bbox1: List[float], bbox2: List[float]) -> float:
        """
        Calculate Intersection over Union (IoU) between two bounding boxes

        Args:
            bbox1: [x1, y1, x2, y2] format
            bbox2: [x1, y1, x2, y2] format

        Returns:
            IoU value between 0 and 1
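
        Example: bbox1=[0, 0, 10, 10] and bbox2=[5, 5, 15, 15] give
            intersection = 5 * 5 = 25, union = 100 + 100 - 25 = 175,
            IoU = 25 / 175 ≈ 0.143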
        """
        x1_1, y1_1, x2_1, y2_1 = bbox1
        x1_2, y1_2, x2_2, y2_2 = bbox2

        # Calculate intersection area
        inter_x1 = max(x1_1, x1_2)
        inter_y1 = max(y1_1, y1_2)
        inter_x2 = min(x2_1, x2_2)
        inter_y2 = min(y2_1, y2_2)

        if inter_x2 <= inter_x1 or inter_y2 <= inter_y1:
            return 0.0

        inter_area = (inter_x2 - inter_x1) * (inter_y2 - inter_y1)

        # Calculate union area
        area1 = (x2_1 - x1_1) * (y2_1 - y1_1)
        area2 = (x2_2 - x1_2) * (y2_2 - y1_2)
        union_area = area1 + area2 - inter_area

        if union_area == 0:
            return 0.0

        return inter_area / union_area

    def _calculate_bbox_area(self, bbox: List[float]) -> float:
        """Calculate the area of a bounding box"""
        x1, y1, x2, y2 = bbox
        return (x2 - x1) * (y2 - y1)

    def _calculate_containment(self, bbox_small: List[float], bbox_large: List[float]) -> float:
        """
        Calculate how much of bbox_small is contained in bbox_large
        Returns the ratio of intersection area to bbox_small area
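
        Example: a 10x10 box fully inside a 100x100 box returns 1.0;
        a box with half its area inside returns 0.5.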
        """
        x1_s, y1_s, x2_s, y2_s = bbox_small
        x1_l, y1_l, x2_l, y2_l = bbox_large

        # Calculate intersection
        inter_x1 = max(x1_s, x1_l)
        inter_y1 = max(y1_s, y1_l)
        inter_x2 = min(x2_s, x2_l)
        inter_y2 = min(y2_s, y2_l)

        if inter_x2 <= inter_x1 or inter_y2 <= inter_y1:
            return 0.0

        inter_area = (inter_x2 - inter_x1) * (inter_y2 - inter_y1)
        small_area = (x2_s - x1_s) * (y2_s - y1_s)

        if small_area == 0:
            return 0.0

        return inter_area / small_area

    def _apply_nms(self, faces: List[Dict], iou_threshold: float = 0.4, containment_threshold: float = 0.6) -> List[Dict]:
        """
        Apply Non-Maximum Suppression (NMS) to remove duplicate detections.
        When detections overlap, keeps the one with larger area (preferring whole objects over parts).

        Args:
            faces: List of face detection dictionaries
            iou_threshold: IoU threshold for considering detections as duplicates
            containment_threshold: If a smaller box is contained in a larger box by this ratio, suppress it

        Returns:
            Filtered list of faces with duplicates removed
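
        Example: bbox [0, 0, 100, 100] (conf 0.5) vs. [10, 10, 90, 90] (conf 0.9):
            IoU = 6400 / 10000 = 0.64 > 0.4, so the smaller box is suppressed
            and the larger one is kept despite its lower confidence.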
        """
        if len(faces) == 0:
            return faces

        # Sort by area (largest first), then by confidence as tie-breaker
        # This ensures we keep the larger detection (whole object) over smaller ones (parts)
        for face in faces:
            face["_area"] = self._calculate_bbox_area(face["bbox"])

        sorted_faces = sorted(faces, key=lambda x: (x["_area"], x["confidence"]), reverse=True)

        keep = []
        suppressed = set()

        for i, face in enumerate(sorted_faces):
            if i in suppressed:
                continue

            keep.append(face)
            bbox_i = face["bbox"]
            area_i = face["_area"]

            # Suppress overlapping detections (prefer larger area)
            for j in range(i + 1, len(sorted_faces)):
                if j in suppressed:
                    continue

                bbox_j = sorted_faces[j]["bbox"]
                area_j = sorted_faces[j]["_area"]

                # Check IoU overlap
                iou = self._calculate_iou(bbox_i, bbox_j)
                if iou > iou_threshold:
                    # If overlapping, suppress the smaller one
                    suppressed.add(j)
                    continue

                # Check if smaller box is mostly contained in larger box
                # (e.g., face is contained in whole animal body)
                # Since we sorted by area, area_i >= area_j for j > i
                if area_j < area_i:
                    containment = self._calculate_containment(bbox_j, bbox_i)
                    if containment > containment_threshold:
                        suppressed.add(j)

        # Clean up temporary area field
        for face in keep:
            face.pop("_area", None)

        logger.info(f"NMS filtered {len(faces)} detections to {len(keep)} (IoU threshold: {iou_threshold}, containment threshold: {containment_threshold}, prefer larger area)")
        return keep

    def _detect_faces_grounding(
        self,
        image: Union[str, Image.Image, bytes, np.ndarray],
        return_image: bool = False,
    ) -> Dict:
        """Detect faces using Grounding DINO"""
        # Load image
        if isinstance(image, str):
            img = Image.open(image).convert("RGB")
        elif isinstance(image, bytes):
            img = Image.open(io.BytesIO(image)).convert("RGB")
        elif isinstance(image, np.ndarray):
            img = Image.fromarray(image).convert("RGB")
        elif isinstance(image, Image.Image):
            img = image.convert("RGB")
        else:
            raise ValueError(f"Unsupported image type: {type(image)}")

        # Prepare text prompt - join custom classes with ". " separator
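        # e.g. ["human face", "animal face"] -> "human face. animal face."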
        text_prompt = ". ".join(self.custom_classes)
        if not text_prompt.endswith("."):
            text_prompt += "."

        # Process image and text
        inputs = self.processor(images=img, text=text_prompt, return_tensors="pt")
        if self.device:
            inputs = {k: v.to(self.device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}

        # Run inference
        with torch.no_grad():
            outputs = self.model(**inputs)

        # Post-process results
        # Note: Grounding DINO uses 'threshold' instead of 'box_threshold'
        results = self.processor.post_process_grounded_object_detection(
            outputs,
            input_ids=inputs["input_ids"],
            threshold=self.conf_threshold,
            text_threshold=self.conf_threshold,
            target_sizes=[img.size[::-1]],  # [height, width]
        )

        faces = []
        annotated_img = img.copy() if return_image else None

        if len(results) > 0:
            result = results[0]

            # Get detections
            # Use text_labels instead of labels to avoid FutureWarning
            boxes = result.get("boxes", [])
            text_labels = result.get("text_labels", [])
            # Fallback to labels if text_labels not available
            if not text_labels:
                labels = result.get("labels", [])
                # Convert label IDs to class names if needed
                text_labels = [self.custom_classes[label] if isinstance(label, int) and label < len(self.custom_classes) else str(label) for label in labels]
            scores = result.get("scores", [])

            for i, (box, label, score) in enumerate(zip(boxes, text_labels, scores)):
                # Grounding DINO returns boxes as [x1, y1, x2, y2]
                if isinstance(box, torch.Tensor):
                    bbox = box.tolist()
                else:
                    bbox = list(box)
                # post_process_grounded_object_detection returns absolute
                # [x1, y1, x2, y2] coordinates when target_sizes is given
                if len(bbox) != 4:
                    raise ValueError(f"Unexpected bbox format: {bbox}")
                bbox = [float(bbox[0]), float(bbox[1]), float(bbox[2]), float(bbox[3])]

                # Get class name from label
                # Grounding DINO may return multiple class names concatenated
                class_name_raw = label if isinstance(label, str) else self.custom_classes[0]

                # If class_name contains multiple classes, try to extract the most specific one
                # Priority: specific classes (animal, anime, sketch) > human > generic face
                class_name = class_name_raw
                if isinstance(class_name_raw, str) and len(self.custom_classes) > 1:
                    class_name_lower = class_name_raw.lower()
                    # Check for specific classes first
                    if any(keyword in class_name_lower for keyword in ["animal"]):
                        for c in self.custom_classes:
                            if "animal" in c.lower():
                                class_name = c
                                break
                    elif any(keyword in class_name_lower for keyword in ["anime", "cartoon"]):
                        for c in self.custom_classes:
                            if any(k in c.lower() for k in ["anime", "cartoon"]):
                                class_name = c
                                break
                    elif any(keyword in class_name_lower for keyword in ["sketch", "line", "drawing"]):
                        for c in self.custom_classes:
                            if any(k in c.lower() for k in ["sketch", "line", "drawing"]):
                                class_name = c
                                break
                    elif any(keyword in class_name_lower for keyword in ["human", "person"]):
                        for c in self.custom_classes:
                            if any(k in c.lower() for k in ["human", "person"]):
                                class_name = c
                                break

                # Determine face type based on class name
                if class_name.lower() == "face":
                    face_type = "face"
                elif any(keyword in class_name.lower() for keyword in ["human", "person"]):
                    face_type = "human"
                elif any(keyword in class_name.lower() for keyword in ["animal", "cat", "dog", "bird"]):
                    face_type = "animal"
                elif any(keyword in class_name.lower() for keyword in ["anime", "cartoon", "manga"]):
                    face_type = "anime"
                elif any(keyword in class_name.lower() for keyword in ["sketch", "line", "drawing"]):
                    face_type = "sketch"
                else:
                    face_type = class_name.lower()

                face_info = {
                    "bbox": bbox,
                    "confidence": float(score),
                    "class_id": i,
                    "class_name": class_name,
                    "face_type": face_type,
                }
                faces.append(face_info)

                # Draw annotations if needed
                if return_image and annotated_img is not None:
                    draw = ImageDraw.Draw(annotated_img)
                    x1, y1, x2, y2 = bbox
                    draw.rectangle([x1, y1, x2, y2], outline="red", width=2)
                    label = f"{class_name} {score:.2f}"
                    draw.text((x1, y1 - 15), label, fill="red")

        # Apply NMS to remove duplicate detections (only when multiple classes are specified)
        if len(self.custom_classes) > 1:
            faces = self._apply_nms(faces, iou_threshold=0.4, containment_threshold=0.6)
            # Re-draw annotations after NMS if needed
            if return_image and annotated_img is not None and len(faces) > 0:
                annotated_img = img.copy()
                draw = ImageDraw.Draw(annotated_img)
                for face in faces:
                    x1, y1, x2, y2 = face["bbox"]
                    draw.rectangle([x1, y1, x2, y2], outline="red", width=2)
                    label = f"{face['class_name']} {face['confidence']:.2f}"
                    draw.text((x1, y1 - 15), label, fill="red")

        result_dict = {"faces": faces}
        if return_image and annotated_img is not None:
            result_dict["image"] = annotated_img

        logger.info(f"Detected {len(faces)} faces using Grounding DINO (after NMS)")
        return result_dict

    def detect_faces_from_bytes(self, image_bytes: bytes, **kwargs) -> Dict:
        """
        Detect faces from byte data

        Args:
            image_bytes: Image byte data
            **kwargs: Additional parameters passed to detect_faces

        Returns:
            Detection result dictionary
        """
        return self.detect_faces(image_bytes, **kwargs)

    def extract_face_regions(self, image: Union[str, Image.Image, bytes], expand_ratio: float = 0.1) -> List[Image.Image]:
        """
        Extract detected face regions

        Args:
            image: Input image
            expand_ratio: Bounding box expansion ratio to include more context

        Returns:
            List of extracted face region images
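
        Example: with expand_ratio=0.1, bbox [100, 100, 200, 200] grows by
        10 px on each side to [90, 90, 210, 210], clamped to the image bounds.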
        """
        result = self.detect_faces(image)
        faces = result["faces"]

        # Load original image
        if isinstance(image, str):
            img = Image.open(image).convert("RGB")
        elif isinstance(image, bytes):
            img = Image.open(io.BytesIO(image)).convert("RGB")
        elif isinstance(image, Image.Image):
            img = image.convert("RGB")
        else:
            raise ValueError(f"Unsupported image type: {type(image)}")

        face_regions = []
        img_width, img_height = img.size

        for face in faces:
            x1, y1, x2, y2 = face["bbox"]

            # Expand bounding box
            width = x2 - x1
            height = y2 - y1
            expand_x = width * expand_ratio
            expand_y = height * expand_ratio

            x1 = max(0, int(x1 - expand_x))
            y1 = max(0, int(y1 - expand_y))
            x2 = min(img_width, int(x2 + expand_x))
            y2 = min(img_height, int(y2 + expand_y))

            # Crop region
            face_region = img.crop((x1, y1, x2, y2))
            face_regions.append(face_region)

        return face_regions

    def count_faces(self, image: Union[str, Image.Image, bytes]) -> int:
        """
        Count number of faces in image

        Args:
            image: Input image

        Returns:
            Number of detected faces
        """
        result = self.detect_faces(image, return_image=False)
        return len(result["faces"])


def detect_faces_in_image(
    image_path: str,
    method: str = "grounding",
    model_path: str = None,
    conf_threshold: float = None,
    return_image: bool = False,
    custom_classes: List[str] = None,
) -> Dict:
    """
    Convenience function: detect faces in image

    Args:
        image_path: Image path
        method: Detection method ("yolo" or "grounding"), default "grounding"
        model_path: Model path (YOLO weights) or HuggingFace model id (Grounding DINO)
        conf_threshold: Confidence threshold (None for an adaptive threshold)
        return_image: Whether to return annotated image
        custom_classes: List of class names used as detection prompts
            (default: ["human face", "animal face", "anime face", "sketch face"])

    Returns:
        Detection result dictionary containing:
            - faces: List of face detection results with bbox coordinates [x1, y1, x2, y2]
              Each face contains: bbox, confidence, class_id, class_name, face_type
            - image (optional): Annotated image with detection boxes

    Examples:
        # Detect faces using YOLO World with the default classes
        result = detect_faces_in_image("image.jpg", method="yolo")

        # Detect with YOLO World and custom classes
        result = detect_faces_in_image("image.jpg", method="yolo",
                                      custom_classes=["human face", "animal face"])

        # Detect with Grounding DINO
        result = detect_faces_in_image("image.jpg", method="grounding",
                                      custom_classes=["animal face"])
    """
    detector = FaceDetector(
        method=method,
        model_path=model_path,
        conf_threshold=conf_threshold,
        custom_classes=custom_classes,
    )
    return detector.detect_faces(image_path, return_image=return_image)


if __name__ == "__main__":
    # Test code
    import sys

    if len(sys.argv) < 2:
        print("Usage: python face_detector.py <image_path>")
        sys.exit(1)

    image_path = sys.argv[1]
    detector = FaceDetector()
    result = detector.detect_faces(image_path, return_image=True)

    print(f"Detected {len(result['faces'])} faces:")
    for i, face in enumerate(result["faces"]):
        print(f"  Face {i + 1}: {face}")

    output_path = "detected_faces.png"
    result["image"].save(output_path)
    print(f"Annotated image saved to: {output_path}")