// vtube-studio/hooks/useFaceTracking.ts

import { useEffect, useRef, useState, useCallback } from 'react';
import { FaceLandmarker, FilesetResolver, DrawingUtils } from '@mediapipe/tasks-vision';
import { TrackingData } from '../types';

/**
 * Converts one set of face landmarks (plus optional blendshape scores) into
 * the avatar-facing tracking values.
 *
 * Pure function: no React or MediaPipe state is touched.
 *
 * @param landmarks   Landmarks of a single face. The maths below assumes the
 *                    coordinates are normalized to the 0..1 image range
 *                    (translation is measured from 0.5) — confirm against the
 *                    MediaPipe output format if that ever changes.
 * @param blendshapes Blendshape categories of the same face, if available.
 * @returns The derived tracking data, or `null` when a required landmark is
 *          missing from the list.
 */
const computeTrackingData = (
  landmarks: ReadonlyArray<{ x: number; y: number }>,
  blendshapes: ReadonlyArray<{ categoryName: string; score: number }> | undefined,
): TrackingData | null => {
  const leftEye = landmarks[33]; // Outer left eye
  const rightEye = landmarks[263]; // Outer right eye
  const nose = landmarks[1];
  if (!leftEye || !rightEye || !nose) return null;

  // 1. Expression: look up the blendshape scores we care about (0 when absent).
  const scoreOf = (name: string): number =>
    blendshapes?.find(c => c.categoryName === name)?.score ?? 0;

  // 2. Pose (simplified): for 2D avatars a landmark delta is cleaner than the
  //    full transformation matrix.
  // Roll: angle of the line between the outer eye corners, in radians.
  const roll = Math.atan2(rightEye.y - leftEye.y, rightEye.x - leftEye.x);

  // Yaw / pitch: nose offset from the midpoint between the eyes, scaled by a
  // sensitivity factor of 2.
  const midPointX = (leftEye.x + rightEye.x) / 2;
  const midPointY = (leftEye.y + rightEye.y) / 2;
  const yaw = (nose.x - midPointX) * 2;
  const pitch = (nose.y - midPointY) * 2;

  return {
    rotationZ: roll,
    rotationY: yaw,
    rotationX: pitch,
    // Translation: nose position relative to the frame centre (0.5), scaled by 2.
    translationX: (nose.x - 0.5) * 2,
    translationY: (nose.y - 0.5) * 2,
    mouthOpen: scoreOf('jawOpen'),
    isBlinkingLeft: scoreOf('eyeBlinkLeft') > 0.5,
    isBlinkingRight: scoreOf('eyeBlinkRight') > 0.5,
  };
};

/**
 * React hook that runs MediaPipe FaceLandmarker against a video element and
 * exposes simplified head-pose / expression data for driving an avatar.
 *
 * The per-frame loop is driven by `requestAnimationFrame`. It keeps polling
 * while the model is still loading or the video is not ready, so
 * `startTracking` may safely be called before either is available.
 *
 * @param videoElement The video to analyse, or `null` while none is mounted.
 *                     The running loop always reads the most recent value.
 * @returns `isLoading` (model still loading or failed to load → false once
 *          settled), `isTracking`, the latest `trackingData`, and the
 *          `startTracking` / `stopTracking` controls.
 */
export const useFaceTracking = (videoElement: HTMLVideoElement | null) => {
  const [isTracking, setIsTracking] = useState(false);
  const [isLoading, setIsLoading] = useState(true);
  const faceLandmarkerRef = useRef<FaceLandmarker | null>(null);
  const requestRef = useRef<number | null>(null);
  const lastVideoTimeRef = useRef<number>(-1);
  // Mirror of the `videoElement` argument so the long-lived rAF loop never
  // keeps working on a stale element captured in an old closure.
  const videoElementRef = useRef<HTMLVideoElement | null>(videoElement);
  const [trackingData, setTrackingData] = useState<TrackingData>({
    rotationX: 0,
    rotationY: 0,
    rotationZ: 0,
    translationX: 0,
    translationY: 0,
    mouthOpen: 0,
    isBlinkingLeft: false,
    isBlinkingRight: false,
  });

  useEffect(() => {
    videoElementRef.current = videoElement;
    // A different element has an unrelated timeline; force a fresh detection.
    lastVideoTimeRef.current = -1;
  }, [videoElement]);

  // Initialize FaceLandmarker
  useEffect(() => {
    // Set by the cleanup so a late-resolving init does not leak a landmarker
    // or update state after unmount.
    let cancelled = false;

    const initMediaPipe = async (): Promise<void> => {
      try {
        // Use specific version to match index.html import and prevent version mismatch
        const filesetResolver = await FilesetResolver.forVisionTasks(
          "https://cdn.jsdelivr.net/npm/@mediapipe/tasks-vision@0.10.18/wasm"
        );

        const landmarker = await FaceLandmarker.createFromOptions(filesetResolver, {
          baseOptions: {
            modelAssetPath: `https://storage.googleapis.com/mediapipe-models/face_landmarker/face_landmarker/float16/1/face_landmarker.task`,
            delegate: "GPU"
          },
          outputFaceBlendshapes: true,
          outputFacialTransformationMatrixes: true,
          runningMode: "VIDEO",
          numFaces: 1
        });

        if (cancelled) {
          // The effect was torn down while we were loading: release immediately.
          landmarker.close();
          return;
        }
        faceLandmarkerRef.current = landmarker;
      } catch (error) {
        console.error("Failed to load MediaPipe:", error);
      } finally {
        if (!cancelled) setIsLoading(false);
      }
    };

    void initMediaPipe();

    return () => {
      cancelled = true;
      // Stop the frame loop before closing the landmarker it would call into.
      if (requestRef.current !== null) {
        cancelAnimationFrame(requestRef.current);
        requestRef.current = null;
      }
      faceLandmarkerRef.current?.close();
      faceLandmarkerRef.current = null;
    };
  }, []);

  const predict = useCallback(() => {
    const landmarker = faceLandmarkerRef.current;
    const video = videoElementRef.current;

    // Only predict once the model is loaded, the video has frame data
    // (readyState >= 2) and it has advanced to a new frame.
    if (
      landmarker &&
      video &&
      video.readyState >= 2 &&
      lastVideoTimeRef.current !== video.currentTime
    ) {
      lastVideoTimeRef.current = video.currentTime;

      // performance.now() is monotonic; wall-clock Date.now() can jump
      // backwards, which VIDEO-mode detection does not tolerate.
      const results = landmarker.detectForVideo(video, performance.now());

      const landmarks = results.faceLandmarks?.[0];
      if (landmarks) {
        const next = computeTrackingData(
          landmarks,
          results.faceBlendshapes?.[0]?.categories
        );
        if (next) setTrackingData(next);
      }
    }

    // Always schedule the next frame — even when nothing was ready this time —
    // so the loop survives until stopTracking() or unmount cancels it.
    requestRef.current = requestAnimationFrame(predict);
  }, []);

  const startTracking = useCallback(() => {
    // Guard against double starts spawning parallel loops.
    if (requestRef.current !== null) {
      cancelAnimationFrame(requestRef.current);
    }
    setIsTracking(true);
    requestRef.current = requestAnimationFrame(predict);
  }, [predict]);

  const stopTracking = useCallback(() => {
    setIsTracking(false);
    if (requestRef.current !== null) {
      cancelAnimationFrame(requestRef.current);
      requestRef.current = null;
    }
  }, []);

  return {
    isLoading,
    isTracking,
    trackingData,
    startTracking,
    stopTracking
  };
};