vtube-studio/hooks/useFaceTracking.ts
James Twose b6017794a5 feat: Initialize Gemini V-Studio project setup
Sets up the foundational project structure, including:
- Vite for build tooling.
- React for the UI.
- Tailwind CSS for styling.
- MediaPipe for face tracking capabilities.
- Gemini API integration for avatar generation.
- Basic configuration files (package.json, vite.config.ts, tsconfig.json).
- Initial README with local run instructions.
- Core types and a basic Gemini service for image generation.
2025-11-20 20:45:25 +01:00

140 lines
4.6 KiB
TypeScript

import { useEffect, useRef, useState, useCallback } from 'react';
import { FaceLandmarker, FilesetResolver, DrawingUtils } from '@mediapipe/tasks-vision';
import { TrackingData } from '../types';
/** Indices into the MediaPipe face landmark list used for the pose approximation. */
const NOSE_INDEX = 1;
const LEFT_EYE_OUTER_INDEX = 33;
const RIGHT_EYE_OUTER_INDEX = 263;

/** Blendshape score above which an eye is reported as blinking. */
const BLINK_THRESHOLD = 0.5;

/** Multiplier applied to the nose offsets used for yaw, pitch and translation. */
const OFFSET_SENSITIVITY = 2;

/**
 * Converts one face's landmarks and blendshapes into avatar tracking values.
 *
 * Roll is the angle (radians, via `Math.atan2`) of the line between the two outer
 * eye corners. Yaw and pitch are NOT angles: they are the nose's horizontal /
 * vertical offset from the eye midpoint, scaled by `OFFSET_SENSITIVITY`.
 * Translation is the nose's offset from 0.5 on each axis, which assumes the
 * landmark coordinates are normalized to the frame.
 *
 * @param landmarks   Landmarks of a single detected face.
 * @param blendshapes Blendshape categories for the same face, if available.
 * @returns The derived tracking data, or `null` when a required landmark is missing.
 */
const deriveTrackingData = (
  landmarks: ReadonlyArray<{ x: number; y: number }>,
  blendshapes: ReadonlyArray<{ categoryName: string; score: number }> | undefined,
): TrackingData | null => {
  const leftEye = landmarks[LEFT_EYE_OUTER_INDEX];
  const rightEye = landmarks[RIGHT_EYE_OUTER_INDEX];
  const nose = landmarks[NOSE_INDEX];
  // A truncated landmark list would otherwise throw on property access below.
  if (!leftEye || !rightEye || !nose) return null;

  // Missing blendshapes (or a missing category) fall back to a neutral score of 0.
  const scoreOf = (name: string): number =>
    blendshapes?.find((category) => category.categoryName === name)?.score ?? 0;

  const eyeMidX = (leftEye.x + rightEye.x) / 2;
  const eyeMidY = (leftEye.y + rightEye.y) / 2;

  return {
    rotationZ: Math.atan2(rightEye.y - leftEye.y, rightEye.x - leftEye.x),
    rotationY: (nose.x - eyeMidX) * OFFSET_SENSITIVITY,
    rotationX: (nose.y - eyeMidY) * OFFSET_SENSITIVITY,
    translationX: (nose.x - 0.5) * OFFSET_SENSITIVITY,
    translationY: (nose.y - 0.5) * OFFSET_SENSITIVITY,
    mouthOpen: scoreOf('jawOpen'),
    isBlinkingLeft: scoreOf('eyeBlinkLeft') > BLINK_THRESHOLD,
    isBlinkingRight: scoreOf('eyeBlinkRight') > BLINK_THRESHOLD,
  };
};

/**
 * React hook that runs MediaPipe face landmark detection on a video element and
 * exposes simplified head-pose / expression values for driving an avatar.
 *
 * The landmarker is created once on mount and closed on unmount. While tracking
 * is enabled, a `requestAnimationFrame` loop polls the video; the loop keeps
 * running (idle) while the model is still loading or the video is not ready, so
 * `startTracking` may safely be called before either is available.
 *
 * @param videoElement Video to analyse, or `null` while none is mounted.
 * @returns `isLoading` (model still initialising), `isTracking` (loop enabled),
 *          the latest `trackingData`, and the `startTracking` / `stopTracking` controls.
 */
export const useFaceTracking = (videoElement: HTMLVideoElement | null) => {
  const [isTracking, setIsTracking] = useState(false);
  const [isLoading, setIsLoading] = useState(true);
  const faceLandmarkerRef = useRef<FaceLandmarker | null>(null);
  const requestRef = useRef<number | null>(null);
  const lastVideoTimeRef = useRef<number>(-1);
  const [trackingData, setTrackingData] = useState<TrackingData>({
    rotationX: 0,
    rotationY: 0,
    rotationZ: 0,
    translationX: 0,
    translationY: 0,
    mouthOpen: 0,
    isBlinkingLeft: false,
    isBlinkingRight: false,
  });

  // Initialize FaceLandmarker
  useEffect(() => {
    // Set by the cleanup so a late-resolving init neither leaks the landmarker
    // nor updates state after unmount.
    let disposed = false;

    const initMediaPipe = async (): Promise<void> => {
      try {
        // Use specific version to match index.html import and prevent version mismatch
        const filesetResolver = await FilesetResolver.forVisionTasks(
          "https://cdn.jsdelivr.net/npm/@mediapipe/tasks-vision@0.10.18/wasm"
        );
        const landmarker = await FaceLandmarker.createFromOptions(filesetResolver, {
          baseOptions: {
            modelAssetPath: `https://storage.googleapis.com/mediapipe-models/face_landmarker/face_landmarker/float16/1/face_landmarker.task`,
            delegate: "GPU"
          },
          outputFaceBlendshapes: true,
          outputFacialTransformationMatrixes: true,
          runningMode: "VIDEO",
          numFaces: 1
        });

        if (disposed) {
          // The effect was torn down while loading; nobody else will close this instance.
          landmarker.close();
          return;
        }
        faceLandmarkerRef.current = landmarker;
      } catch (error) {
        console.error("Failed to load MediaPipe:", error);
      } finally {
        if (!disposed) setIsLoading(false);
      }
    };

    void initMediaPipe();

    return () => {
      disposed = true;
      faceLandmarkerRef.current?.close();
      // Null the ref so a still-running frame loop cannot use the closed instance.
      faceLandmarkerRef.current = null;
    };
  }, []);

  // Frame loop: owned by an effect so it is cancelled on stop, on unmount, and
  // restarted (never duplicated) when the video element changes.
  useEffect(() => {
    if (!isTracking) return undefined;

    // Forget the previous video's timestamp so the first frame of a new video is processed.
    lastVideoTimeRef.current = -1;

    const processFrame = (): void => {
      const landmarker = faceLandmarkerRef.current;
      const videoReady = videoElement !== null && videoElement.readyState >= 2;

      // Only run detection when the video has advanced to a new frame.
      if (landmarker && videoReady && videoElement.currentTime !== lastVideoTimeRef.current) {
        lastVideoTimeRef.current = videoElement.currentTime;
        // performance.now() is monotonic, unlike wall-clock Date.now().
        const results = landmarker.detectForVideo(videoElement, performance.now());
        const landmarks = results.faceLandmarks?.[0];
        if (landmarks) {
          const next = deriveTrackingData(landmarks, results.faceBlendshapes?.[0]?.categories);
          if (next) setTrackingData(next);
        }
      }

      // Always re-schedule: not-ready-yet must not terminate the loop.
      requestRef.current = requestAnimationFrame(processFrame);
    };

    requestRef.current = requestAnimationFrame(processFrame);

    return () => {
      if (requestRef.current !== null) {
        cancelAnimationFrame(requestRef.current);
        requestRef.current = null;
      }
    };
  }, [isTracking, videoElement]);

  /** Enables the frame loop; calling it while already tracking is a no-op. */
  const startTracking = useCallback(() => {
    setIsTracking(true);
  }, []);

  /** Disables the frame loop; the last `trackingData` value is retained. */
  const stopTracking = useCallback(() => {
    setIsTracking(false);
  }, []);

  return {
    isLoading,
    isTracking,
    trackingData,
    startTracking,
    stopTracking
  };
};