Sets up the foundational project structure, including Vite for build tooling, React for the UI, Tailwind CSS for styling, MediaPipe for face tracking capabilities, Gemini API integration for avatar generation, basic configuration files (package.json, vite.config.ts, tsconfig.json), an initial README with local run instructions, and core types plus a basic Gemini service for image generation.
140 lines
4.6 KiB
TypeScript
140 lines
4.6 KiB
TypeScript
import { useEffect, useRef, useState, useCallback } from 'react';
|
|
import { FaceLandmarker, FilesetResolver, DrawingUtils } from '@mediapipe/tasks-vision';
|
|
import { TrackingData } from '../types';
|
|
|
|
/**
 * React hook that drives a MediaPipe `FaceLandmarker` over a video element and
 * exposes a simplified pose/expression snapshot suitable for animating an avatar.
 *
 * The landmarker is created once on mount and closed on unmount. Tracking is
 * opt-in: call `startTracking()` to begin a `requestAnimationFrame` loop and
 * `stopTracking()` to end it. The loop keeps polling while the model is still
 * loading or the video has no frame yet, so it is safe to start early.
 *
 * @param videoElement The video to analyse, or `null` while it is not mounted.
 * @returns
 *  - `isLoading`: `true` until model initialisation has finished (or failed).
 *  - `isTracking`: `true` between `startTracking()` and `stopTracking()`.
 *  - `trackingData`: latest values computed from the most recent detected face.
 *  - `startTracking` / `stopTracking`: loop controls.
 */
export const useFaceTracking = (videoElement: HTMLVideoElement | null) => {
  const [isTracking, setIsTracking] = useState(false);
  const [isLoading, setIsLoading] = useState(true);
  const faceLandmarkerRef = useRef<FaceLandmarker | null>(null);
  const requestRef = useRef<number | null>(null);
  const lastVideoTimeRef = useRef<number>(-1);
  // Mirror of the `videoElement` argument so the long-lived animation loop
  // always reads the current element instead of the one captured at start.
  const videoRef = useRef<HTMLVideoElement | null>(videoElement);
  const [trackingData, setTrackingData] = useState<TrackingData>({
    rotationX: 0,
    rotationY: 0,
    rotationZ: 0,
    translationX: 0,
    translationY: 0,
    mouthOpen: 0,
    isBlinkingLeft: false,
    isBlinkingRight: false,
  });

  useEffect(() => {
    videoRef.current = videoElement;
  }, [videoElement]);

  // Initialize FaceLandmarker
  useEffect(() => {
    // Set by the cleanup so an initialisation that finishes after unmount
    // releases its landmarker instead of leaking it.
    let disposed = false;

    const initMediaPipe = async (): Promise<void> => {
      try {
        // Use specific version to match index.html import and prevent version mismatch
        const filesetResolver = await FilesetResolver.forVisionTasks(
          "https://cdn.jsdelivr.net/npm/@mediapipe/tasks-vision@0.10.18/wasm"
        );

        const landmarker = await FaceLandmarker.createFromOptions(filesetResolver, {
          baseOptions: {
            modelAssetPath:
              "https://storage.googleapis.com/mediapipe-models/face_landmarker/face_landmarker/float16/1/face_landmarker.task",
            delegate: "GPU",
          },
          outputFaceBlendshapes: true,
          outputFacialTransformationMatrixes: true,
          runningMode: "VIDEO",
          numFaces: 1,
        });

        if (disposed) {
          landmarker.close();
          return;
        }
        faceLandmarkerRef.current = landmarker;
      } catch (error: unknown) {
        console.error("Failed to load MediaPipe:", error);
      } finally {
        // Loading is over whether it succeeded or failed; skip the state
        // update if the component is already gone.
        if (!disposed) {
          setIsLoading(false);
        }
      }
    };

    void initMediaPipe();

    return () => {
      disposed = true;
      // Stop the loop first so no frame runs against a closed landmarker.
      if (requestRef.current !== null) {
        cancelAnimationFrame(requestRef.current);
        requestRef.current = null;
      }
      faceLandmarkerRef.current?.close();
      faceLandmarkerRef.current = null;
    };
  }, []);

  const predict = useCallback(() => {
    const landmarker = faceLandmarkerRef.current;
    const video = videoRef.current;

    // Only run detection when the model is loaded, the video has at least the
    // current frame available, and that frame has not been processed already.
    // When any of these is false we still fall through and reschedule, so the
    // loop survives a slow model load or a video that is not ready yet.
    if (
      landmarker &&
      video &&
      video.readyState >= HTMLMediaElement.HAVE_CURRENT_DATA &&
      lastVideoTimeRef.current !== video.currentTime
    ) {
      lastVideoTimeRef.current = video.currentTime;

      // performance.now() is monotonic; Date.now() can jump backwards when the
      // system clock is adjusted.
      const results = landmarker.detectForVideo(video, performance.now());
      const landmarks = results.faceLandmarks?.[0];

      if (landmarks) {
        // 1. Extract blendshapes for expression.
        const blendshapes = results.faceBlendshapes?.[0]?.categories;
        const scoreOf = (name: string): number =>
          blendshapes?.find((c) => c.categoryName === name)?.score ?? 0;

        const mouthOpen = scoreOf('jawOpen');
        const eyeBlinkLeft = scoreOf('eyeBlinkLeft');
        const eyeBlinkRight = scoreOf('eyeBlinkRight');

        // 2. Estimate pose (simplified).
        // MediaPipe also provides a transformation matrix, but for a 2D avatar
        // simple landmark deltas are used to approximate roll, yaw and pitch.
        const leftEye = landmarks[33]; // Outer eye corner (left side, per original comment)
        const rightEye = landmarks[263]; // Outer eye corner (right side, per original comment)
        const nose = landmarks[1];

        if (leftEye && rightEye && nose) {
          // Roll: angle of the line between the eye corners, in radians.
          const roll = Math.atan2(rightEye.y - leftEye.y, rightEye.x - leftEye.x);

          // Yaw / pitch: nose offset from the midpoint between the eyes,
          // doubled as a sensitivity factor. These are scaled offsets in
          // landmark coordinates, not true angles.
          // NOTE(review): the nose normally sits below the eye midpoint, so
          // pitch is presumably non-zero for a neutral pose — confirm consumers
          // account for that.
          const midPointX = (leftEye.x + rightEye.x) / 2;
          const midPointY = (leftEye.y + rightEye.y) / 2;
          const yaw = (nose.x - midPointX) * 2;
          const pitch = (nose.y - midPointY) * 2;

          // Translation: nose position relative to 0.5, doubled (assumes
          // landmark coordinates are normalised to the frame — TODO confirm).
          const transX = (nose.x - 0.5) * 2;
          const transY = (nose.y - 0.5) * 2;

          setTrackingData({
            rotationZ: roll,
            rotationY: yaw,
            rotationX: pitch,
            translationX: transX,
            translationY: transY,
            mouthOpen,
            isBlinkingLeft: eyeBlinkLeft > 0.5,
            isBlinkingRight: eyeBlinkRight > 0.5,
          });
        }
      }
    }

    requestRef.current = requestAnimationFrame(predict);
  }, []);

  const startTracking = useCallback(() => {
    // Cancel any frame already queued so repeated calls never stack loops.
    if (requestRef.current !== null) {
      cancelAnimationFrame(requestRef.current);
    }
    setIsTracking(true);
    requestRef.current = requestAnimationFrame(predict);
  }, [predict]);

  const stopTracking = useCallback(() => {
    setIsTracking(false);
    if (requestRef.current !== null) {
      cancelAnimationFrame(requestRef.current);
      requestRef.current = null;
    }
  }, []);

  return {
    isLoading,
    isTracking,
    trackingData,
    startTracking,
    stopTracking,
  };
};