vtube-studio/services/visionService.ts
James Twose b6017794a5 feat: Initialize Gemini V-Studio project setup
Sets up the foundational project structure, including:
- Vite for build tooling.
- React for the UI.
- Tailwind CSS for styling.
- MediaPipe for face tracking capabilities.
- Gemini API integration for avatar generation.
- Basic configuration files (package.json, vite.config.ts, tsconfig.json).
- Initial README with local run instructions.
- Core types and a basic Gemini service for image generation.
2025-11-20 20:45:25 +01:00

129 lines
4.5 KiB
TypeScript

import { FaceLandmarker, FilesetResolver } from '@mediapipe/tasks-vision';
import { Rect } from '../types';
// Module-level cache for the MediaPipe face landmarker. Starts as null and is
// assigned by initVision once the model has been created successfully.
let faceLandmarker: FaceLandmarker | null = null;
/**
 * Lazily creates the shared face landmarker used for static image analysis.
 *
 * Does nothing when a landmarker has already been created. Any error raised
 * while fetching the WASM fileset or building the model is logged with
 * `console.error` and swallowed, so callers must check `faceLandmarker`
 * afterwards rather than rely on an exception.
 */
const initVision = async () => {
  if (!faceLandmarker) {
    try {
      const wasmFileset = await FilesetResolver.forVisionTasks(
        "https://cdn.jsdelivr.net/npm/@mediapipe/tasks-vision@0.10.18/wasm"
      );
      // Single still image, single face: matches how the analysis code calls detect().
      const landmarkerOptions = {
        baseOptions: {
          modelAssetPath: `https://storage.googleapis.com/mediapipe-models/face_landmarker/face_landmarker/float16/1/face_landmarker.task`,
          delegate: "GPU"
        },
        runningMode: "IMAGE",
        numFaces: 1
      } as const;
      faceLandmarker = await FaceLandmarker.createFromOptions(wasmFileset, landmarkerOptions);
    } catch (initError) {
      console.error("Failed to initialize vision service:", initError);
    }
  }
};
// MediaPipe face-mesh landmark indices outlining each facial feature.
const LEFT_EYE_LANDMARKS: readonly number[] = [33, 7, 163, 144, 145, 153, 154, 155, 133, 173, 157, 158, 159, 160, 161, 246];
const RIGHT_EYE_LANDMARKS: readonly number[] = [362, 382, 381, 380, 374, 373, 390, 249, 263, 466, 388, 387, 386, 385, 384, 398];
const MOUTH_LANDMARKS: readonly number[] = [61, 185, 40, 39, 37, 0, 267, 269, 270, 409, 291, 375, 321, 405, 314, 17, 84, 181, 91, 146];
// Landmark 123 is on the left cheek bone area; it is the skin-colour sample point.
const SKIN_SAMPLE_LANDMARK = 123;
// Colour reported when the image pixels cannot be sampled.
const FALLBACK_SKIN_COLOR = '#fcd3bf';
// Fraction of the feature's width/height added on each side of its bounding box.
const RECT_PADDING_RATIO = 0.1;

/** Loads `url` into a detached image element, rejecting if the load fails. */
const loadImageForAnalysis = (url: string): Promise<HTMLImageElement> =>
  new Promise((resolve, reject) => {
    const img = new Image();
    // Request CORS-enabled loading so the pixels can be read back from a canvas.
    img.crossOrigin = "anonymous";
    img.onload = () => resolve(img);
    img.onerror = () => reject(new Error("Failed to load image for analysis"));
    img.src = url;
  });

/** Computes the padded bounding box (in landmark coordinates) of the given landmark indices. */
const paddedLandmarkBounds = (
  landmarks: readonly { x: number; y: number }[],
  indices: readonly number[]
): Rect => {
  // Seed with +/-Infinity rather than 1/0 so landmarks lying outside the
  // normalized [0, 1] range still produce a tight box instead of one that is
  // stretched to the 0 or 1 edge.
  let minX = Infinity;
  let minY = Infinity;
  let maxX = -Infinity;
  let maxY = -Infinity;
  for (const index of indices) {
    const point = landmarks[index];
    if (!point) throw new Error(`Missing face landmark ${index}`);
    if (point.x < minX) minX = point.x;
    if (point.x > maxX) maxX = point.x;
    if (point.y < minY) minY = point.y;
    if (point.y > maxY) maxY = point.y;
  }
  const w = maxX - minX;
  const h = maxY - minY;
  // Expand slightly to cover the area comfortably.
  const paddingX = w * RECT_PADDING_RATIO;
  const paddingY = h * RECT_PADDING_RATIO;
  return {
    x: minX - paddingX,
    y: minY - paddingY,
    w: w + paddingX * 2,
    h: h + paddingY * 2,
  };
};

/** Reads the pixel under the cheek landmark and returns it as `#rrggbb`, or the fallback colour. */
const sampleSkinColor = (
  img: HTMLImageElement,
  landmarks: readonly { x: number; y: number }[]
): string => {
  const canvas = document.createElement('canvas');
  canvas.width = img.width;
  canvas.height = img.height;
  const ctx = canvas.getContext('2d');
  const cheek = landmarks[SKIN_SAMPLE_LANDMARK];
  if (!ctx || !cheek) return FALLBACK_SKIN_COLOR;

  ctx.drawImage(img, 0, 0);
  // Landmark coordinates are scaled by the image size to get pixel coordinates.
  const px = Math.floor(cheek.x * img.width);
  const py = Math.floor(cheek.y * img.height);
  const insideImage = px >= 0 && px < img.width && py >= 0 && py < img.height;
  if (!insideImage) return FALLBACK_SKIN_COLOR;

  // Convert rgb to hex for input type="color" (alpha channel is dropped).
  const rgb = ctx.getImageData(px, py, 1, 1).data.slice(0, 3);
  return `#${Array.from(rgb, (channel) => channel.toString(16).padStart(2, '0')).join('')}`;
};

/**
 * Detects the face in an avatar image and reports where its eyes and mouth
 * are, plus a skin colour sampled from the cheek.
 *
 * @param imageUrl URL of the image to analyse; it is loaded with
 *   `crossOrigin = "anonymous"`.
 * @returns Padded bounding boxes for both eyes and the mouth (in the
 *   landmark coordinate space returned by the detector) and a `#rrggbb`
 *   skin colour, or `null` when the vision model is unavailable, no face is
 *   detected, or any step of the analysis fails. Failures are logged, never
 *   thrown.
 */
export const analyzeAvatarImage = async (imageUrl: string): Promise<{ leftEye: Rect, rightEye: Rect, mouth: Rect, skinColor: string } | null> => {
  try {
    await initVision();
    // Copy into a const so the null check below stays valid without `!`.
    const landmarker = faceLandmarker;
    if (!landmarker) return null;

    // Awaiting inside the try block is what lets the catch below handle a
    // failed image load; returning the bare promise would let the rejection
    // escape to the caller.
    const img = await loadImageForAnalysis(imageUrl);

    const landmarks = landmarker.detect(img).faceLandmarks?.[0];
    if (!landmarks) {
      console.warn("No face detected in generated image");
      return null;
    }

    return {
      leftEye: paddedLandmarkBounds(landmarks, LEFT_EYE_LANDMARKS),
      rightEye: paddedLandmarkBounds(landmarks, RIGHT_EYE_LANDMARKS),
      mouth: paddedLandmarkBounds(landmarks, MOUTH_LANDMARKS),
      skinColor: sampleSkinColor(img, landmarks),
    };
  } catch (error) {
    console.error("Analysis failed", error);
    return null;
  }
};