feat: Initialize Gemini V-Studio project setup

Sets up the foundational project structure, including:
- Vite for build tooling.
- React for the UI.
- Tailwind CSS for styling.
- MediaPipe for face tracking capabilities.
- Gemini API integration for avatar generation.
- Basic configuration files (package.json, vite.config.ts, tsconfig.json).
- Initial README with local run instructions.
- Core types and a basic Gemini service for image generation.
This commit is contained in:
James Twose 2025-11-20 20:45:25 +01:00
parent bbd1a19152
commit b6017794a5
17 changed files with 1279 additions and 8 deletions

24
.gitignore vendored Normal file
View File

@ -0,0 +1,24 @@
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
pnpm-debug.log*
lerna-debug.log*
node_modules
dist
dist-ssr
*.local
# Editor directories and files
.vscode/*
!.vscode/extensions.json
.idea
.DS_Store
*.suo
*.ntvs*
*.njsproj
*.sln
*.sw?

128
App.tsx Normal file
View File

@ -0,0 +1,128 @@
import React, { useState } from 'react';
import { AppState, AvatarConfig, Rect } from './types';
import AvatarCreator from './components/AvatarCreator';
import RiggingEditor from './components/RiggingEditor';
import Studio from './components/Studio';
/** Landmark boxes and eyelid colour passed from avatar analysis into, and back out of, the rigging step. */
type RiggingData = { leftEye: Rect; rightEye: Rect; mouth: Rect; skinColor: string };

/** A generated image waiting to be rigged, plus the optional landmark guess that seeds the editor. */
type GeneratedAvatar = { url: string; name: string; initialData?: RiggingData };

/**
 * Top-level application shell.
 *
 * Drives a four-step flow stored in `appState`:
 * SETUP (landing page) -> CREATION (AvatarCreator) -> RIGGING (RiggingEditor) -> STUDIO (Studio).
 */
const App: React.FC = () => {
  const [appState, setAppState] = useState<AppState>(AppState.SETUP);
  // Temp storage for the generated image before rigging
  const [generatedData, setGeneratedData] = useState<GeneratedAvatar | null>(null);
  const [avatar, setAvatar] = useState<AvatarConfig | null>(null);

  /**
   * Prompts for an API key when the `window.aistudio` bridge is present and no key
   * has been selected yet, then moves to the creation step. Key-selection errors are
   * logged and the flow continues regardless.
   */
  const handleStartCreation = async () => {
    try {
      if (window.aistudio) {
        const hasKey = await window.aistudio.hasSelectedApiKey();
        if (!hasKey) {
          await window.aistudio.openSelectKey();
        }
      }
      setAppState(AppState.CREATION);
    } catch (error) {
      console.error("Error during API key selection:", error);
      setAppState(AppState.CREATION);
    }
  };

  /** Stores the freshly generated image (and optional landmark guess) and opens the rigging editor. */
  const handleAvatarGenerated = (url: string, name: string, initialData?: RiggingData) => {
    setGeneratedData({ url, name, initialData });
    setAppState(AppState.RIGGING);
  };

  /** Combines the generated image with the user-confirmed rig and enters the studio. */
  const handleRiggingComplete = (data: RiggingData) => {
    if (generatedData) {
      setAvatar({
        imageUrl: generatedData.url,
        name: generatedData.name,
        description: '',
        leftEye: data.leftEye,
        rightEye: data.rightEye,
        mouth: data.mouth,
        skinColor: data.skinColor
      });
      setAppState(AppState.STUDIO);
    }
  };

  return (
    <div className="min-h-screen bg-slate-900 text-white">
      {appState === AppState.SETUP && (
        <div className="container mx-auto px-4 py-12 flex flex-col items-center justify-center min-h-screen">
          <div className="text-center mb-12 space-y-4">
            <h1 className="text-6xl font-bold text-transparent bg-clip-text bg-gradient-to-r from-cyan-400 via-blue-500 to-purple-600 brand-font tracking-tighter">
              GEMINI V-STUDIO
            </h1>
            <p className="text-xl text-slate-400 max-w-2xl mx-auto">
              The next-generation browser-based VTuber studio. Generate your persona with AI and animate it with your face.
            </p>
            <button
              onClick={handleStartCreation}
              className="mt-8 px-8 py-4 bg-white text-slate-900 rounded-full font-bold hover:bg-cyan-50 transition-colors shadow-[0_0_20px_rgba(255,255,255,0.3)]"
            >
              Start Creation
            </button>
          </div>
          <div className="grid grid-cols-1 md:grid-cols-3 gap-8 w-full max-w-5xl">
            <div className="p-6 bg-slate-800/50 rounded-xl border border-slate-700 backdrop-blur-sm">
              <div className="h-12 w-12 bg-cyan-500/10 rounded-lg flex items-center justify-center mb-4 text-2xl"></div>
              <h3 className="text-xl font-bold mb-2">AI Generation</h3>
              <p className="text-slate-400">Describe your dream character. Gemini 3 Pro creates high-fidelity sprites in seconds.</p>
            </div>
            <div className="p-6 bg-slate-800/50 rounded-xl border border-slate-700 backdrop-blur-sm">
              <div className="h-12 w-12 bg-purple-500/10 rounded-lg flex items-center justify-center mb-4 text-2xl">📸</div>
              <h3 className="text-xl font-bold mb-2">Face Tracking</h3>
              <p className="text-slate-400">Powered by MediaPipe. No expensive equipment needed—just your webcam.</p>
            </div>
            <div className="p-6 bg-slate-800/50 rounded-xl border border-slate-700 backdrop-blur-sm">
              <div className="h-12 w-12 bg-pink-500/10 rounded-lg flex items-center justify-center mb-4 text-2xl">🎥</div>
              <h3 className="text-xl font-bold mb-2">Live Animation</h3>
              <p className="text-slate-400">Your avatar mimics your head movements and speech in real-time.</p>
            </div>
          </div>
        </div>
      )}
      {appState === AppState.CREATION && (
        <div className="container mx-auto px-4 py-12 min-h-screen flex flex-col">
          <button
            onClick={() => setAppState(AppState.SETUP)}
            className="self-start mb-8 px-4 py-2 text-slate-400 hover:text-white transition-colors"
          >
            Back to Home
          </button>
          <div className="flex-1 flex items-center justify-center">
            <AvatarCreator onAvatarGenerated={handleAvatarGenerated} />
          </div>
        </div>
      )}
      {appState === AppState.RIGGING && generatedData && (
        <div className="container mx-auto px-4 py-8 min-h-screen flex flex-col">
          <button
            onClick={() => setAppState(AppState.CREATION)}
            className="self-start mb-4 px-4 py-2 text-slate-400 hover:text-white transition-colors"
          >
            Back to Generator
          </button>
          <RiggingEditor
            imageUrl={generatedData.url}
            initialData={generatedData.initialData}
            onComplete={handleRiggingComplete}
          />
        </div>
      )}
      {appState === AppState.STUDIO && avatar && (
        <Studio
          avatar={avatar}
          onBack={() => setAppState(AppState.SETUP)}
        />
      )}
    </div>
  );
};
export default App;

View File

@ -1,11 +1,20 @@
<div align="center">
<img width="1200" height="475" alt="GHBanner" src="https://github.com/user-attachments/assets/0aa67016-6eaf-458a-adb2-6e31a0763ed6" />
<h1>Built with AI Studio</h1>
<p>The fastest path from prompt to production with Gemini.</p>
<a href="https://aistudio.google.com/apps">Start building</a>
</div>
# Run and deploy your AI Studio app
This contains everything you need to run your app locally.
View your app in AI Studio: https://ai.studio/apps/drive/1Di9b15uKTFXVof4InO8oefefCDaW9Q26
## Run Locally
**Prerequisites:** Node.js
1. Install dependencies:
`npm install`
2. Set the `GEMINI_API_KEY` in [.env.local](.env.local) to your Gemini API key
3. Run the app:
`npm run dev`

View File

@ -0,0 +1,113 @@
import React, { useState } from 'react';
import { generateAvatarImage } from '../services/geminiService';
import { analyzeAvatarImage } from '../services/visionService';
import LoadingSpinner from './LoadingSpinner';
import { Rect } from '../types';
/** Props accepted by the AvatarCreator component. */
interface AvatarCreatorProps {
/**
 * Called once an image has been generated. `url` is the generated image,
 * `name` is the model name typed by the user, and `initialData` carries the
 * analysed feature boxes and skin colour when the analysis returned a result.
 */
onAvatarGenerated: (url: string, name: string, initialData?: { leftEye: Rect, rightEye: Rect, mouth: Rect, skinColor: string }) => void;
}
/**
 * Form that collects a model name and description, generates an avatar image,
 * runs a best-effort landmark analysis on it, and hands the result to the parent.
 *
 * The landmark analysis only provides an initial guess, so a failure there never
 * discards an image that was generated successfully.
 */
const AvatarCreator: React.FC<AvatarCreatorProps> = ({ onAvatarGenerated }) => {
  const [prompt, setPrompt] = useState('');
  const [name, setName] = useState('');
  const [status, setStatus] = useState<'idle' | 'generating' | 'analyzing'>('idle');
  const [error, setError] = useState<string | null>(null);

  const isBusy = status !== 'idle';
  // Single source of truth for both the `disabled` attribute and the button styling.
  const isSubmitDisabled = isBusy || !prompt || !name;

  const handleGenerate = async () => {
    if (!prompt || !name) return;
    setStatus('generating');
    setError(null);
    try {
      // 1. Generate Image
      const imageUrl = await generateAvatarImage(prompt);

      // 2. Analyze Image for Landmarks (Initial guess)
      setStatus('analyzing');
      let analysisData: Awaited<ReturnType<typeof analyzeAvatarImage>> | undefined;
      try {
        analysisData = await analyzeAvatarImage(imageUrl);
      } catch (analysisError) {
        // The image itself is fine; continue without an initial landmark guess.
        console.warn("Avatar analysis failed; continuing without initial landmarks:", analysisError);
      }

      // 3. Pass to parent (to go to Rigging)
      if (analysisData) {
        onAvatarGenerated(imageUrl, name, analysisData);
      } else {
        onAvatarGenerated(imageUrl, name);
      }
    } catch (err) {
      console.error(err);
      setError("Failed to generate avatar. Please try again.");
    } finally {
      setStatus('idle');
    }
  };

  return (
    <div className="max-w-2xl mx-auto bg-slate-800/50 backdrop-blur-lg border border-slate-700 p-8 rounded-2xl shadow-2xl">
      <div className="text-center mb-8">
        <h2 className="text-3xl font-bold text-transparent bg-clip-text bg-gradient-to-r from-cyan-400 to-purple-500 mb-2">
          Design Your Avatar
        </h2>
        <p className="text-slate-400">
          Describe your dream VTuber model and let Gemini bring it to life.
        </p>
      </div>
      <div className="space-y-6">
        <div>
          <label className="block text-sm font-medium text-slate-300 mb-2">Model Name</label>
          <input
            type="text"
            value={name}
            onChange={(e) => setName(e.target.value)}
            placeholder="e.g., Neon Kitsune"
            className="w-full bg-slate-900/50 border border-slate-600 rounded-xl px-4 py-3 text-white placeholder-slate-500 focus:ring-2 focus:ring-cyan-500 focus:border-transparent transition-all outline-none"
          />
        </div>
        <div>
          <label className="block text-sm font-medium text-slate-300 mb-2">Description</label>
          <textarea
            value={prompt}
            onChange={(e) => setPrompt(e.target.value)}
            placeholder="e.g., A cyberpunk anime girl with neon blue hair, glowing headphones, wearing a futuristic jacket..."
            className="w-full h-32 bg-slate-900/50 border border-slate-600 rounded-xl px-4 py-3 text-white placeholder-slate-500 focus:ring-2 focus:ring-cyan-500 focus:border-transparent transition-all outline-none resize-none"
          />
        </div>
        {error && (
          <div className="p-3 bg-red-500/20 border border-red-500/50 rounded-lg text-red-200 text-sm">
            {error}
          </div>
        )}
        <button
          onClick={handleGenerate}
          disabled={isSubmitDisabled}
          className={`w-full py-4 rounded-xl font-bold text-lg transition-all duration-200 ${
            isSubmitDisabled
              ? 'bg-slate-700 text-slate-500 cursor-not-allowed'
              : 'bg-gradient-to-r from-cyan-500 to-blue-600 hover:from-cyan-400 hover:to-blue-500 text-white shadow-lg shadow-cyan-500/25 transform hover:scale-[1.02]'
          }`}
        >
          {isBusy ? (
            <div className="flex items-center justify-center gap-3">
              <LoadingSpinner />
              <span>{status === 'generating' ? 'Dreaming up Avatar...' : 'Analyzing Features...'}</span>
            </div>
          ) : (
            <div className="flex items-center justify-center gap-2">
              <span>Generate Model</span>
              <svg xmlns="http://www.w3.org/2000/svg" className="h-5 w-5" viewBox="0 0 20 20" fill="currentColor">
                <path fillRule="evenodd" d="M10 18a8 8 0 100-16 8 8 0 000 16zm3.707-8.707l-3-3a1 1 0 00-1.414 1.414L10.586 9H7a1 1 0 100 2h3.586l-1.293 1.293a1 1 0 101.414 1.414l3-3a1 1 0 000-1.414z" clipRule="evenodd" />
              </svg>
            </div>
          )}
        </button>
      </div>
    </div>
  );
};
export default AvatarCreator;

View File

@ -0,0 +1,11 @@
import React from 'react';
/** Class list and animation stagger for each bouncing dot, in render order. */
const SPINNER_DOTS = [
  { className: 'w-4 h-4 bg-cyan-500 rounded-full animate-bounce', delay: '0s' },
  { className: 'w-4 h-4 bg-purple-500 rounded-full animate-bounce', delay: '0.1s' },
  { className: 'w-4 h-4 bg-pink-500 rounded-full animate-bounce', delay: '0.2s' },
];

/** Three coloured dots that bounce with a staggered delay; takes no props. */
const LoadingSpinner: React.FC = () => (
  <div className="flex justify-center items-center space-x-2">
    {SPINNER_DOTS.map(({ className, delay }) => (
      <div key={delay} className={className} style={{ animationDelay: delay }}></div>
    ))}
  </div>
);
export default LoadingSpinner;

View File

@ -0,0 +1,224 @@
import React, { useState, useRef, useEffect } from 'react';
import { Rect } from '../types';
/** Props accepted by the RiggingEditor component. */
interface RiggingEditorProps {
/** Source of the avatar image the feature boxes are drawn over. */
imageUrl: string;
/** Optional starting boxes and skin colour; the editor falls back to built-in defaults for missing values. */
initialData?: { leftEye: Rect; rightEye: Rect; mouth: Rect; skinColor: string };
/** Invoked with the current boxes and skin colour when the user clicks "Finish Rigging". */
onComplete: (data: { leftEye: Rect; rightEye: Rect; mouth: Rect; skinColor: string }) => void;
}
/** Which feature box was most recently activated (mouse-down), or null when none has been. */
type ActiveFeature = 'leftEye' | 'rightEye' | 'mouth' | null;
/** Smallest width/height (as a fraction of the parent) a box may be resized to. */
const MIN_BOX_SIZE = 0.01;

/** Clamps `value` into [lo, hi]; if `hi` is below `lo` the result is `lo`. */
const clampToRange = (value: number, lo: number, hi: number): number =>
  Math.min(Math.max(value, lo), Math.max(lo, hi));

/**
 * A draggable, resizable rectangle overlaid on its parent element.
 *
 * `rect` is expressed as fractions of the parent's size (x/y/w/h multiplied by 100
 * become CSS percentages). Dragging the body moves the box; dragging the
 * bottom-right handle resizes it. Updates are reported through `onUpdate` and are
 * clamped so the box always stays inside the parent.
 */
const ResizableBox: React.FC<{
  rect: Rect;
  color: string;
  label: string;
  isActive: boolean;
  onUpdate: (rect: Rect) => void;
  onActivate: () => void;
}> = ({ rect, color, label, isActive, onUpdate, onActivate }) => {
  const boxRef = useRef<HTMLDivElement>(null);
  const [isDragging, setIsDragging] = useState(false);
  const [isResizing, setIsResizing] = useState(false);
  // Pointer position and rect captured at mouse-down; all deltas are measured from these.
  const startPos = useRef({ x: 0, y: 0 });
  const startRect = useRef<Rect>({ x: 0, y: 0, w: 0, h: 0 });

  const handleMouseDown = (e: React.MouseEvent) => {
    e.preventDefault();
    e.stopPropagation();
    onActivate();
    setIsDragging(true);
    startPos.current = { x: e.clientX, y: e.clientY };
    startRect.current = { ...rect };
  };

  const handleResizeDown = (e: React.MouseEvent) => {
    e.preventDefault();
    // Stop the event reaching the box body, which would start a drag instead.
    e.stopPropagation();
    onActivate();
    setIsResizing(true);
    startPos.current = { x: e.clientX, y: e.clientY };
    startRect.current = { ...rect };
  };

  useEffect(() => {
    // Listeners are only needed while a gesture is in progress.
    if (!isDragging && !isResizing) return undefined;

    const handleMouseMove = (e: MouseEvent) => {
      const parent = boxRef.current?.parentElement;
      if (!parent) return;
      const parentRect = parent.getBoundingClientRect();
      // A collapsed parent would turn the deltas into Infinity/NaN.
      if (parentRect.width === 0 || parentRect.height === 0) return;

      const deltaX = (e.clientX - startPos.current.x) / parentRect.width;
      const deltaY = (e.clientY - startPos.current.y) / parentRect.height;
      const origin = startRect.current;

      if (isDragging) {
        onUpdate({
          ...origin,
          x: clampToRange(origin.x + deltaX, 0, 1 - origin.w),
          y: clampToRange(origin.y + deltaY, 0, 1 - origin.h),
        });
      } else {
        onUpdate({
          ...origin,
          w: clampToRange(origin.w + deltaX, MIN_BOX_SIZE, 1 - origin.x),
          h: clampToRange(origin.h + deltaY, MIN_BOX_SIZE, 1 - origin.y),
        });
      }
    };

    const handleMouseUp = () => {
      setIsDragging(false);
      setIsResizing(false);
    };

    // Listen on window so the gesture keeps working when the pointer leaves the box.
    window.addEventListener('mousemove', handleMouseMove);
    window.addEventListener('mouseup', handleMouseUp);
    return () => {
      window.removeEventListener('mousemove', handleMouseMove);
      window.removeEventListener('mouseup', handleMouseUp);
    };
  }, [isDragging, isResizing, onUpdate]);

  return (
    <div
      ref={boxRef}
      onMouseDown={handleMouseDown}
      className={`absolute border-2 cursor-move group transition-colors ${isActive ? 'z-20' : 'z-10'}`}
      style={{
        left: `${rect.x * 100}%`,
        top: `${rect.y * 100}%`,
        width: `${rect.w * 100}%`,
        height: `${rect.h * 100}%`,
        borderColor: color,
        backgroundColor: isActive ? `${color}20` : 'transparent',
      }}
    >
      {/* Label */}
      <div
        className="absolute -top-6 left-0 text-xs font-bold px-1 rounded text-white whitespace-nowrap"
        style={{ backgroundColor: color }}
      >
        {label}
      </div>
      {/* Resize Handle */}
      <div
        onMouseDown={handleResizeDown}
        className="absolute bottom-0 right-0 w-4 h-4 bg-white border-2 cursor-nwse-resize opacity-0 group-hover:opacity-100 transition-opacity"
        style={{ borderColor: color }}
      />
    </div>
  );
};
/**
 * Rigging step: overlays three adjustable boxes (left eye, right eye, mouth) on the
 * avatar image and lets the user pick an eyelid colour. Clicking "Finish Rigging"
 * reports the current boxes and colour through `onComplete`.
 */
const RiggingEditor: React.FC<RiggingEditorProps> = ({ imageUrl, initialData, onComplete }) => {
  const [leftEye, setLeftEye] = useState<Rect>(initialData?.leftEye || { x: 0.35, y: 0.4, w: 0.12, h: 0.08 });
  const [rightEye, setRightEye] = useState<Rect>(initialData?.rightEye || { x: 0.53, y: 0.4, w: 0.12, h: 0.08 });
  const [mouth, setMouth] = useState<Rect>(initialData?.mouth || { x: 0.45, y: 0.6, w: 0.1, h: 0.05 });
  const [skinColor, setSkinColor] = useState<string>(initialData?.skinColor || '#fcd3bf');
  const [activeFeature, setActiveFeature] = useState<ActiveFeature>(null);

  // One entry per editable feature, in stacking/render order.
  const featureBoxes = [
    { id: 'leftEye', rect: leftEye, update: setLeftEye, color: '#ef4444', label: 'Left Eye' }, // Red
    { id: 'rightEye', rect: rightEye, update: setRightEye, color: '#3b82f6', label: 'Right Eye' }, // Blue
    { id: 'mouth', rect: mouth, update: setMouth, color: '#22c55e', label: 'Mouth' }, // Green
  ] as const;

  // Colour key shown in the sidebar; dot classes correspond to the box colours above.
  const legendRows = [
    { dotClass: 'w-3 h-3 bg-red-500 rounded-full', caption: 'Left Eye Box' },
    { dotClass: 'w-3 h-3 bg-blue-500 rounded-full', caption: 'Right Eye Box' },
    { dotClass: 'w-3 h-3 bg-green-500 rounded-full', caption: 'Mouth Box' },
  ];

  const submitRig = () => onComplete({ leftEye, rightEye, mouth, skinColor });

  return (
    <div className="flex flex-col items-center h-full max-w-4xl mx-auto p-4">
      <div className="text-center mb-6">
        <h2 className="text-2xl font-bold text-white mb-2">Rig Your Avatar</h2>
        <p className="text-slate-400">
          Drag and resize the boxes to match your avatar's features.
          This ensures the eyes blink correctly.
        </p>
      </div>
      <div className="flex gap-8 w-full items-start">
        {/* Editor Area */}
        <div className="flex-1 bg-slate-800 p-4 rounded-xl border border-slate-700 flex justify-center">
          <div className="relative inline-block select-none" style={{ width: '500px', maxWidth: '100%' }}>
            <img
              src={imageUrl}
              alt="Rigging Target"
              className="w-full h-auto rounded-lg pointer-events-none select-none block"
              draggable={false}
            />
            {featureBoxes.map((feature) => (
              <ResizableBox
                key={feature.id}
                rect={feature.rect}
                color={feature.color}
                label={feature.label}
                isActive={activeFeature === feature.id}
                onUpdate={feature.update}
                onActivate={() => setActiveFeature(feature.id)}
              />
            ))}
          </div>
        </div>
        {/* Sidebar Controls */}
        <div className="w-64 flex flex-col gap-6 bg-slate-800/50 p-6 rounded-xl border border-slate-700 h-full">
          <div>
            <label className="block text-sm font-medium text-slate-300 mb-2">Eyelid Color</label>
            <div className="flex items-center gap-3">
              <input
                type="color"
                value={skinColor}
                onChange={(event) => setSkinColor(event.target.value)}
                className="w-10 h-10 rounded cursor-pointer border-0 p-0"
              />
              <span className="text-xs text-slate-400 font-mono">{skinColor}</span>
            </div>
            <p className="text-xs text-slate-500 mt-2">
              Pick the color of the skin above the eyes for realistic blinking.
            </p>
          </div>
          <div className="space-y-2">
            {legendRows.map(({ dotClass, caption }) => (
              <div key={caption} className="flex items-center gap-2 text-sm text-slate-300">
                <div className={dotClass}></div>
                <span>{caption}</span>
              </div>
            ))}
          </div>
          <div className="mt-auto pt-6">
            <button
              onClick={submitRig}
              className="w-full py-3 bg-gradient-to-r from-cyan-500 to-blue-600 hover:from-cyan-400 hover:to-blue-500 text-white rounded-xl font-bold shadow-lg shadow-cyan-500/25 transform hover:scale-[1.02] transition-all"
            >
              Finish Rigging
            </button>
          </div>
        </div>
      </div>
    </div>
  );
};
export default RiggingEditor;

259
components/Studio.tsx Normal file
View File

@ -0,0 +1,259 @@
import React, { useEffect, useRef, useState } from 'react';
import { useFaceTracking } from '../hooks/useFaceTracking';
import { AvatarConfig } from '../types';
/** Props accepted by the Studio component. */
interface StudioProps {
/** Rigged avatar to display: image, name, feature boxes and skin colour. */
avatar: AvatarConfig;
/** Invoked when the user clicks "Exit Studio". */
onBack: () => void;
}
/**
 * Live studio view: opens the webcam, feeds it to the face-tracking hook and
 * animates the rigged avatar (head pose, eyelids, mouth) from the tracking data.
 *
 * The camera stream and the tracking loop are both released when the component
 * unmounts, including when the camera permission resolves after unmount.
 */
const Studio: React.FC<StudioProps> = ({ avatar, onBack }) => {
  const videoRef = useRef<HTMLVideoElement>(null);
  const [cameraReady, setCameraReady] = useState(false);

  // We use the custom hook to get tracking data.
  // videoRef.current is null on the first render; the re-render triggered by
  // setCameraReady is what hands the mounted <video> element to the hook.
  const { trackingData, isLoading: isModelLoading, startTracking, stopTracking } = useFaceTracking(videoRef.current);

  // Initialize Camera
  useEffect(() => {
    // Capture the element now: videoRef.current is likely null by the time cleanup runs.
    const videoEl = videoRef.current;
    let cancelled = false;
    let stream: MediaStream | null = null;

    const startCamera = async () => {
      try {
        const acquired = await navigator.mediaDevices.getUserMedia({
          video: { width: 640, height: 480 }, // Lower res is fine for tracking
          audio: false
        });
        if (cancelled) {
          // Unmounted while the permission prompt was open: release the camera immediately.
          acquired.getTracks().forEach(track => track.stop());
          return;
        }
        stream = acquired;
        if (videoEl) {
          videoEl.srcObject = acquired;
          videoEl.onloadeddata = () => {
            setCameraReady(true);
          };
        }
      } catch (err) {
        if (cancelled) return;
        console.error("Error accessing camera:", err);
        alert("Could not access camera. Please ensure permissions are granted.");
      }
    };
    startCamera();

    return () => {
      // Cleanup stream
      cancelled = true;
      stream?.getTracks().forEach(track => track.stop());
      if (videoEl) {
        videoEl.onloadeddata = null;
        videoEl.srcObject = null;
      }
    };
  }, []);

  // Start tracking when both camera and model are ready; stop when either changes or on unmount
  useEffect(() => {
    if (!cameraReady || isModelLoading) return undefined;
    startTracking();
    return stopTracking;
  }, [cameraReady, isModelLoading, startTracking, stopTracking]);

  // Calculate styles based on tracking data
  const getAvatarStyle = () => {
    // Deadzone for jitter reduction
    const smooth = (val: number) => Math.abs(val) < 0.02 ? 0 : val;
    const rX = smooth(trackingData.rotationX); // Pitch
    const rY = smooth(trackingData.rotationY); // Yaw
    const rZ = smooth(trackingData.rotationZ); // Roll
    const tX = smooth(trackingData.translationX);
    const tY = smooth(trackingData.translationY);
    // Bounce effect on mouth open (Speaking emulation)
    const bounce = trackingData.mouthOpen > 0.1 ? -5 * trackingData.mouthOpen : 0;
    return {
      transform: `
translate(${tX * 150}px, ${tY * 100 + bounce}px)
rotate(${rZ * 1}rad)
perspective(500px)
rotateX(${rX * 15}deg)
rotateY(${rY * -25}deg)
scale(${1 + trackingData.mouthOpen * 0.02})
`,
      filter: `brightness(${1 + trackingData.mouthOpen * 0.05})`, // Slight flash when speaking
      transition: 'transform 0.1s ease-out, filter 0.1s ease'
    };
  };

  return (
    <div className="h-screen w-full flex flex-col bg-slate-900 overflow-hidden relative">
      {/* Hidden Video Element for Tracking */}
      <video
        ref={videoRef}
        autoPlay
        playsInline
        muted
        className="absolute opacity-0 pointer-events-none w-1 h-1"
      />
      {/* Top Bar */}
      <div className="absolute top-0 left-0 right-0 z-20 p-4 flex justify-between items-center bg-gradient-to-b from-slate-900 to-transparent">
        <button
          onClick={onBack}
          className="px-4 py-2 bg-slate-800/80 hover:bg-slate-700 backdrop-blur rounded-lg text-white font-medium transition-colors border border-slate-600"
        >
          Exit Studio
        </button>
        <div className="flex gap-2">
          <div className={`px-3 py-1 rounded-full text-xs font-bold flex items-center gap-2 ${isModelLoading ? 'bg-yellow-500/20 text-yellow-400' : 'bg-green-500/20 text-green-400'}`}>
            <span className={`w-2 h-2 rounded-full ${isModelLoading ? 'bg-yellow-400 animate-pulse' : 'bg-green-400'}`}></span>
            {isModelLoading ? 'Loading Vision Model...' : 'Tracking Active'}
          </div>
          <div className="px-3 py-1 rounded-full text-xs font-bold bg-purple-500/20 text-purple-400 border border-purple-500/30">
            {avatar.name}
          </div>
        </div>
      </div>
      {/* Main Stage */}
      <div className="flex-1 relative flex items-center justify-center overflow-hidden">
        {/* Background Grid/Effect */}
        <div className="absolute inset-0 opacity-20"
          style={{
            backgroundImage: 'radial-gradient(#4f46e5 1px, transparent 1px)',
            backgroundSize: '30px 30px'
          }}>
        </div>
        <div className="absolute inset-0 bg-gradient-to-t from-slate-900 via-transparent to-slate-900 pointer-events-none"></div>
        {/* Avatar Container */}
        <div className="relative w-[600px] h-[600px] flex items-center justify-center z-10">
          <div
            className="relative w-full h-full flex items-center justify-center"
            style={getAvatarStyle()}
          >
            <img
              src={avatar.imageUrl}
              alt="Avatar"
              className="w-full h-full object-contain drop-shadow-[0_0_15px_rgba(168,85,247,0.5)]"
            />
            {/* Dynamic Eyelids */}
            {avatar.leftEye && avatar.skinColor && (
              <div
                className="absolute pointer-events-none"
                style={{
                  left: `${avatar.leftEye.x * 100}%`,
                  top: `${avatar.leftEye.y * 100}%`,
                  width: `${avatar.leftEye.w * 100}%`,
                  height: `${avatar.leftEye.h * 100}%`,
                  backgroundColor: avatar.skinColor,
                  transform: `scaleY(${trackingData.isBlinkingLeft ? 1 : 0})`,
                  transformOrigin: 'top',
                  transition: 'transform 0.1s cubic-bezier(0.4, 0, 0.2, 1)', // Snappy blink
                  borderRadius: '0 0 40% 40%'
                }}
              />
            )}
            {avatar.rightEye && avatar.skinColor && (
              <div
                className="absolute pointer-events-none"
                style={{
                  left: `${avatar.rightEye.x * 100}%`,
                  top: `${avatar.rightEye.y * 100}%`,
                  width: `${avatar.rightEye.w * 100}%`,
                  height: `${avatar.rightEye.h * 100}%`,
                  backgroundColor: avatar.skinColor,
                  transform: `scaleY(${trackingData.isBlinkingRight ? 1 : 0})`,
                  transformOrigin: 'top',
                  transition: 'transform 0.1s cubic-bezier(0.4, 0, 0.2, 1)', // Snappy blink
                  borderRadius: '0 0 40% 40%'
                }}
              />
            )}
            {/* Dynamic Mouth Animation */}
            {avatar.mouth && (
              <div
                className="absolute pointer-events-none flex items-center justify-center z-10"
                style={{
                  left: `${avatar.mouth.x * 100}%`,
                  top: `${avatar.mouth.y * 100}%`,
                  width: `${avatar.mouth.w * 100}%`,
                  height: `${avatar.mouth.h * 100}%`,
                }}
              >
                {/* Skin Patch - Hides the static closed mouth when speaking */}
                <div
                  className="absolute w-[120%] h-[120%] transition-opacity duration-75"
                  style={{
                    backgroundColor: avatar.skinColor || '#fcd3bf',
                    opacity: trackingData.mouthOpen > 0.1 ? 1 : 0,
                    filter: 'blur(3px)', // Blends edges
                    borderRadius: '40%'
                  }}
                />
                {/* Mouth Interior - Scales based on mouth openness */}
                <div
                  className="relative w-full h-full bg-[#4a1212] border-2 border-[#2d0a0a] overflow-hidden origin-center transition-transform duration-75"
                  style={{
                    borderRadius: '50% 50% 50% 50% / 50% 50% 30% 30%', // Slightly more jaw-like shape
                    // trackingData.mouthOpen is 0-1. We amplify it for better visuals.
                    transform: `scaleY(${Math.min(1.2, trackingData.mouthOpen * 4)}) scaleX(${0.9 + trackingData.mouthOpen * 0.1})`,
                    opacity: trackingData.mouthOpen > 0.05 ? 1 : 0,
                  }}
                >
                  {/* Tongue */}
                  <div
                    className="absolute bottom-[-20%] left-1/2 -translate-x-1/2 w-[80%] h-[60%] bg-[#d45d5d] rounded-t-full"
                  />
                </div>
              </div>
            )}
          </div>
          {/* Optional: Status Indicator overlay if tracking is lost (all 0s usually) or just visual flair */}
          {(!cameraReady) && (
            <div className="absolute inset-0 flex items-center justify-center bg-slate-900/80 z-20 rounded-xl backdrop-blur-sm">
              <div className="text-cyan-400 animate-pulse font-mono">INITIALIZING CAMERA LINK...</div>
            </div>
          )}
        </div>
      </div>
      {/* Control Deck */}
      <div className="h-24 bg-slate-800 border-t border-slate-700 p-4 flex justify-center items-center gap-6 z-20">
        <div className="flex flex-col items-center">
          <span className="text-xs text-slate-400 mb-1 font-mono">MOUTH</span>
          <div className="w-24 h-2 bg-slate-700 rounded-full overflow-hidden">
            <div className="h-full bg-cyan-400 transition-all duration-75" style={{ width: `${Math.min(trackingData.mouthOpen * 100, 100)}%` }}></div>
          </div>
        </div>
        <div className="flex flex-col items-center">
          <span className="text-xs text-slate-400 mb-1 font-mono">HEAD ROLL</span>
          <div className="w-24 h-2 bg-slate-700 rounded-full overflow-hidden flex justify-center relative">
            {/* Center marker */}
            <div className="absolute w-[1px] h-full bg-slate-500 left-1/2"></div>
            <div
              className="h-full bg-purple-500 transition-all duration-75 absolute"
              style={{
                width: `${Math.abs(trackingData.rotationZ * 50)}%`,
                left: trackingData.rotationZ < 0 ? 'auto' : '50%',
                right: trackingData.rotationZ < 0 ? '50%' : 'auto'
              }}
            ></div>
          </div>
        </div>
        <div className="flex flex-col items-center">
          <span className="text-xs text-slate-400 mb-1 font-mono">BLINK</span>
          <div className="flex gap-2">
            <div className={`w-8 h-2 rounded-full ${trackingData.isBlinkingLeft ? 'bg-pink-500' : 'bg-slate-700'}`}></div>
            <div className={`w-8 h-2 rounded-full ${trackingData.isBlinkingRight ? 'bg-pink-500' : 'bg-slate-700'}`}></div>
          </div>
        </div>
      </div>
    </div>
  );
};
export default Studio;

140
hooks/useFaceTracking.ts Normal file
View File

@ -0,0 +1,140 @@
import { useEffect, useRef, useState, useCallback } from 'react';
import { FaceLandmarker, FilesetResolver, DrawingUtils } from '@mediapipe/tasks-vision';
import { TrackingData } from '../types';
/**
 * React hook that runs MediaPipe FaceLandmarker on a video element once per
 * animation frame and exposes the derived head pose, mouth openness and blink flags.
 *
 * @param videoElement Video to analyse; while it is null `startTracking` has no lasting effect.
 * @returns `isLoading` (true until model loading has finished or failed), `isTracking`,
 *          the latest `trackingData`, and `startTracking` / `stopTracking` controls.
 */
export const useFaceTracking = (videoElement: HTMLVideoElement | null) => {
  const [isTracking, setIsTracking] = useState(false);
  const [isLoading, setIsLoading] = useState(true);
  const faceLandmarkerRef = useRef<FaceLandmarker | null>(null);
  // Id of the pending animation frame, or null when no loop is scheduled.
  const requestRef = useRef<number | null>(null);
  const lastVideoTimeRef = useRef<number>(-1);
  const [trackingData, setTrackingData] = useState<TrackingData>({
    rotationX: 0,
    rotationY: 0,
    rotationZ: 0,
    translationX: 0,
    translationY: 0,
    mouthOpen: 0,
    isBlinkingLeft: false,
    isBlinkingRight: false,
  });

  // Initialize FaceLandmarker
  useEffect(() => {
    let disposed = false;

    const initMediaPipe = async () => {
      try {
        // Use specific version to match index.html import and prevent version mismatch
        const filesetResolver = await FilesetResolver.forVisionTasks(
          "https://cdn.jsdelivr.net/npm/@mediapipe/tasks-vision@0.10.18/wasm"
        );
        const landmarker = await FaceLandmarker.createFromOptions(filesetResolver, {
          baseOptions: {
            modelAssetPath: `https://storage.googleapis.com/mediapipe-models/face_landmarker/face_landmarker/float16/1/face_landmarker.task`,
            delegate: "GPU"
          },
          outputFaceBlendshapes: true,
          outputFacialTransformationMatrixes: true,
          runningMode: "VIDEO",
          numFaces: 1
        });
        if (disposed) {
          // The effect was cleaned up while the model was loading: release it right away.
          landmarker.close();
          return;
        }
        faceLandmarkerRef.current = landmarker;
      } catch (error) {
        console.error("Failed to load MediaPipe:", error);
      } finally {
        if (!disposed) setIsLoading(false);
      }
    };
    initMediaPipe();

    return () => {
      disposed = true;
      // Stop the frame loop before closing the model so no frame runs against a closed instance.
      if (requestRef.current !== null) {
        cancelAnimationFrame(requestRef.current);
        requestRef.current = null;
      }
      faceLandmarkerRef.current?.close();
      faceLandmarkerRef.current = null;
    };
  }, []);

  const predict = useCallback(() => {
    const landmarker = faceLandmarkerRef.current;
    if (!landmarker || !videoElement) {
      // Nothing to run against; end the loop until startTracking is called again.
      requestRef.current = null;
      setIsTracking(false);
      return;
    }

    // Only predict if video is ready and has advanced to a new frame.
    // A not-yet-ready video skips this frame instead of ending the loop.
    if (videoElement.readyState >= 2 && lastVideoTimeRef.current !== videoElement.currentTime) {
      lastVideoTimeRef.current = videoElement.currentTime;
      // performance.now() is monotonic, unlike Date.now().
      const results = landmarker.detectForVideo(videoElement, performance.now());
      const landmarks = results.faceLandmarks?.[0];

      if (landmarks) {
        // 1. Extract Blendshapes for Expression
        const blendshapes = results.faceBlendshapes?.[0]?.categories;
        const scoreOf = (categoryName: string): number =>
          blendshapes?.find(c => c.categoryName === categoryName)?.score ?? 0;
        const mouthOpen = scoreOf('jawOpen');
        const eyeBlinkLeft = scoreOf('eyeBlinkLeft');
        const eyeBlinkRight = scoreOf('eyeBlinkRight');

        // 2. Estimate Pose (simplified)
        // MediaPipe gives a matrix, but often for 2D avatars, simple landmark delta is cleaner.
        // We use specific landmarks to calculate roll, yaw, pitch approximation.
        const leftEye = landmarks[33]; // Outer left eye
        const rightEye = landmarks[263]; // Outer right eye
        const nose = landmarks[1];

        if (leftEye && rightEye && nose) {
          // Roll: Angle between eyes
          const roll = Math.atan2(rightEye.y - leftEye.y, rightEye.x - leftEye.x);
          // Yaw: Nose offset from center of eyes
          const midPointX = (leftEye.x + rightEye.x) / 2;
          const yaw = (nose.x - midPointX) * 2; // sensitivity
          // Pitch: Nose offset vertical
          const midPointY = (leftEye.y + rightEye.y) / 2;
          const pitch = (nose.y - midPointY) * 2;
          // Translation
          const transX = (nose.x - 0.5) * 2;
          const transY = (nose.y - 0.5) * 2;

          setTrackingData({
            rotationZ: roll,
            rotationY: yaw,
            rotationX: pitch,
            translationX: transX,
            translationY: transY,
            mouthOpen,
            isBlinkingLeft: eyeBlinkLeft > 0.5,
            isBlinkingRight: eyeBlinkRight > 0.5
          });
        }
      }
    }

    requestRef.current = requestAnimationFrame(predict);
  }, [videoElement]);

  const startTracking = useCallback(() => {
    // Replace any loop that is already running rather than stacking a second one.
    if (requestRef.current !== null) {
      cancelAnimationFrame(requestRef.current);
    }
    setIsTracking(true);
    requestRef.current = requestAnimationFrame(predict);
  }, [predict]);

  const stopTracking = useCallback(() => {
    setIsTracking(false);
    if (requestRef.current !== null) {
      cancelAnimationFrame(requestRef.current);
      requestRef.current = null;
    }
  }, []);

  return {
    isLoading,
    isTracking,
    trackingData,
    startTracking,
    stopTracking
  };
};

45
index.html Normal file
View File

@ -0,0 +1,45 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Gemini V-Studio</title>
<script src="https://cdn.tailwindcss.com"></script>
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=Space+Grotesk:wght@400;700&display=swap" rel="stylesheet">
<style>
body {
font-family: 'Inter', sans-serif;
background-color: #0f172a;
color: #f8fafc;
}
h1, h2, h3, .brand-font {
font-family: 'Space Grotesk', sans-serif;
}
/* Slim, dark-themed scrollbar for a cleaner UI */
::-webkit-scrollbar {
width: 8px;
}
::-webkit-scrollbar-track {
background: #0f172a;
}
::-webkit-scrollbar-thumb {
background: #334155;
border-radius: 4px;
}
</style>
<script type="importmap">
{
"imports": {
"react/": "https://aistudiocdn.com/react@^19.2.0/",
"react": "https://aistudiocdn.com/react@^19.2.0",
"react-dom/": "https://aistudiocdn.com/react-dom@^19.2.0/",
"@google/genai": "https://aistudiocdn.com/@google/genai@^1.30.0",
"@mediapipe/tasks-vision": "https://cdn.jsdelivr.net/npm/@mediapipe/tasks-vision@0.10.18/+esm"
}
}
</script>
</head>
<body>
<div id="root"></div>
</body>
</html>

15
index.tsx Normal file
View File

@ -0,0 +1,15 @@
import React from 'react';
import ReactDOM from 'react-dom/client';
import App from './App';
// Entry point: find the #root mount node and render <App /> into it under StrictMode.
const mountNode = document.getElementById('root');

// Without a mount node there is nothing to render into, so fail loudly.
if (mountNode === null) {
  throw new Error("Could not find root element to mount to");
}

ReactDOM.createRoot(mountNode).render(
  <React.StrictMode>
    <App />
  </React.StrictMode>
);

7
metadata.json Normal file
View File

@ -0,0 +1,7 @@
{
"name": "Gemini V-Studio",
"description": "Create your own VTuber avatar using Gemini 3 Pro and animate it in real-time using MediaPipe face tracking.",
"requestFramePermissions": [
"camera"
]
}

23
package.json Normal file
View File

@ -0,0 +1,23 @@
{
"name": "gemini-v-studio",
"private": true,
"version": "0.0.0",
"type": "module",
"scripts": {
"dev": "vite",
"build": "vite build",
"preview": "vite preview"
},
"dependencies": {
"react": "^19.2.0",
"react-dom": "^19.2.0",
"@google/genai": "^1.30.0",
"@mediapipe/tasks-vision": "0.10.18"
},
"devDependencies": {
"@types/node": "^22.14.0",
"@vitejs/plugin-react": "^5.0.0",
"typescript": "~5.8.2",
"vite": "^6.2.0"
}
}

53
services/geminiService.ts Normal file
View File

@ -0,0 +1,53 @@
import { GoogleGenAI } from "@google/genai";
/**
 * Generates a VTuber avatar image based on user description.
 * Uses gemini-3-pro-image-preview for high quality.
 *
 * @param description Free-text character description inserted into the image prompt.
 * @returns A `data:` URL containing the first inline image found in the response.
 * @throws Error("No image data found in response") when the response has no candidates
 *         or none of its parts carries inline image data; errors from the API call are
 *         logged and rethrown unchanged.
 */
export const generateAvatarImage = async (description: string): Promise<string> => {
  try {
    // Initialize client inside the function to ensure we use the most up-to-date API key
    // after the user has completed the selection flow.
    const ai = new GoogleGenAI({ apiKey: process.env.API_KEY });

    // We construct a prompt that encourages a good format for a 2D avatar (front facing, clean background)
    const prompt = `
Create a high-quality, flat 2D anime or stylized character illustration suitable for a VTuber avatar.
The character should be facing forward (front view).
The background should be a solid, single color (white or bright green) to allow for easy removal or masking.
Character Description: ${description}
Style: Vibrant, clean lines, detailed eyes.
Focus: Head and shoulders only.
`;

    const response = await ai.models.generateContent({
      model: 'gemini-3-pro-image-preview',
      contents: {
        parts: [
          { text: prompt }
        ]
      },
      config: {
        imageConfig: {
          aspectRatio: "1:1",
          imageSize: "1K"
        }
      }
    });

    // Parse response for image data. Candidates, content and parts can all be absent
    // (for example when a response is blocked), so fall through to the explicit error below.
    const parts = response.candidates?.[0]?.content?.parts ?? [];
    for (const part of parts) {
      const base64EncodeString = part.inlineData?.data;
      if (base64EncodeString) {
        // Use the MIME type reported with the data; fall back to PNG when it is missing.
        const mimeType = part.inlineData?.mimeType ?? 'image/png';
        return `data:${mimeType};base64,${base64EncodeString}`;
      }
    }
    throw new Error("No image data found in response");
  } catch (error) {
    console.error("Error generating avatar:", error);
    throw error;
  }
};

128
services/visionService.ts Normal file
View File

@ -0,0 +1,128 @@
import { FaceLandmarker, FilesetResolver } from '@mediapipe/tasks-vision';
import { Rect } from '../types';
// Module-level FaceLandmarker instance, created on first use and reused afterwards.
let faceLandmarker: FaceLandmarker | null = null;

/**
 * Creates the FaceLandmarker used for static image analysis, unless one already exists.
 * Initialization failures are logged and leave `faceLandmarker` as null; callers must check it.
 */
const initVision = async () => {
  if (faceLandmarker !== null) {
    return;
  }

  try {
    const visionFileset = await FilesetResolver.forVisionTasks(
      "https://cdn.jsdelivr.net/npm/@mediapipe/tasks-vision@0.10.18/wasm"
    );

    const landmarkerOptions = {
      baseOptions: {
        modelAssetPath: "https://storage.googleapis.com/mediapipe-models/face_landmarker/face_landmarker/float16/1/face_landmarker.task",
        delegate: "GPU" as const
      },
      runningMode: "IMAGE" as const,
      numFaces: 1
    };

    faceLandmarker = await FaceLandmarker.createFromOptions(visionFileset, landmarkerOptions);
  } catch (e) {
    console.error("Failed to initialize vision service:", e);
  }
};
/** Result of a successful avatar analysis. */
type AvatarAnalysis = { leftEye: Rect; rightEye: Rect; mouth: Rect; skinColor: string };

/** Minimal view of a landmark: only the coordinates this module reads. */
type LandmarkPoint = { x: number; y: number };

// MediaPipe Mesh Indices
const LEFT_EYE_INDICES: readonly number[] = [33, 7, 163, 144, 145, 153, 154, 155, 133, 173, 157, 158, 159, 160, 161, 246];
const RIGHT_EYE_INDICES: readonly number[] = [362, 382, 381, 380, 374, 373, 390, 249, 263, 466, 388, 387, 386, 385, 384, 398];
const MOUTH_INDICES: readonly number[] = [61, 185, 40, 39, 37, 0, 267, 269, 270, 409, 291, 375, 321, 405, 314, 17, 84, 181, 91, 146];

// Landmark 123 is on the left cheek bone area
const CHEEK_SAMPLE_INDEX = 123;
// Skin colour reported when the canvas or the sample point is unavailable.
const DEFAULT_SKIN_COLOR = '#fcd3bf';
// Each rect is grown by this fraction of its size on every side.
const RECT_PADDING_RATIO = 0.1;

/** Computes the padded bounding box of the given landmark indices, in landmark coordinates. */
const computeLandmarkRect = (landmarks: readonly LandmarkPoint[], indices: readonly number[]): Rect => {
  // Seed with +/-Infinity so landmarks lying outside the [0, 1] range are still bounded correctly.
  let minX = Infinity;
  let minY = Infinity;
  let maxX = -Infinity;
  let maxY = -Infinity;

  for (const index of indices) {
    const point = landmarks[index];
    if (!point) continue; // index not present in this landmark set
    minX = Math.min(minX, point.x);
    maxX = Math.max(maxX, point.x);
    minY = Math.min(minY, point.y);
    maxY = Math.max(maxY, point.y);
  }

  // No usable landmark at all: return an empty rect instead of an infinite one.
  if (!Number.isFinite(minX) || !Number.isFinite(minY)) {
    return { x: 0, y: 0, w: 0, h: 0 };
  }

  const w = maxX - minX;
  const h = maxY - minY;
  // Expand slightly to cover the area comfortably
  const paddingX = w * RECT_PADDING_RATIO;
  const paddingY = h * RECT_PADDING_RATIO;
  return {
    x: minX - paddingX,
    y: minY - paddingY,
    w: w + (paddingX * 2),
    h: h + (paddingY * 2),
  };
};

/** Reads the pixel under the cheek landmark and returns it as `#rrggbb` (for input type="color"). */
const sampleSkinColor = (img: HTMLImageElement, landmarks: readonly LandmarkPoint[]): string => {
  const canvas = document.createElement('canvas');
  canvas.width = img.width;
  canvas.height = img.height;
  const ctx = canvas.getContext('2d');
  const cheek = landmarks[CHEEK_SAMPLE_INDEX];
  if (!ctx || !cheek) return DEFAULT_SKIN_COLOR;

  ctx.drawImage(img, 0, 0);
  // Landmarks are fractions of the image size; scale to pixel coordinates.
  const px = Math.floor(cheek.x * img.width);
  const py = Math.floor(cheek.y * img.height);
  if (px < 0 || px >= img.width || py < 0 || py >= img.height) {
    return DEFAULT_SKIN_COLOR;
  }

  const pixel = ctx.getImageData(px, py, 1, 1).data;
  const toHex = (channel: number): string => channel.toString(16).padStart(2, '0');
  // Only R, G, B are used; alpha is ignored.
  return `#${Array.from(pixel.subarray(0, 3), toHex).join('')}`;
};

/**
 * Detects the face in an avatar image and derives the rigging regions and a skin colour.
 *
 * @param imageUrl URL (or data URL) of the image to analyse; loaded with `crossOrigin = "anonymous"`.
 * @returns Padded bounding rects for both eyes and the mouth (in the landmark coordinate space,
 *          i.e. fractions of the image size) plus a `#rrggbb` skin colour, or `null` when the
 *          vision model could not be initialised, no face was detected, the image failed to
 *          load, or detection threw. Failures are logged, never thrown.
 */
export const analyzeAvatarImage = async (imageUrl: string): Promise<{ leftEye: Rect, rightEye: Rect, mouth: Rect, skinColor: string } | null> => {
  try {
    await initVision();
    if (!faceLandmarker) return null;
    // Capture the narrowed instance so the image callback needs no non-null assertion.
    const landmarker = faceLandmarker;

    // `await` is essential: without it a rejection (image load error, detect() throwing)
    // would escape this try/catch and reject the caller instead of yielding null.
    return await new Promise<AvatarAnalysis | null>((resolve, reject) => {
      const img = new Image();
      img.crossOrigin = "anonymous";
      img.onload = () => {
        try {
          const result = landmarker.detect(img);
          const landmarks = result.faceLandmarks?.[0];
          if (!landmarks) {
            console.warn("No face detected in generated image");
            resolve(null);
            return;
          }

          resolve({
            leftEye: computeLandmarkRect(landmarks, LEFT_EYE_INDICES),
            rightEye: computeLandmarkRect(landmarks, RIGHT_EYE_INDICES),
            mouth: computeLandmarkRect(landmarks, MOUTH_INDICES),
            skinColor: sampleSkinColor(img, landmarks)
          });
        } catch (e) {
          reject(e);
        }
      };
      img.onerror = () => reject(new Error("Failed to load image for analysis"));
      img.src = imageUrl;
    });
  } catch (error) {
    console.error("Analysis failed", error);
    return null;
  }
};

29
tsconfig.json Normal file
View File

@ -0,0 +1,29 @@
{
"compilerOptions": {
"target": "ES2022",
"experimentalDecorators": true,
"useDefineForClassFields": false,
"module": "ESNext",
"lib": [
"ES2022",
"DOM",
"DOM.Iterable"
],
"skipLibCheck": true,
"types": [
"node"
],
"moduleResolution": "bundler",
"isolatedModules": true,
"moduleDetection": "force",
"allowJs": true,
"jsx": "react-jsx",
"paths": {
"@/*": [
"./*"
]
},
"allowImportingTsExtensions": true,
"noEmit": true
}
}

40
types.ts Normal file
View File

@ -0,0 +1,40 @@
/**
 * Top-level stages of the application, stored as string values.
 * App.tsx starts in SETUP and switches to CREATION once the API-key selection step has run.
 * NOTE(review): RIGGING and STUDIO presumably correspond to the RiggingEditor and Studio
 * components — confirm against App.tsx.
 */
export enum AppState {
SETUP = 'SETUP',
CREATION = 'CREATION',
RIGGING = 'RIGGING',
STUDIO = 'STUDIO',
}
/**
 * Axis-aligned rectangle: origin (x, y) plus width and height.
 * Rects produced by analyzeAvatarImage are in the face-landmark coordinate space
 * (fractions of the image size, not pixels) and may extend slightly past it due to padding.
 */
export interface Rect {
x: number;
y: number;
w: number;
h: number;
}
/**
 * Description of an avatar: its image, identifying text and optional rigging regions.
 * The optional fields mirror the result shape of analyzeAvatarImage in services/visionService.ts.
 */
export interface AvatarConfig {
// URL of the avatar image (generateAvatarImage returns a base64 data URL).
imageUrl: string;
name: string;
description: string;
// Regions of the image covering each facial feature; absent until set.
leftEye?: Rect;
rightEye?: Rect;
mouth?: Rect;
// Colour string; analyzeAvatarImage produces it in `#rrggbb` form.
skinColor?: string;
}
/**
 * Face-tracking values: head rotation, head translation, mouth openness and per-eye blink flags.
 * NOTE(review): units and value ranges are not defined in this file — confirm against the
 * code that produces these values.
 */
export interface TrackingData {
rotationX: number; // Pitch
rotationY: number; // Yaw
rotationZ: number; // Roll
translationX: number;
translationY: number;
mouthOpen: number;
isBlinkingLeft: boolean;
isBlinkingRight: boolean;
}
/**
 * API-key selection methods that App.tsx calls on `window.aistudio` when that object exists.
 */
export interface AIStudio {
// Resolves to whether an API key has already been selected.
hasSelectedApiKey(): Promise<boolean>;
// Asks the user to select a key; App.tsx awaits it when no key is selected yet.
openSelectKey(): Promise<void>;
}

23
vite.config.ts Normal file
View File

@ -0,0 +1,23 @@
import path from 'path';
import { defineConfig, loadEnv } from 'vite';
import react from '@vitejs/plugin-react';
/**
 * Vite configuration: dev server on port 3000 bound to all interfaces, the React plugin,
 * an `@` alias for the project root, and the Gemini API key exposed to client code.
 */
export default defineConfig(({ mode }) => {
  // Empty prefix: load every variable for this mode from the project root, not only VITE_* ones.
  const { GEMINI_API_KEY } = loadEnv(mode, '.', '');

  // `define` substitutes these expressions as text at build time, so the value is serialized once.
  // Note that the key therefore ends up inside the client bundle.
  const serializedApiKey = JSON.stringify(GEMINI_API_KEY);

  const projectRoot = path.resolve(__dirname, '.');

  return {
    server: { port: 3000, host: '0.0.0.0' },
    plugins: [react()],
    define: {
      'process.env.API_KEY': serializedApiKey,
      'process.env.GEMINI_API_KEY': serializedApiKey,
    },
    resolve: {
      alias: { '@': projectRoot },
    },
  };
});