#!/bin/bash
#
# Generate metadata for .webm files with the OpenAI Responses API and append
# the results to a CSV file (webm_metadata.csv) inside the target directory.
#
# Requires: dotenv-cli, jq, curl
# Usage:    ./webm_metadata.sh [directory_path]
#
# The API key is read from a .env file that lives next to this script
# (OPENAI_API_KEY=your_key). Files already listed in the CSV are skipped, so
# the script can be re-run safely after an interruption.

set -euo pipefail

# Directory where the script itself is located (the .env file lives here).
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"

# Target directory (defaults to the current directory), as an absolute path.
TARGET_DIR="${1:-.}"
if [[ ! -d "$TARGET_DIR" ]]; then
  echo "Error: target directory not found: $TARGET_DIR" >&2
  exit 1
fi
TARGET_DIR="$(cd -- "$TARGET_DIR" && pwd)"

# Check dependencies.
command -v dotenv >/dev/null 2>&1 || {
  echo "Error: dotenv-cli is required. Install with: npm install -g dotenv-cli" >&2
  exit 1
}
command -v jq >/dev/null 2>&1 || {
  echo "Error: jq is required. Install with: apt-get install jq or brew install jq" >&2
  exit 1
}
command -v curl >/dev/null 2>&1 || {
  echo "Error: curl is required." >&2
  exit 1
}

# The .env file must exist in the script directory.
ENV_FILE="$SCRIPT_DIR/.env"
if [[ ! -f "$ENV_FILE" ]]; then
  echo "Error: .env file not found at $ENV_FILE" >&2
  echo "Please create one with OPENAI_API_KEY=your_key" >&2
  exit 1
fi

echo "Script directory: $SCRIPT_DIR"
echo "Target directory: $TARGET_DIR"
echo "Looking for .env at: $ENV_FILE"
echo ""

# Configuration. DELAY_SECONDS may be overridden from the environment.
CSV_FILE="$TARGET_DIR/webm_metadata.csv"
DELAY_SECONDS="${DELAY_SECONDS:-1}"

#######################################
# Quote a value as a single CSV field.
# Arguments: $1 - raw field value
# Outputs:   the value wrapped in double quotes, embedded quotes doubled
#######################################
escape_csv() {
  local field="$1"
  local dq='"'
  # printf instead of echo: values such as "-n" or "-e" must pass through
  # unchanged, which echo does not guarantee.
  printf '"%s"\n' "${field//$dq/$dq$dq}"
}

#######################################
# Check whether a filename already has a row in the CSV file.
# Globals:   CSV_FILE (read)
# Arguments: $1 - filename (basename, as stored in the first CSV column)
# Returns:   0 if a row starts with the escaped filename, 1 otherwise
#######################################
is_processed() {
  local filename="$1"
  local needle

  [[ -f "$CSV_FILE" ]] || return 1

  # Compare against exactly what process_file writes (the escaped field plus
  # the separator) as a literal prefix, so regex metacharacters or quotes in
  # the filename cannot produce false matches or misses.
  needle="$(escape_csv "$filename"),"
  NEEDLE="$needle" awk '
    index($0, ENVIRON["NEEDLE"]) == 1 { found = 1; exit }
    END { exit !found }
  ' "$CSV_FILE"
}

#######################################
# Ask the API for metadata describing one file and append it to the CSV.
# Globals:   OPENAI_API_KEY (read), CSV_FILE (appended to)
# Arguments: $1 - filename (basename only; the file content is never read)
# Outputs:   progress on stdout, diagnostics on stderr
# Returns:   0 on success, 1 on any request/response/parsing failure
#######################################
process_file() {
  local filename="$1"

  echo "Processing: $filename"

  # System prompt text.
  local system_prompt="You are an AI assistant that receives a meme filename. Based on the filename, your task is to generate detailed metadata describing the meme in a structured format.

The metadata should include these fields:

- **type**: either \`video\` or \`image\` depending on the file extension.
- **sub_type**: a classification such as \`background\` or \`overlay\` (choose \`overlay\` if it seems like an edit or reaction video/image).
- **name**: a concise, title-cased name derived from the filename (remove file extension and normalize, without the Meme word if exist).
- **description**: a short paragraph describing the meme content and context inferred from the name, and reaction
- **keywords**: a comma-separated list of relevant keywords extracted from the filename or meme context.
- **media_1_mime_type**: the MIME type derived from the file extension (e.g., \`video/webm\`, \`image/png\`).

Return the output as a JSON object containing these fields. Use null for any unknown or missing values.

---

#### Example input:
\`7th element oiiaa cat.webm\`

#### Example output:
\`\`\`json
{
  \"type\": \"video\",
  \"sub_type\": \"overlay\",
  \"name\": \"7th Element Oiiaa Cat\",
  \"description\": \"a cat edited to mimic or react to Vitas' bizarre vocals from the viral \\\"7th Element\\\" performance.\",
  \"keywords\": \"cat, oiiaa, 7th element, vitas, webm, funny\",
  \"media_1_mime_type\": \"video/webm\"
}
\`\`\`"

  # Few-shot example: one user/assistant pair placed BEFORE the real request.
  local example_filename="angry cat.webm"
  local assistant_example="\`\`\`json
{
  \"type\": \"video\",
  \"sub_type\": \"overlay\",
  \"name\": \"Angry Cat\",
  \"description\": \"A video meme featuring an angry-looking cat, typically used to express frustration, annoyance, or irritation in a humorous way.\",
  \"keywords\": \"angry, cat, reaction, meme, webm, annoyed, frustration\",
  \"media_1_mime_type\": \"video/webm\"
}
\`\`\`"

  # Build the payload with jq so every string is JSON-escaped correctly.
  # The real filename is the LAST message; an assistant turn after it would
  # read as an already-given answer to the real request.
  local json_payload
  if ! json_payload=$(jq -n \
    --arg system_prompt "$system_prompt" \
    --arg example_filename "$example_filename" \
    --arg assistant_example "$assistant_example" \
    --arg filename "$filename" \
    '{
      "model": "gpt-4.1-nano",
      "input": [
        {
          "role": "system",
          "content": [{ "type": "input_text", "text": $system_prompt }]
        },
        {
          "role": "user",
          "content": [{ "type": "input_text", "text": $example_filename }]
        },
        {
          "role": "assistant",
          "content": [{ "type": "output_text", "text": $assistant_example }]
        },
        {
          "role": "user",
          "content": [{ "type": "input_text", "text": $filename }]
        }
      ],
      "text": { "format": { "type": "json_object" } },
      "reasoning": {},
      "tools": [],
      "temperature": 1,
      "max_output_tokens": 2048,
      "top_p": 1,
      "store": true
    }'); then
    echo "Error: Could not build request payload for $filename" >&2
    return 1
  fi

  # Make the API call. No --fail: error responses carry a JSON body whose
  # .error member is inspected below. -S keeps transport errors visible.
  local response
  if ! response=$(curl -sS "https://api.openai.com/v1/responses" \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer $OPENAI_API_KEY" \
    -d "$json_payload"); then
    echo "Error: Request failed for $filename" >&2
    return 1
  fi

  # Check for API errors (a successful response has "error": null, which
  # jq -e treats as false).
  if jq -e '.error' >/dev/null 2>&1 <<<"$response"; then
    echo "API Error for $filename:" >&2
    jq '.error' <<<"$response" >&2
    return 1
  fi

  # Extract the generated text: prefer the first output_text part of a
  # message item (the message is not guaranteed to be output[0]); fall back
  # to the fixed path used previously.
  local response_text
  response_text=$(jq -r '
    ( [ .output[]?
        | select(.type == "message")
        | .content[]?
        | select(.type == "output_text")
        | .text ]
      | first )
    // .output[0].content[0].text
    // empty
  ' <<<"$response" 2>/dev/null) || response_text=""

  if [[ -z "$response_text" ]]; then
    echo "Error: Empty response for $filename" >&2
    return 1
  fi

  # With JSON mode enabled, the response text should be direct JSON.
  local json_content="$response_text"

  # It must be a JSON object, otherwise the field lookups below would fail.
  if ! jq -e 'type == "object"' >/dev/null 2>&1 <<<"$json_content"; then
    echo "Error: Invalid JSON response for $filename" >&2
    echo "Content: $json_content" >&2
    return 1
  fi

  # Assemble the CSV row: filename first, then each metadata field in header
  # order. Missing/null fields become "unknown".
  local csv_line field value
  csv_line=$(escape_csv "$filename")
  for field in type sub_type name description keywords media_1_mime_type; do
    if ! value=$(jq -r --arg f "$field" '.[$f] // "unknown"' <<<"$json_content"); then
      echo "Error: Could not read field '$field' for $filename" >&2
      return 1
    fi
    csv_line+=",$(escape_csv "$value")"
  done

  printf '%s\n' "$csv_line" >>"$CSV_FILE"

  echo "✓ Successfully processed: $filename"
}

#######################################
# Main loop, run inside the dotenv environment: process every .webm file in
# TARGET_DIR that is not yet listed in the CSV, then print a summary.
# Globals:   OPENAI_API_KEY, CSV_FILE, DELAY_SECONDS, TARGET_DIR (read)
# Returns:   1 if the API key is missing, 0 otherwise
#######################################
run_batch() {
  if [[ -z "${OPENAI_API_KEY:-}" ]]; then
    echo "Error: OPENAI_API_KEY not found in environment variables" >&2
    return 1
  fi

  echo "✓ OpenAI API key loaded successfully"
  echo "CSV file will be: $CSV_FILE"
  echo "Delay between requests: ${DELAY_SECONDS}s"
  echo ""

  # Create CSV with headers if it doesn't exist.
  if [[ ! -f "$CSV_FILE" ]]; then
    echo "filename,type,sub_type,name,description,keywords,media_1_mime_type" >"$CSV_FILE"
    echo "Created $CSV_FILE with headers"
  fi

  echo "Starting processing of .webm files..."

  # Collect regular files via globbing (never by parsing ls output).
  local -a webm_files=()
  local candidate
  shopt -s nullglob
  for candidate in "$TARGET_DIR"/*.webm; do
    if [[ -f "$candidate" ]]; then
      webm_files+=("$candidate")
    fi
  done
  shopt -u nullglob

  local total_files=${#webm_files[@]}
  local processed_count=0
  local skipped_count=0
  local error_count=0

  if [[ "$total_files" -eq 0 ]]; then
    echo "No .webm files found in $TARGET_DIR"
    return 0
  fi

  echo "Found $total_files .webm files"

  local webm_file filename total_done
  for webm_file in "${webm_files[@]}"; do
    # Only the basename is sent to the API and stored in the CSV.
    filename=$(basename -- "$webm_file")

    if is_processed "$filename"; then
      echo "⏭️ Skipping (already processed): $filename"
      skipped_count=$((skipped_count + 1))
      continue
    fi

    if process_file "$filename"; then
      processed_count=$((processed_count + 1))
    else
      echo "❌ Failed to process: $filename"
      error_count=$((error_count + 1))
    fi

    # Progress update.
    total_done=$((processed_count + skipped_count + error_count))
    echo "Progress: $total_done/$total_files (Processed: $processed_count, Skipped: $skipped_count, Errors: $error_count)"
    echo ""

    # Rate limiting delay (skipped after the last file).
    if [[ "$total_done" -lt "$total_files" ]]; then
      echo "Waiting ${DELAY_SECONDS}s before next request..."
      sleep "$DELAY_SECONDS"
    fi
  done

  echo "===================="
  echo "Processing complete!"
  echo "Total files: $total_files"
  echo "Newly processed: $processed_count"
  echo "Already processed (skipped): $skipped_count"
  echo "Errors: $error_count"
  echo "CSV file: $CSV_FILE"
  echo "===================="
}

# Export functions and settings so they are available in the bash process
# that dotenv starts.
export -f is_processed
export -f escape_csv
export -f process_file
export -f run_batch
export CSV_FILE
export DELAY_SECONDS
export TARGET_DIR

# Change to the script directory so dotenv can find the .env file.
cd -- "$SCRIPT_DIR"

# Run the main processing logic with the .env variables loaded.
dotenv -- bash -c 'run_batch'