#!/bin/bash
#
# Process .webm files and generate metadata for each one using the OpenAI API.
#
# Requires: dotenv-cli, jq, curl
# Usage:    ./webm_metadata.sh [directory_path]
# Env:      a .env file next to this script providing OPENAI_API_KEY
#           DELAY_SECONDS (optional) - pause between API requests, default 1

# Exit on errors, on unset variables and on failures inside pipelines.
set -euo pipefail

# Directory where the script itself is located (the .env file lives here).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Target directory (defaults to the current directory), as an absolute path.
TARGET_DIR="${1:-.}"
if [[ ! -d "$TARGET_DIR" ]]; then
  echo "Error: target directory not found: $TARGET_DIR" >&2
  exit 1
fi
TARGET_DIR="$(cd -- "$TARGET_DIR" && pwd)"

# Check dependencies up front so a missing tool fails before any API call.
command -v dotenv >/dev/null 2>&1 || { echo "Error: dotenv-cli is required. Install with: npm install -g dotenv-cli" >&2; exit 1; }
command -v jq >/dev/null 2>&1 || { echo "Error: jq is required. Install with: apt-get install jq or brew install jq" >&2; exit 1; }
command -v curl >/dev/null 2>&1 || { echo "Error: curl is required." >&2; exit 1; }

# The .env file must exist in the script directory.
ENV_FILE="$SCRIPT_DIR/.env"
if [[ ! -f "$ENV_FILE" ]]; then
  echo "Error: .env file not found at $ENV_FILE" >&2
  echo "Please create one with OPENAI_API_KEY=your_key" >&2
  exit 1
fi

echo "Script directory: $SCRIPT_DIR"
echo "Target directory: $TARGET_DIR"
echo "Looking for .env at: $ENV_FILE"
echo ""

# Configuration
CSV_FILE="$TARGET_DIR/webm_metadata.csv"
# Seconds to wait between API requests; may be overridden from the environment.
DELAY_SECONDS="${DELAY_SECONDS:-1}"

#######################################
# Report whether a filename already has a row in the CSV file.
# The filename is compared literally (no regex interpretation) against the
# start of each line, using the same quoting that escape_csv produces:
# wrapped in double quotes, embedded double quotes doubled.
# Globals:   CSV_FILE (read)
# Arguments: $1 - bare filename as stored in the first CSV column
# Returns:   0 if a row for the filename exists, non-zero otherwise
#            (including when the CSV file does not exist yet)
#######################################
is_processed() {
  local filename="$1"
  local dq='"'
  local prefix

  [[ -f "${CSV_FILE:-}" ]] || return 1

  # Build the exact text a row for this file starts with: "name",
  prefix="${dq}${filename//$dq/$dq$dq}${dq},"

  # Pass the prefix through the environment (not -v) so awk does not
  # interpret backslash escapes in it; index() is a plain substring search.
  CSV_PREFIX="$prefix" awk '
    index($0, ENVIRON["CSV_PREFIX"]) == 1 { found = 1; exit }
    END { exit !found }
  ' < "$CSV_FILE"
}

#######################################
# Quote a value for use as a single CSV field.
# Arguments: $1 - raw field value
# Outputs:   the value wrapped in double quotes, with every embedded
#            double quote doubled, followed by a newline
#######################################
escape_csv() {
  local field="$1"
  local dq='"'

  # printf (rather than echo) so values such as "-n" or "-e" are emitted
  # verbatim instead of being parsed as options.
  printf '"%s"\n' "${field//$dq/$dq$dq}"
}

#######################################
# Ask the OpenAI Responses API for metadata describing one meme file and
# append the result as a row to the CSV file.
# Globals:   OPENAI_API_KEY (read), CSV_FILE (appended to)
# Arguments: $1 - bare filename (no directory part) of the file to describe
# Outputs:   progress messages on stdout, diagnostics on stderr
# Returns:   0 when a row was written, 1 on any request or parsing failure
# Uses:      escape_csv (defined elsewhere in this file), jq, curl
#######################################
process_file() {
  local filename="$1"
  # Filename paired with the canned assistant answer below as a worked example.
  local -r example_filename="angry cat.webm"
  local system_prompt assistant_example json_payload response response_text
  local media_type sub_type name description keywords media_1_mime_type
  local csv_line

  echo "Processing: $filename"

  # System prompt text (sent verbatim to the model)
  system_prompt="You are an AI assistant that receives a meme filename. Based on the filename, your task is to generate detailed metadata describing the meme in a structured format.

The metadata should include these fields:
- **type**: either \`video\` or \`image\` depending on the file extension.
- **sub_type**: a classification such as \`background\` or \`overlay\` (choose \`overlay\` if it seems like an edit or reaction video/image).
- **name**: a concise, title-cased name derived from the filename (remove file extension and normalize, without the Meme word if exist).
- **description**: a short paragraph describing the meme content and context inferred from the name, and reaction
- **keywords**: a comma-separated list of relevant keywords extracted from the filename or meme context.
- **media_1_mime_type**: the MIME type derived from the file extension (e.g., \`video/webm\`, \`image/png\`).

Return the output as a JSON object containing these fields. Use null for any unknown or missing values.

---

#### Example input:
\`7th element oiiaa cat.webm\`

#### Example output:
\`\`\`json
{
\"type\": \"video\",
\"sub_type\": \"overlay\",
\"name\": \"7th Element Oiiaa Cat\",
\"description\": \"a cat edited to mimic or react to Vitas' bizarre vocals from the viral \\\"7th Element\\\" performance.\",
\"keywords\": \"cat, oiiaa, 7th element, vitas, webm, funny\",
\"media_1_mime_type\": \"video/webm\"
}
\`\`\`"

  # Canned assistant answer used as a few-shot example
  assistant_example="\`\`\`json
{
\"type\": \"video\",
\"sub_type\": \"overlay\",
\"name\": \"Angry Cat\",
\"description\": \"A video meme featuring an angry-looking cat, typically used to express frustration, annoyance, or irritation in a humorous way.\",
\"keywords\": \"angry, cat, reaction, meme, webm, annoyed, frustration\",
\"media_1_mime_type\": \"video/webm\"
}
\`\`\`"

  # Let jq build the payload so every string is JSON-escaped correctly.
  # The example exchange (user + assistant) comes first and the real filename
  # is the final user turn; otherwise the canned "Angry Cat" answer would be
  # presented to the model as the reply to the file actually being processed.
  if ! json_payload=$(jq -n \
    --arg system_prompt "$system_prompt" \
    --arg example_filename "$example_filename" \
    --arg assistant_example "$assistant_example" \
    --arg filename "$filename" \
    '{
      "model": "gpt-4.1-nano",
      "input": [
        {
          "role": "system",
          "content": [{"type": "input_text", "text": $system_prompt}]
        },
        {
          "role": "user",
          "content": [{"type": "input_text", "text": $example_filename}]
        },
        {
          "role": "assistant",
          "content": [{"type": "output_text", "text": $assistant_example}]
        },
        {
          "role": "user",
          "content": [{"type": "input_text", "text": $filename}]
        }
      ],
      "text": {"format": {"type": "json_object"}},
      "reasoning": {},
      "tools": [],
      "temperature": 1,
      "max_output_tokens": 2048,
      "top_p": 1,
      "store": true
    }'); then
    echo "Error: could not build request payload for $filename" >&2
    return 1
  fi

  # Make the API call; -S keeps curl's own error message visible despite -s.
  if ! response=$(curl -sS "https://api.openai.com/v1/responses" \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer $OPENAI_API_KEY" \
    -d "$json_payload"); then
    echo "Error: request failed for $filename" >&2
    return 1
  fi

  # An API-level failure is reported as a non-null .error member.
  if jq -e '.error' <<<"$response" >/dev/null 2>&1; then
    echo "API Error for $filename:" >&2
    jq '.error' <<<"$response" >&2
    return 1
  fi

  # Take the first output_text part of the first message item instead of
  # assuming it is always at output[0].content[0].
  response_text=$(jq -r '
    first(
      .output[]? | select(.type == "message")
      | .content[]? | select(.type == "output_text")
      | .text
    ) // empty' <<<"$response" 2>/dev/null) || response_text=""

  if [[ -z "$response_text" ]]; then
    echo "Error: Empty response for $filename" >&2
    return 1
  fi

  # JSON mode should yield a bare JSON object; verify before reading fields.
  if ! jq -e 'type == "object"' <<<"$response_text" >/dev/null 2>&1; then
    echo "Error: Invalid JSON response for $filename" >&2
    printf 'Content: %s\n' "$response_text" >&2
    return 1
  fi

  # Parse individual fields; missing or null values become "unknown".
  media_type=$(jq -r '.type // "unknown"' <<<"$response_text") || return 1
  sub_type=$(jq -r '.sub_type // "unknown"' <<<"$response_text") || return 1
  name=$(jq -r '.name // "unknown"' <<<"$response_text") || return 1
  description=$(jq -r '.description // "unknown"' <<<"$response_text") || return 1
  keywords=$(jq -r '.keywords // "unknown"' <<<"$response_text") || return 1
  media_1_mime_type=$(jq -r '.media_1_mime_type // "unknown"' <<<"$response_text") || return 1

  # Write to CSV (column order matches the header row)
  csv_line="$(escape_csv "$filename"),$(escape_csv "$media_type"),$(escape_csv "$sub_type"),$(escape_csv "$name"),$(escape_csv "$description"),$(escape_csv "$keywords"),$(escape_csv "$media_1_mime_type")"
  printf '%s\n' "$csv_line" >> "$CSV_FILE"

  echo "✓ Successfully processed: $filename"
}

# Export functions and settings so they are available in the bash process
# that dotenv starts below.
export -f is_processed
export -f escape_csv
export -f process_file
export CSV_FILE
export DELAY_SECONDS
export TARGET_DIR

# Change to script directory so dotenv can find the .env file
cd "$SCRIPT_DIR"

# Use dotenv to run the main processing logic with the .env values loaded.
# The inline script is single-quoted: nothing in it is expanded by this shell,
# and it must not contain a single quote character.
dotenv -- bash -c '
  # Check if OPENAI_API_KEY is loaded
  if [ -z "$OPENAI_API_KEY" ]; then
    echo "Error: OPENAI_API_KEY not found in environment variables" >&2
    exit 1
  fi

  echo "✓ OpenAI API key loaded successfully"
  echo "CSV file will be: $CSV_FILE"
  echo "Delay between requests: ${DELAY_SECONDS}s"
  echo ""

  # Create CSV with headers if it does not exist yet
  if [ ! -f "$CSV_FILE" ]; then
    echo "filename,type,sub_type,name,description,keywords,media_1_mime_type" > "$CSV_FILE"
    echo "Created $CSV_FILE with headers"
  fi

  echo "Starting processing of .webm files..."

  # Collect the regular .webm files with a glob instead of parsing ls output;
  # nullglob makes an unmatched pattern expand to nothing.
  shopt -s nullglob
  webm_files=()
  for candidate in "$TARGET_DIR"/*.webm; do
    if [ -f "$candidate" ]; then
      webm_files+=("$candidate")
    fi
  done
  shopt -u nullglob

  total_files=${#webm_files[@]}
  processed_count=0
  skipped_count=0
  error_count=0

  if [ "$total_files" -eq 0 ]; then
    echo "No .webm files found in $TARGET_DIR"
    exit 0
  fi

  echo "Found $total_files .webm files"

  # Process each .webm file
  for webm_file in "${webm_files[@]}"; do
    # Only the filename (not the full path) is sent to the API and stored
    filename=${webm_file##*/}

    # Files that already have a CSV row are skipped without an API call
    if is_processed "$filename"; then
      echo "⏭️ Skipping (already processed): $filename"
      skipped_count=$((skipped_count + 1))
      continue
    fi

    # Counters use plain assignment: ((x++)) returns status 1 when x is 0
    if process_file "$filename"; then
      processed_count=$((processed_count + 1))
    else
      echo "❌ Failed to process: $filename"
      error_count=$((error_count + 1))
    fi

    # Progress update
    total_done=$((processed_count + skipped_count + error_count))
    echo "Progress: $total_done/$total_files (Processed: $processed_count, Skipped: $skipped_count, Errors: $error_count)"
    echo ""

    # Rate limiting delay (skip on last file)
    if [ "$total_done" -lt "$total_files" ]; then
      echo "Waiting ${DELAY_SECONDS}s before next request..."
      sleep "$DELAY_SECONDS"
    fi
  done

  echo "===================="
  echo "Processing complete!"
  echo "Total files: $total_files"
  echo "Newly processed: $processed_count"
  echo "Already processed (skipped): $skipped_count"
  echo "Errors: $error_count"
  echo "CSV file: $CSV_FILE"
  echo "===================="
'