Update
This commit is contained in:
280
bash/webm_metadata.sh
Executable file
280
bash/webm_metadata.sh
Executable file
@@ -0,0 +1,280 @@
|
||||
#!/bin/bash
#
# Script to process .webm files and generate metadata using OpenAI API
# Requires: dotenv-cli, jq
# Usage: ./webm_metadata.sh [directory_path]

# Strict mode: exit on error, on use of unset variables, and on a
# failure anywhere in a pipeline (plain `set -e` misses pipeline
# failures such as `cmd | wc -l`).
set -euo pipefail

# Get the directory where the script is located
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Get target directory (default to current directory)
TARGET_DIR="${1:-.}"
# Convert to absolute path; fail with a clear message instead of a
# bare `cd` error when the argument is not an existing directory.
TARGET_DIR="$(cd "$TARGET_DIR" && pwd)" || { echo "Error: not a readable directory: ${1:-.}" >&2; exit 1; }
|
||||
|
||||
# Check dependencies
command -v dotenv >/dev/null 2>&1 || { echo "Error: dotenv-cli is required. Install with: npm install -g dotenv-cli" >&2; exit 1; }
command -v jq >/dev/null 2>&1 || { echo "Error: jq is required. Install with: apt-get install jq or brew install jq" >&2; exit 1; }

# Check if .env exists in script directory
ENV_FILE="$SCRIPT_DIR/.env"
if [ ! -f "$ENV_FILE" ]; then
    # Diagnostics go to stderr, consistent with the dependency checks
    # above (these were previously written to stdout).
    echo "Error: .env file not found at $ENV_FILE" >&2
    echo "Please create one with OPENAI_API_KEY=your_key" >&2
    exit 1
fi

echo "Script directory: $SCRIPT_DIR"
echo "Target directory: $TARGET_DIR"
echo "Looking for .env at: $ENV_FILE"
echo ""

# Configuration
CSV_FILE="$TARGET_DIR/webm_metadata.csv"   # output CSV, one row per processed file
DELAY_SECONDS=1                            # pause between API requests (rate limiting)
|
||||
|
||||
# Function to check if filename already processed
# $1 - bare filename to look up.
# Returns 0 when the filename appears as the (quoted) first CSV field.
is_processed() {
    local filename="$1"
    local escaped
    # Escape BRE metacharacters so filenames such as "a.b[1].webm" are
    # matched literally; an unescaped "." would match any character and
    # could cause false "already processed" skips.
    escaped=$(printf '%s\n' "$filename" | sed 's/[][\.*^$]/\\&/g')
    grep -q "^\"$escaped\"," "$CSV_FILE" 2>/dev/null
}
|
||||
|
||||
# Function to escape CSV field
# $1 - raw field value.
# Doubles embedded double quotes and wraps the field in quotes
# (RFC 4180). Uses printf + parameter expansion instead of echo|sed:
# `echo "$field"` silently eats fields that look like echo flags
# (e.g. "-n"), and the sed replacement `\"\"` relies on GNU-specific
# handling of a backslash-escaped quote.
escape_csv() {
    local field="$1"
    printf '"%s"\n' "${field//\"/\"\"}"
}
|
||||
|
||||
# Function to make API call and extract metadata
#
# $1 - bare filename of a .webm file (no directory component).
# Sends the filename to the OpenAI /v1/responses endpoint, extracts
# the model's JSON metadata, and appends one CSV row to $CSV_FILE.
# Reads $OPENAI_API_KEY and $CSV_FILE from the environment.
# Returns non-zero (writing no row) on any API or parse error.
process_file() {
    local filename="$1"
    echo "Processing: $filename"

    # Create the system prompt text. The embedded markdown example
    # teaches the model the exact JSON schema expected back.
    local system_prompt="You are an AI assistant that receives a meme filename. Based on the filename, your task is to generate detailed metadata describing the meme in a structured format.

The metadata should include these fields:
- **type**: either \`video\` or \`image\` depending on the file extension.
- **sub_type**: a classification such as \`background\` or \`overlay\` (choose \`overlay\` if it seems like an edit or reaction video/image).
- **name**: a concise, title-cased name derived from the filename (remove file extension and normalize, without the Meme word if exist).
- **description**: a short paragraph describing the meme content and context inferred from the name, and reaction
- **keywords**: a comma-separated list of relevant keywords extracted from the filename or meme context.
- **media_1_mime_type**: the MIME type derived from the file extension (e.g., \`video/webm\`, \`image/png\`).

Return the output as a JSON object containing these fields. Use null for any unknown or missing values.

---

#### Example input:
\`7th element oiiaa cat.webm\`

#### Example output:
\`\`\`json
{
\"type\": \"video\",
\"sub_type\": \"overlay\",
\"name\": \"7th Element Oiiaa Cat\",
\"description\": \"a cat edited to mimic or react to Vitas' bizarre vocals from the viral \\\"7th Element\\\" performance.\",
\"keywords\": \"cat, oiiaa, 7th element, vitas, webm, funny\",
\"media_1_mime_type\": \"video/webm\"
}
\`\`\`"

    # One-shot assistant turn reinforcing the expected output format.
    local assistant_example="\`\`\`json
{
\"type\": \"video\",
\"sub_type\": \"overlay\",
\"name\": \"Angry Cat\",
\"description\": \"A video meme featuring an angry-looking cat, typically used to express frustration, annoyance, or irritation in a humorous way.\",
\"keywords\": \"angry, cat, reaction, meme, webm, annoyed, frustration\",
\"media_1_mime_type\": \"video/webm\"
}
\`\`\`"

    # Use jq to properly construct the JSON payload; --arg performs
    # all JSON string escaping of the interpolated shell values.
    # NOTE(review): `local var=$(cmd)` masks jq's exit status; a
    # failure here would only surface later as an empty payload.
    local json_payload=$(jq -n \
        --arg system_prompt "$system_prompt" \
        --arg filename "$filename" \
        --arg assistant_example "$assistant_example" \
        '{
            "model": "gpt-4.1-nano",
            "input": [
                {
                    "role": "system",
                    "content": [
                        {
                            "type": "input_text",
                            "text": $system_prompt
                        }
                    ]
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "input_text",
                            "text": $filename
                        }
                    ]
                },
                {
                    "role": "assistant",
                    "content": [
                        {
                            "type": "output_text",
                            "text": $assistant_example
                        }
                    ]
                }
            ],
            "text": {
                "format": {
                    "type": "json_object"
                }
            },
            "reasoning": {},
            "tools": [],
            "temperature": 1,
            "max_output_tokens": 2048,
            "top_p": 1,
            "store": true
        }')

    # Make the API call (silent; curl transport errors yield an empty
    # $response, which is caught by the empty-response check below)
    local response=$(curl -s "https://api.openai.com/v1/responses" \
        -H "Content-Type: application/json" \
        -H "Authorization: Bearer $OPENAI_API_KEY" \
        -d "$json_payload")

    # Check for API errors (the API reports them in an .error object)
    if echo "$response" | jq -e '.error' >/dev/null 2>&1; then
        echo "API Error for $filename:"
        echo "$response" | jq '.error'
        return 1
    fi

    # Extract the response text from the correct path
    # (.output[0].content[0].text per the /v1/responses shape;
    # `// empty` turns a missing path into an empty string)
    local response_text=$(echo "$response" | jq -r '.output[0].content[0].text // empty')

    if [ -z "$response_text" ]; then
        echo "Error: Empty response for $filename"
        return 1
    fi

    # With JSON mode enabled, response should always be direct JSON
    local json_content="$response_text"

    # Validate that it's valid JSON before parsing
    if ! echo "$json_content" | jq . >/dev/null 2>&1; then
        echo "Error: Invalid JSON response for $filename"
        echo "Content: $json_content"
        return 1
    fi

    # Parse individual fields; any field the model omitted or set to
    # null is recorded as the literal string "unknown".
    local type=$(echo "$json_content" | jq -r '.type // "unknown"')
    local sub_type=$(echo "$json_content" | jq -r '.sub_type // "unknown"')
    local name=$(echo "$json_content" | jq -r '.name // "unknown"')
    local description=$(echo "$json_content" | jq -r '.description // "unknown"')
    local keywords=$(echo "$json_content" | jq -r '.keywords // "unknown"')
    local media_1_mime_type=$(echo "$json_content" | jq -r '.media_1_mime_type // "unknown"')

    # Write to CSV (field order must match the header row created by
    # the main loop: filename,type,sub_type,name,description,keywords,
    # media_1_mime_type)
    local csv_line="$(escape_csv "$filename"),$(escape_csv "$type"),$(escape_csv "$sub_type"),$(escape_csv "$name"),$(escape_csv "$description"),$(escape_csv "$keywords"),$(escape_csv "$media_1_mime_type")"
    echo "$csv_line" >> "$CSV_FILE"

    echo "✓ Successfully processed: $filename"
}
|
||||
|
||||
# Make the helper functions and configuration visible to the child
# bash process that dotenv spawns below.
export -f is_processed escape_csv process_file
export CSV_FILE DELAY_SECONDS TARGET_DIR

# dotenv resolves the .env file relative to the working directory, so
# run from the script's own directory.
cd "$SCRIPT_DIR"
|
||||
|
||||
# Use dotenv to run the main processing logic.
# The work runs in a child bash so the .env variables stay scoped to
# it; the exported functions/vars above are available inside.
dotenv -- bash -c '
# Check if OPENAI_API_KEY is loaded
if [ -z "$OPENAI_API_KEY" ]; then
    echo "Error: OPENAI_API_KEY not found in environment variables"
    exit 1
fi

echo "✓ OpenAI API key loaded successfully"
echo "CSV file will be: $CSV_FILE"
echo "Delay between requests: ${DELAY_SECONDS}s"
echo ""

# Create CSV with headers if it doesn'\''t exist
if [ ! -f "$CSV_FILE" ]; then
    echo "filename,type,sub_type,name,description,keywords,media_1_mime_type" > "$CSV_FILE"
    echo "Created $CSV_FILE with headers"
fi

echo "Starting processing of .webm files..."

# Collect the .webm files with a glob array instead of counting
# parsed ls output: filenames containing newlines would corrupt the
# ls-based count, and the glob needs no subshell at all.
shopt -s nullglob
webm_files=("$TARGET_DIR"/*.webm)
shopt -u nullglob

# Count total files and processed files
total_files=${#webm_files[@]}
processed_count=0
skipped_count=0
error_count=0

if [ "$total_files" -eq 0 ]; then
    echo "No .webm files found in $TARGET_DIR"
    exit 0
fi

echo "Found $total_files .webm files"

# Process each .webm file
for webm_file in "${webm_files[@]}"; do
    if [ ! -f "$webm_file" ]; then
        continue
    fi

    # Get just the filename (not full path) for processing
    filename=$(basename "$webm_file")

    # Check if already processed
    if is_processed "$filename"; then
        echo "⏭️ Skipping (already processed): $filename"
        ((skipped_count++))
        continue
    fi

    # Process the file
    if process_file "$filename"; then
        ((processed_count++))
    else
        echo "❌ Failed to process: $filename"
        ((error_count++))
    fi

    # Progress update
    total_done=$((processed_count + skipped_count + error_count))
    echo "Progress: $total_done/$total_files (Processed: $processed_count, Skipped: $skipped_count, Errors: $error_count)"
    echo ""

    # Rate limiting delay (skip on last file)
    if [ "$total_done" -lt "$total_files" ]; then
        echo "Waiting ${DELAY_SECONDS}s before next request..."
        sleep "$DELAY_SECONDS"
    fi
done

echo "===================="
echo "Processing complete!"
echo "Total files: $total_files"
echo "Newly processed: $processed_count"
echo "Already processed (skipped): $skipped_count"
echo "Errors: $error_count"
echo "CSV file: $CSV_FILE"
echo "===================="
'
|
||||
Reference in New Issue
Block a user