This commit is contained in:
ct
2025-08-11 02:35:35 +08:00
parent 4a80723243
commit f3c91b9a64
24 changed files with 2035 additions and 214 deletions

View File

@@ -23,7 +23,9 @@ public function crawl(Request $request): JsonResponse
'block_ads' => 'boolean',
'block_cookie_banners' => 'boolean',
'block_trackers' => 'boolean',
'wait_until_network_idle' => 'boolean'
'webhook_url' => 'nullable|url|max:2048',
'webhook_events_filter' => 'nullable|array',
'webhook_events_filter.*' => 'in:queued,processing,completed,failed'
]);
$uuid = Str::uuid()->toString();
@@ -33,13 +35,15 @@ public function crawl(Request $request): JsonResponse
'type' => 'crawl',
'url' => $validated['url'],
'status' => 'queued',
'webhook_url' => $validated['webhook_url'] ?? null,
'webhook_events_filter' => isset($validated['webhook_events_filter']) ? $validated['webhook_events_filter'] : ['queued', 'processing', 'completed', 'failed'],
'parameters' => array_filter([
'timeout' => $validated['timeout'] ?? 30,
'delay' => $validated['delay'] ?? 0,
'block_ads' => $validated['block_ads'] ?? true,
'block_cookie_banners' => $validated['block_cookie_banners'] ?? true,
'block_trackers' => $validated['block_trackers'] ?? true,
'wait_until_network_idle' => $validated['wait_until_network_idle'] ?? false
'block_trackers' => $validated['block_trackers'] ?? true
// wait_until_network_idle is always enabled in BrowsershotService
])
]);

View File

@@ -24,7 +24,10 @@ public function shot(Request $request): JsonResponse
'delay' => 'integer|min:0|max:30000',
'block_ads' => 'boolean',
'block_cookie_banners' => 'boolean',
'block_trackers' => 'boolean'
'block_trackers' => 'boolean',
'webhook_url' => 'nullable|url|max:2048',
'webhook_events_filter' => 'nullable|array',
'webhook_events_filter.*' => 'in:queued,processing,completed,failed'
]);
$uuid = Str::uuid()->toString();
@@ -34,6 +37,8 @@ public function shot(Request $request): JsonResponse
'type' => 'shot',
'url' => $validated['url'],
'status' => 'queued',
'webhook_url' => $validated['webhook_url'] ?? null,
'webhook_events_filter' => isset($validated['webhook_events_filter']) ? $validated['webhook_events_filter'] : ['queued', 'processing', 'completed', 'failed'],
'parameters' => array_filter([
'viewport_width' => $validated['viewport_width'] ?? 1920,
'viewport_height' => $validated['viewport_height'] ?? 1080,

View File

@@ -0,0 +1,72 @@
<?php
namespace App\Http\Controllers\Api;
use App\Http\Controllers\Controller;
use App\Models\CrawlShotJob;
use App\Services\WebhookService;
use Illuminate\Http\JsonResponse;
/**
 * Endpoints for inspecting and managing webhook delivery state of crawl/shot jobs.
 */
class WebhookErrorController extends Controller
{
    /**
     * List jobs that have a webhook URL and a webhook attempt counter above zero,
     * most recently updated first, paginated 20 per page.
     */
    public function index(): JsonResponse
    {
        $page = CrawlShotJob::where('webhook_attempts', '>', 0)
            ->whereNotNull('webhook_url')
            ->orderBy('updated_at', 'desc')
            ->paginate(20);

        return response()->json([
            'jobs' => $page->items(),
            'pagination' => [
                'current_page' => $page->currentPage(),
                'total_pages' => $page->lastPage(),
                'total_items' => $page->total(),
                'per_page' => $page->perPage(),
            ],
        ]);
    }

    /**
     * Re-send the webhook for one job right away (synchronously, within this request).
     *
     * Responds 404 when the uuid is unknown and 400 when the job has no webhook URL.
     */
    public function retry(string $uuid): JsonResponse
    {
        $target = $this->findByUuid($uuid);

        if ($target === null) {
            return response()->json(['error' => 'Job not found'], 404);
        }

        if (!$target->webhook_url) {
            return response()->json(['error' => 'Job has no webhook URL'], 400);
        }

        // Delivery outcome is recorded on the job by the service; the response
        // only acknowledges that an attempt was made.
        WebhookService::send($target);

        return response()->json([
            'uuid' => $target->uuid,
            'message' => 'Webhook retry attempted',
        ]);
    }

    /**
     * Reset the webhook attempt counter, last error and next-retry timestamp of one job.
     *
     * Responds 404 when the uuid is unknown.
     */
    public function clear(string $uuid): JsonResponse
    {
        $target = $this->findByUuid($uuid);

        if ($target === null) {
            return response()->json(['error' => 'Job not found'], 404);
        }

        $target->update([
            'webhook_attempts' => 0,
            'webhook_last_error' => null,
            'webhook_next_retry_at' => null,
        ]);

        return response()->json([
            'uuid' => $target->uuid,
            'message' => 'Webhook error cleared',
        ]);
    }

    /**
     * Look up a job by its uuid column; null when no row matches.
     */
    private function findByUuid(string $uuid): ?CrawlShotJob
    {
        return CrawlShotJob::where('uuid', $uuid)->first();
    }
}

View File

@@ -0,0 +1,40 @@
<?php
namespace App\Jobs;
use App\Models\CrawlShotJob;
use App\Services\WebhookService;
use Illuminate\Bus\Queueable;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Bus\Dispatchable;
use Illuminate\Queue\InteractsWithQueue;
use Illuminate\Queue\SerializesModels;
/**
 * Queued job that re-attempts webhook delivery for a single crawl/shot job.
 */
class RetryWebhookJob implements ShouldQueue
{
    use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;

    /**
     * @param string $jobUuid uuid of the CrawlShotJob whose webhook should be retried
     */
    public function __construct(protected string $jobUuid)
    {
    }

    /**
     * Send the webhook again, but only if the job still exists, still has a
     * webhook URL, and its scheduled retry time is set and no longer in the future.
     */
    public function handle(): void
    {
        $crawlShotJob = CrawlShotJob::where('uuid', $this->jobUuid)->first();

        $hasTarget = $crawlShotJob && $crawlShotJob->webhook_url;
        if (!$hasTarget) {
            return;
        }

        // A missing retry timestamp means the retry was cancelled (e.g. cleared
        // manually); a future one means this run is not the one that is due.
        $retryAt = $crawlShotJob->webhook_next_retry_at;
        if (!$retryAt) {
            return;
        }
        if ($retryAt->isFuture()) {
            return;
        }

        WebhookService::send($crawlShotJob);
    }
}

View File

@@ -4,6 +4,7 @@
use Illuminate\Database\Eloquent\Factories\HasFactory;
use Illuminate\Database\Eloquent\Model;
use Illuminate\Support\Facades\Storage;
class CrawlShotJob extends Model
{
@@ -18,17 +19,72 @@ class CrawlShotJob extends Model
'file_path',
'error_message',
'started_at',
'completed_at'
'completed_at',
'webhook_url',
'webhook_events_filter',
'webhook_attempts',
'webhook_last_error',
'webhook_next_retry_at'
];
protected $casts = [
'parameters' => 'array',
'webhook_events_filter' => 'array',
'started_at' => 'datetime',
'completed_at' => 'datetime'
'completed_at' => 'datetime',
'webhook_next_retry_at' => 'datetime'
];
/**
 * Name of the column used as this model's route key.
 *
 * Overrides the Eloquent default so that URLs and route binding identify
 * a job by its "uuid" column.
 *
 * @return string
 */
public function getRouteKeyName()
{
return 'uuid';
}
/**
 * Build the status payload describing this job.
 *
 * Always contains uuid, status, url and created_at. started_at / completed_at
 * are added when set. A completed job with a stored file additionally gets a
 * "result" entry (HTML for crawls, base64 WebP plus image metadata for shots),
 * and a failed job with an error message gets an "error" entry.
 *
 * @return array<string, mixed>
 */
public function buildStatusResponse(): array
{
    $payload = [
        'uuid' => $this->uuid,
        'status' => $this->status,
        'url' => $this->url,
        'created_at' => $this->created_at->toISOString(),
    ];

    // Optional lifecycle timestamps, emitted in a fixed order.
    foreach (['started_at', 'completed_at'] as $field) {
        if ($this->{$field}) {
            $payload[$field] = $this->{$field}->toISOString();
        }
    }

    $hasStoredOutput = $this->status === 'completed' && $this->file_path;

    if ($hasStoredOutput && $this->type === 'crawl') {
        $html = [
            'url' => url("/api/crawl/{$this->uuid}.html"),
            'raw' => Storage::get($this->file_path),
        ];
        $payload['result'] = ['html' => $html];
    } elseif ($hasStoredOutput && $this->type === 'shot') {
        $bytes = Storage::get($this->file_path);
        $image = [
            'url' => url("/api/shot/{$this->uuid}.webp"),
            'raw' => base64_encode($bytes),
        ];
        $payload['result'] = [
            'image' => $image,
            'mime_type' => 'image/webp',
            'format' => 'webp',
            // Reported dimensions are the requested viewport, not measured from the file.
            'width' => $this->parameters['viewport_width'] ?? 1920,
            'height' => $this->parameters['viewport_height'] ?? 1080,
            'size' => strlen($bytes),
        ];
    }

    $failedWithMessage = $this->status === 'failed' && $this->error_message;
    if ($failedWithMessage) {
        $payload['error'] = $this->error_message;
    }

    return $payload;
}
}

View File

@@ -0,0 +1,27 @@
<?php
namespace App\Observers;
use App\Models\CrawlShotJob;
use App\Services\WebhookService;
/**
 * Fires a webhook when a crawl/shot job's status changes.
 */
class CrawlShotJobObserver
{
    /**
     * Object ids of models whose webhook is currently being sent by this observer.
     *
     * @var array<int, true>
     */
    private static array $sending = [];

    /**
     * Handle the "updated" model event.
     *
     * A webhook is sent only when the status attribute changed, a webhook URL is
     * set, and the new status is listed in the job's events filter (all four
     * statuses when no filter is stored; nothing when the filter is an empty array).
     */
    public function updated(CrawlShotJob $crawlShotJob): void
    {
        // Re-entrancy guard. WebhookService::send() writes bookkeeping fields back
        // to this same model. Eloquent dispatches "updated" before it syncs the
        // original attributes, so during such a nested write the status still looks
        // dirty and this hook would run again, and again, re-posting the webhook
        // each time. While a send is in flight for this instance, ignore the event.
        $guardKey = spl_object_id($crawlShotJob);
        if (isset(self::$sending[$guardKey])) {
            return;
        }

        if (!$crawlShotJob->isDirty('status') || !$crawlShotJob->webhook_url) {
            return;
        }

        $eventsFilter = $crawlShotJob->webhook_events_filter ?? ['queued', 'processing', 'completed', 'failed'];

        // An empty filter means "no events"; otherwise the status must be listed.
        if (empty($eventsFilter) || !in_array($crawlShotJob->status, $eventsFilter, true)) {
            return;
        }

        self::$sending[$guardKey] = true;
        try {
            WebhookService::send($crawlShotJob);
        } finally {
            // Always release the guard so a later status change can send again,
            // even if the send throws.
            unset(self::$sending[$guardKey]);
        }
    }
}

View File

@@ -2,6 +2,8 @@
namespace App\Providers;
use App\Models\CrawlShotJob;
use App\Observers\CrawlShotJobObserver;
use Illuminate\Support\ServiceProvider;
class AppServiceProvider extends ServiceProvider
@@ -19,6 +21,6 @@ public function register(): void
*/
public function boot(): void
{
// Attach CrawlShotJobObserver so it receives CrawlShotJob model events.
CrawlShotJob::observe(CrawlShotJobObserver::class);
}
}

View File

@@ -44,7 +44,7 @@ public function takeScreenshot(string $url, array $options = []): array
private function configureBrowsershot(string $url, array $options = []): Browsershot
{
$browsershot = Browsershot::url($url)
->waitUntilNetworkIdle()
->waitUntilNetworkIdle() // Always enabled for production to ensure proper rendering
->preventUnsuccessfulResponse();
@@ -52,18 +52,17 @@ private function configureBrowsershot(string $url, array $options = []): Browser
$browsershot->noSandbox();
}
// Basic configuration
if (isset($options['timeout'])) {
$browsershot->timeout($options['timeout']);
}
// Basic configuration with maximum timeout safeguard
$timeout = $options['timeout'] ?? 30;
$maxTimeout = 300; // 5 minutes maximum to prevent indefinite waiting
$browsershot->timeout(min($timeout, $maxTimeout));
if (isset($options['delay'])) {
$browsershot->setDelay($options['delay']);
}
if (isset($options['wait_until_network_idle']) && $options['wait_until_network_idle']) {
$browsershot->waitUntilNetworkIdle();
}
// waitUntilNetworkIdle() is always enabled (configured above on line 47)
// Removed conditional logic as network idle waiting is required for production
// Apply ad/tracker blocking
if (($options['block_ads'] ?? true) || ($options['block_trackers'] ?? true)) {

View File

@@ -26,7 +26,8 @@ public function getBlockedDomains(string $url): array
}
}
return array_slice(array_unique($domains), 0, 100); // Limit to 100 domains
// Limit to 50 most common ad domains to reduce timeout risk
return array_slice(array_unique($domains), 0, 50);
}
public function getBlockedUrls(string $url): array
@@ -43,7 +44,8 @@ public function getBlockedUrls(string $url): array
}
}
return array_slice(array_unique($urls), 0, 50); // Limit to 50 URL patterns
// Limit to 25 URL patterns to reduce blocking overhead
return array_slice(array_unique($urls), 0, 25);
}
private function getFilters(): array

View File

@@ -0,0 +1,62 @@
<?php
namespace App\Services;
use App\Models\CrawlShotJob;
use App\Jobs\RetryWebhookJob;
use Illuminate\Support\Facades\Http;
use Illuminate\Support\Facades\Log;
/**
 * Delivers job status webhooks and tracks delivery failures and retries on the job.
 */
class WebhookService
{
    /** Back-off schedule: one entry per automatic retry, indexed by prior attempt count. */
    private const RETRY_DELAYS = [1, 2, 4, 8, 16, 32]; // minutes

    /** Defensive cap (characters) on the error text stored on the job. */
    private const MAX_ERROR_LENGTH = 1000;

    /**
     * POST the job's status payload to its webhook URL.
     *
     * On a successful (2xx) response the job's webhook error fields are reset.
     * On a non-successful response or any \Exception the failure is recorded and,
     * while the back-off schedule is not exhausted, a delayed retry is queued.
     * Exceptions are not propagated to the caller.
     */
    public static function send(CrawlShotJob $job): void
    {
        try {
            $payload = $job->buildStatusResponse();
            $response = Http::timeout(5)->post($job->webhook_url, $payload);

            if (!$response->successful()) {
                throw new \Exception("HTTP {$response->status()}: {$response->body()}");
            }

            // Written without model events: this method is invoked from the model's
            // "updated" observer, and an event-raising save at that point would
            // trigger the observer again while status is still dirty.
            $job->updateQuietly([
                'webhook_attempts' => 0,
                'webhook_last_error' => null,
                'webhook_next_retry_at' => null,
            ]);
        } catch (\Exception $e) {
            self::handleWebhookFailure($job, $e->getMessage());
        }
    }

    /**
     * Record a failed delivery and queue the next retry if the schedule allows one.
     *
     * @param CrawlShotJob $job   job whose webhook delivery failed
     * @param string       $error failure description; stored truncated to MAX_ERROR_LENGTH
     */
    private static function handleWebhookFailure(CrawlShotJob $job, string $error): void
    {
        $currentAttempts = (int) ($job->webhook_attempts ?? 0);

        // The message may embed a full response body; keep the stored text bounded.
        $storedError = mb_substr($error, 0, self::MAX_ERROR_LENGTH);

        // No schedule entry for this attempt count means retries are exhausted.
        $delayMinutes = self::RETRY_DELAYS[$currentAttempts] ?? null;
        $nextRetryAt = $delayMinutes !== null ? now()->addMinutes($delayMinutes) : null;

        // Quiet write for the same reason as in send(): avoid re-entering the observer.
        $job->updateQuietly([
            'webhook_attempts' => $currentAttempts + 1,
            'webhook_last_error' => $storedError,
            'webhook_next_retry_at' => $nextRetryAt,
        ]);

        if ($nextRetryAt !== null) {
            RetryWebhookJob::dispatch($job->uuid)->delay($nextRetryAt);
        }
    }
}