This commit is contained in:
ct
2025-08-10 21:10:33 +08:00
parent 480bd9055d
commit 583a804073
43 changed files with 7623 additions and 270 deletions

View File

@@ -0,0 +1,31 @@
<?php
namespace App\Console\Commands;
use App\Models\User;
use Illuminate\Console\Command;
/**
 * Artisan command that issues a personal access token for the Crawlshot API.
 *
 * The token is attached to the user identified by the `email` argument; that
 * user is created on the fly when no matching account exists yet.
 */
class CreateApiToken extends Command
{
    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'crawlshot:create-token {name=API User} {email=api@crawlshot.test}';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = 'Create an API token for Crawlshot';

    /**
     * Find or create the token owner and print a freshly generated token.
     *
     * @return int Process exit code (0 on success).
     */
    public function handle()
    {
        $name = $this->argument('name');
        $email = $this->argument('email');

        // A newly created account receives an unguessable random password rather
        // than a well-known literal, so the account cannot be logged into with a
        // guessable credential. The attributes in the second array are only used
        // when the user does not exist yet.
        $user = User::firstOrCreate(['email' => $email], [
            'name' => $name,
            'password' => bcrypt(bin2hex(random_bytes(32))),
        ]);

        $token = $user->createToken('crawlshot-api')->plainTextToken;

        $this->info("API Token created successfully!");
        $this->line("Token: {$token}");
        $this->line("Use this in your Authorization header: Bearer {$token}");

        return 0;
    }
}

View File

@@ -0,0 +1,100 @@
<?php
namespace App\Console\Commands;
use App\Jobs\CleanupOldResults;
use App\Models\CrawlShotJob;
use Carbon\Carbon;
use Illuminate\Console\Command;
/**
 * Artisan command that removes stored crawl/screenshot result files, and the
 * job records that reference them, once they are older than a given age.
 */
class PruneStorage extends Command
{
    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'crawlshot:prune-storage
{--hours=24 : How many hours old files should be to be pruned}
{--dry-run : Show what would be deleted without actually deleting}';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = 'Prune expired crawlshot HTML and image results older than specified hours (default: 24)';

    /**
     * Execute the console command.
     *
     * @return int Command::SUCCESS, or Command::FAILURE when the option is
     *             invalid or at least one file/record could not be removed.
     */
    public function handle()
    {
        // Reject non-numeric or negative values instead of silently casting them
        // to 0 / a future cutoff, which would prune every stored result.
        $hours = filter_var(
            $this->option('hours'),
            FILTER_VALIDATE_INT,
            ['options' => ['min_range' => 0]]
        );

        if ($hours === false) {
            $this->error('The --hours option must be a non-negative integer.');

            return Command::FAILURE;
        }

        $dryRun = (bool) $this->option('dry-run');

        $this->info("Pruning crawlshot storage files older than {$hours} hours...");

        if ($dryRun) {
            $this->warn('DRY RUN MODE - No files will actually be deleted');
        }

        // Find jobs older than specified hours
        $cutoffTime = Carbon::now()->subHours($hours);
        $oldJobs = CrawlShotJob::where('created_at', '<', $cutoffTime)
            ->whereNotNull('file_path')
            ->get();

        if ($oldJobs->isEmpty()) {
            $this->info('No files found to prune.');

            return Command::SUCCESS;
        }

        $this->info("Found {$oldJobs->count()} files to prune:");

        $deletedFiles = 0;
        $deletedRecords = 0;
        $errors = 0;

        foreach ($oldJobs as $job) {
            $this->line("- {$job->type} job {$job->uuid} ({$job->file_path})");

            if ($dryRun) {
                continue;
            }

            if ($job->file_path) {
                $removed = $this->removeResultFile((string) $job->file_path);

                if ($removed === false) {
                    $this->error(" Failed to delete file: {$job->file_path}");
                    $errors++;

                    // Keep the record so a later run can retry; deleting it now
                    // would leave an orphaned file nobody references.
                    continue;
                }

                if ($removed === true) {
                    $deletedFiles++;
                }
            }

            // Delete the database record
            try {
                $job->delete();
                $deletedRecords++;
            } catch (\Exception $e) {
                $this->error(" Failed to delete database record: {$e->getMessage()}");
                $errors++;
            }
        }

        if ($dryRun) {
            $this->info("Would have deleted {$oldJobs->count()} files and records");

            return Command::SUCCESS;
        }

        $this->info("Cleanup completed:");
        $this->line(" - Files deleted: {$deletedFiles}");
        $this->line(" - Database records deleted: {$deletedRecords}");

        if ($errors > 0) {
            $this->error(" - Errors encountered: {$errors}");

            return Command::FAILURE;
        }

        return Command::SUCCESS;
    }

    /**
     * Remove a result file through the Storage facade.
     *
     * Job paths are written with Storage::put() as disk-relative paths, so they
     * must be resolved through the same facade; plain file_exists()/unlink()
     * would look relative to the current working directory instead.
     *
     * @param string $path Disk-relative path stored on the job record.
     *
     * @return bool|null true when the file was removed, null when there was no
     *                   file to remove, false when removal failed.
     */
    private function removeResultFile(string $path): ?bool
    {
        try {
            if (!\Illuminate\Support\Facades\Storage::exists($path)) {
                return null;
            }

            return (bool) \Illuminate\Support\Facades\Storage::delete($path);
        } catch (\Throwable $e) {
            // A throwing storage driver is reported as an ordinary failure.
            return false;
        }
    }
}

View File

@@ -0,0 +1,128 @@
<?php
namespace App\Http\Controllers\Api;
use App\Http\Controllers\Controller;
use App\Models\CrawlShotJob;
use App\Jobs\ProcessCrawlShotJob;
use Illuminate\Http\Request;
use Illuminate\Http\JsonResponse;
use Illuminate\Http\Response;
use Illuminate\Support\Str;
use Illuminate\Support\Facades\Storage;
use Illuminate\Validation\Rule;
/**
 * HTTP API for HTML crawl jobs: enqueue a crawl, poll its status, list jobs and
 * serve the stored HTML result.
 */
class CrawlController extends Controller
{
    /**
     * Validate the request, persist a queued crawl job and dispatch its worker.
     *
     * NOTE(review): the `url` rule only checks URL syntax; nothing here rejects
     * internal or private hosts — confirm whether that is restricted elsewhere.
     *
     * @return JsonResponse 201 with the job uuid and its initial status.
     */
    public function crawl(Request $request): JsonResponse
    {
        $validated = $request->validate([
            'url' => 'required|url|max:2048',
            'timeout' => 'integer|min:5|max:300',
            'delay' => 'integer|min:0|max:30000',
            'block_ads' => 'boolean',
            'block_cookie_banners' => 'boolean',
            'block_trackers' => 'boolean',
            'wait_until_network_idle' => 'boolean',
        ]);

        $uuid = Str::uuid()->toString();

        CrawlShotJob::create([
            'uuid' => $uuid,
            'type' => 'crawl',
            'url' => $validated['url'],
            'status' => 'queued',
            // Stored without array_filter(): filtering would drop explicit
            // `false`/`0` values, and a missing `block_*` key is treated as
            // "enabled" by the browser service, making opt-out impossible.
            'parameters' => [
                'timeout' => (int) ($validated['timeout'] ?? 30),
                'delay' => (int) ($validated['delay'] ?? 0),
                'block_ads' => (bool) ($validated['block_ads'] ?? true),
                'block_cookie_banners' => (bool) ($validated['block_cookie_banners'] ?? true),
                'block_trackers' => (bool) ($validated['block_trackers'] ?? true),
                'wait_until_network_idle' => (bool) ($validated['wait_until_network_idle'] ?? false),
            ],
        ]);

        ProcessCrawlShotJob::dispatch($uuid);

        return response()->json([
            'uuid' => $uuid,
            'status' => 'queued',
            'message' => 'Crawl job initiated successfully',
        ], 201);
    }

    /**
     * Report the state of a crawl job and, once completed, its HTML result.
     *
     * @param string $uuid Public identifier returned by crawl().
     *
     * @return JsonResponse 404 when no crawl job with that uuid exists.
     */
    public function status(string $uuid): JsonResponse
    {
        // Scoped to crawl jobs (as serve() is) so a screenshot uuid can never
        // have its binary image embedded as the "raw" HTML string.
        $job = CrawlShotJob::where('uuid', $uuid)->where('type', 'crawl')->first();

        if (!$job) {
            return response()->json(['error' => 'Job not found'], 404);
        }

        $response = [
            'uuid' => $job->uuid,
            'status' => $job->status,
            'url' => $job->url,
            'created_at' => $job->created_at->toISOString(),
        ];

        if ($job->started_at) {
            $response['started_at'] = $job->started_at->toISOString();
        }

        if ($job->completed_at) {
            $response['completed_at'] = $job->completed_at->toISOString();
        }

        // The result is only attached while the stored file is still present.
        if ($job->status === 'completed' && $job->file_path && Storage::exists($job->file_path)) {
            $response['result'] = [
                'html' => [
                    'url' => url("/api/crawl/{$job->uuid}.html"),
                    'raw' => Storage::get($job->file_path),
                ],
            ];
        }

        if ($job->status === 'failed' && $job->error_message) {
            $response['error'] = $job->error_message;
        }

        return response()->json($response);
    }

    /**
     * List crawl jobs, newest first, 20 per page.
     */
    public function index(): JsonResponse
    {
        $jobs = CrawlShotJob::where('type', 'crawl')
            ->orderBy('created_at', 'desc')
            ->paginate(20);

        return response()->json([
            'jobs' => $jobs->items(),
            'pagination' => [
                'current_page' => $jobs->currentPage(),
                'total_pages' => $jobs->lastPage(),
                'total_items' => $jobs->total(),
                'per_page' => $jobs->perPage(),
            ],
        ]);
    }

    /**
     * Serve the stored HTML of a completed crawl job.
     *
     * NOTE(review): the captured third-party markup is returned as text/html
     * from this application's own origin — confirm that is acceptable.
     *
     * @param string $uuid Public identifier of the crawl job.
     *
     * @return Response 404 (plain text) when the job or its file is unavailable.
     */
    public function serve(string $uuid): Response
    {
        $job = CrawlShotJob::where('uuid', $uuid)->where('type', 'crawl')->first();

        if (!$job || $job->status !== 'completed') {
            return response('HTML file not found or not ready', 404);
        }

        if (!$job->file_path || !Storage::exists($job->file_path)) {
            return response('HTML file not found', 404);
        }

        return response(Storage::get($job->file_path))
            ->header('Content-Type', 'text/html; charset=utf-8');
    }
}

View File

@@ -0,0 +1,151 @@
<?php
namespace App\Http\Controllers\Api;
use App\Http\Controllers\Controller;
use App\Models\CrawlShotJob;
use App\Jobs\ProcessCrawlShotJob;
use Illuminate\Http\Request;
use Illuminate\Http\JsonResponse;
use Illuminate\Http\Response;
use Illuminate\Support\Str;
use Illuminate\Support\Facades\Storage;
/**
 * HTTP API for screenshot jobs: enqueue a screenshot, poll its status, list jobs
 * and serve the stored WebP image.
 */
class ShotController extends Controller
{
    /**
     * Validate the request, persist a queued screenshot job and dispatch its worker.
     *
     * @return JsonResponse 201 with the job uuid and its initial status.
     */
    public function shot(Request $request): JsonResponse
    {
        $validated = $request->validate([
            'url' => 'required|url|max:2048',
            'viewport_width' => 'integer|min:320|max:3840',
            'viewport_height' => 'integer|min:240|max:2160',
            'quality' => 'integer|min:1|max:100',
            'timeout' => 'integer|min:5|max:300',
            'delay' => 'integer|min:0|max:30000',
            'block_ads' => 'boolean',
            'block_cookie_banners' => 'boolean',
            'block_trackers' => 'boolean',
        ]);

        $uuid = Str::uuid()->toString();

        CrawlShotJob::create([
            'uuid' => $uuid,
            'type' => 'shot',
            'url' => $validated['url'],
            'status' => 'queued',
            // Stored without array_filter(): filtering would drop explicit
            // `false`/`0` values, and a missing `block_*` key is treated as
            // "enabled" by the browser service, making opt-out impossible.
            'parameters' => [
                'viewport_width' => (int) ($validated['viewport_width'] ?? 1920),
                'viewport_height' => (int) ($validated['viewport_height'] ?? 1080),
                'format' => 'webp', // Force WebP for all screenshots
                'quality' => (int) ($validated['quality'] ?? 90),
                'timeout' => (int) ($validated['timeout'] ?? 30),
                'delay' => (int) ($validated['delay'] ?? 0),
                'block_ads' => (bool) ($validated['block_ads'] ?? true),
                'block_cookie_banners' => (bool) ($validated['block_cookie_banners'] ?? true),
                'block_trackers' => (bool) ($validated['block_trackers'] ?? true),
            ],
        ]);

        ProcessCrawlShotJob::dispatch($uuid);

        return response()->json([
            'uuid' => $uuid,
            'status' => 'queued',
            'message' => 'Screenshot job initiated successfully',
        ], 201);
    }

    /**
     * Report the state of a screenshot job and, once completed, its image.
     *
     * @param string $uuid Public identifier returned by shot().
     *
     * @return JsonResponse 404 when no screenshot job with that uuid exists.
     */
    public function status(string $uuid): JsonResponse
    {
        // Scoped to screenshot jobs (as serve() is) so a crawl uuid is never
        // reported here with image metadata.
        $job = CrawlShotJob::where('uuid', $uuid)->where('type', 'shot')->first();

        if (!$job) {
            return response()->json(['error' => 'Job not found'], 404);
        }

        $response = [
            'uuid' => $job->uuid,
            'status' => $job->status,
            'url' => $job->url,
            'created_at' => $job->created_at->toISOString(),
        ];

        if ($job->started_at) {
            $response['started_at'] = $job->started_at->toISOString();
        }

        if ($job->completed_at) {
            $response['completed_at'] = $job->completed_at->toISOString();
        }

        // The result is only attached while the stored file is still present, so
        // base64_encode()/strlen() never receive a missing-file value.
        if ($job->status === 'completed' && $job->file_path && Storage::exists($job->file_path)) {
            $imageData = (string) Storage::get($job->file_path);

            $response['result'] = [
                'image' => [
                    'url' => url("/api/shot/{$job->uuid}.webp"),
                    'raw' => base64_encode($imageData),
                ],
                'mime_type' => 'image/webp',
                'format' => 'webp',
                'width' => $job->parameters['viewport_width'] ?? 1920,
                'height' => $job->parameters['viewport_height'] ?? 1080,
                'size' => strlen($imageData), // byte length of the binary image
            ];
        }

        if ($job->status === 'failed' && $job->error_message) {
            $response['error'] = $job->error_message;
        }

        return response()->json($response);
    }

    /**
     * Serve the stored image of a completed screenshot job.
     *
     * @param string $uuid Public identifier of the screenshot job.
     *
     * @return Response 404 (plain text) when the job or its file is unavailable.
     */
    public function serve(string $uuid): Response
    {
        $job = CrawlShotJob::where('uuid', $uuid)->where('type', 'shot')->first();

        if (!$job || $job->status !== 'completed') {
            return response('Screenshot not found or not ready', 404);
        }

        if (!$job->file_path || !Storage::exists($job->file_path)) {
            return response('Screenshot file not found', 404);
        }

        // Always serve as WebP
        return response(Storage::get($job->file_path))
            ->header('Content-Type', 'image/webp');
    }

    /**
     * List screenshot jobs, newest first, 20 per page.
     */
    public function index(): JsonResponse
    {
        $jobs = CrawlShotJob::where('type', 'shot')
            ->orderBy('created_at', 'desc')
            ->paginate(20);

        return response()->json([
            'jobs' => $jobs->items(),
            'pagination' => [
                'current_page' => $jobs->currentPage(),
                'total_pages' => $jobs->lastPage(),
                'total_items' => $jobs->total(),
                'per_page' => $jobs->perPage(),
            ],
        ]);
    }

    /**
     * Map an image format name to its MIME type, defaulting to WebP.
     *
     * Not called by any other method of this class at present.
     */
    private function getMimeType(string $format): string
    {
        $mimeTypes = [
            'jpg' => 'image/jpeg',
            'png' => 'image/png',
            'webp' => 'image/webp',
        ];

        return $mimeTypes[$format] ?? 'image/webp';
    }
}

View File

@@ -0,0 +1,43 @@
<?php
namespace App\Jobs;
use App\Models\CrawlShotJob;
use Illuminate\Bus\Queueable;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Bus\Dispatchable;
use Illuminate\Queue\InteractsWithQueue;
use Illuminate\Queue\SerializesModels;
use Illuminate\Support\Facades\Storage;
use Illuminate\Support\Facades\Log;
/**
 * Queued job that deletes crawl/screenshot jobs created more than 24 hours ago,
 * together with their stored result files.
 */
class CleanupOldResults implements ShouldQueue
{
    use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;

    /**
     * Remove expired result files and job records, then log a summary.
     */
    public function handle(): void
    {
        $cutoffTime = now()->subHours(24);
        $oldJobs = CrawlShotJob::where('created_at', '<', $cutoffTime)->get();

        $deletedFiles = 0;
        $deletedJobs = 0;
        $failedFiles = 0;

        foreach ($oldJobs as $job) {
            if ($job->file_path && Storage::exists($job->file_path)) {
                if (!Storage::delete($job->file_path)) {
                    // Keep the record so the next run retries; deleting it would
                    // leave a file on disk that nothing references any more.
                    $failedFiles++;
                    Log::warning('Cleanup could not delete result file', [
                        'uuid' => $job->uuid,
                        'file_path' => $job->file_path,
                    ]);

                    continue;
                }

                $deletedFiles++;
            }

            $job->delete();
            $deletedJobs++;
        }

        Log::info("Cleanup completed", [
            'deleted_files' => $deletedFiles,
            'deleted_jobs' => $deletedJobs,
            'failed_files' => $failedFiles,
            'cutoff_time' => $cutoffTime->toISOString(),
        ]);
    }
}

View File

@@ -0,0 +1,88 @@
<?php
namespace App\Jobs;
use App\Models\CrawlShotJob;
use App\Services\BrowsershotService;
use Illuminate\Bus\Queueable;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Bus\Dispatchable;
use Illuminate\Queue\InteractsWithQueue;
use Illuminate\Queue\SerializesModels;
use Illuminate\Support\Facades\Log;
use Illuminate\Support\Facades\Storage;
/**
 * Queued worker that executes one crawl or screenshot job and stores its result.
 */
class ProcessCrawlShotJob implements ShouldQueue
{
    use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;

    /** UUID of the CrawlShotJob record this worker processes. */
    protected string $jobUuid;

    /**
     * @param string $jobUuid UUID of the CrawlShotJob record to process.
     */
    public function __construct(string $jobUuid)
    {
        $this->jobUuid = $jobUuid;
    }

    /**
     * Run the job: mark it processing, render the page, store the output and
     * mark it completed. Any failure is recorded on the job as `failed`.
     */
    public function handle(): void
    {
        $job = CrawlShotJob::where('uuid', $this->jobUuid)->first();

        if (!$job) {
            Log::error("CrawlShotJob not found: {$this->jobUuid}");

            return;
        }

        try {
            $job->update([
                'status' => 'processing',
                'started_at' => now(),
            ]);

            $browsershot = new BrowsershotService();
            $parameters = $job->parameters ?? [];

            if ($job->type === 'crawl') {
                $result = $browsershot->crawlHtml($job->url, $parameters);
                $this->saveCrawlResult($job, $result);
            } elseif ($job->type === 'shot') {
                $result = $browsershot->takeScreenshot($job->url, $parameters);
                $this->saveScreenshotResult($job, $result);
            } else {
                // An unknown type must not fall through and be marked completed
                // without any result file.
                throw new \InvalidArgumentException("Unsupported job type: {$job->type}");
            }

            $job->update([
                'status' => 'completed',
                'completed_at' => now(),
            ]);
        } catch (\Throwable $e) {
            // \Throwable rather than \Exception so PHP errors (e.g. TypeError)
            // also mark the job failed instead of leaving it in "processing".
            Log::error("Job {$this->jobUuid} failed: " . $e->getMessage());

            $job->update([
                'status' => 'failed',
                'error_message' => $e->getMessage(),
                'completed_at' => now(),
            ]);
        }
    }

    /**
     * Store crawled HTML under crawlshot/html and record the path on the job.
     *
     * @throws \RuntimeException When the storage disk reports a failed write.
     */
    private function saveCrawlResult(CrawlShotJob $job, string $html): void
    {
        $path = "crawlshot/html/{$job->uuid}.html";

        if (!Storage::put($path, $html)) {
            throw new \RuntimeException("Unable to store crawl result at {$path}");
        }

        $job->update(['file_path' => $path]);
    }

    /**
     * Store screenshot bytes under crawlshot/images and record the path on the job.
     *
     * @param array $result Screenshot payload; only the `data` key is read here.
     *
     * @throws \RuntimeException When the storage disk reports a failed write.
     */
    private function saveScreenshotResult(CrawlShotJob $job, array $result): void
    {
        $parameters = $job->parameters ?? [];
        // Falls back to webp because BrowsershotService::takeScreenshot() always
        // renders WebP; a `jpg` extension would mislabel the stored bytes.
        $format = $parameters['format'] ?? 'webp';
        $path = "crawlshot/images/{$job->uuid}.{$format}";

        if (!Storage::put($path, $result['data'])) {
            throw new \RuntimeException("Unable to store screenshot at {$path}");
        }

        $job->update(['file_path' => $path]);
    }
}

View File

@@ -0,0 +1,34 @@
<?php
namespace App\Models;
use Illuminate\Database\Eloquent\Factories\HasFactory;
use Illuminate\Database\Eloquent\Model;
/**
 * Eloquent model for a single crawl or screenshot job.
 */
class CrawlShotJob extends Model
{
    use HasFactory;

    /**
     * Attributes that may be mass assigned.
     */
    protected $fillable = [
        'uuid',
        'type',
        'url',
        'status',
        'parameters',
        'file_path',
        'error_message',
        'started_at',
        'completed_at',
    ];

    /**
     * Attribute casts: `parameters` is exposed as an array and the two
     * lifecycle timestamps as datetime instances.
     */
    protected $casts = [
        'parameters' => 'array',
        'started_at' => 'datetime',
        'completed_at' => 'datetime',
    ];

    /**
     * Use the uuid column as the route key.
     */
    public function getRouteKeyName()
    {
        return 'uuid';
    }
}

View File

@@ -6,11 +6,12 @@
use Illuminate\Database\Eloquent\Factories\HasFactory;
use Illuminate\Foundation\Auth\User as Authenticatable;
use Illuminate\Notifications\Notifiable;
use Laravel\Sanctum\HasApiTokens;
class User extends Authenticatable
{
/** @use HasFactory<\Database\Factories\UserFactory> */
use HasFactory, Notifiable;
use HasFactory, Notifiable, HasApiTokens;
/**
* The attributes that are mass assignable.

View File

@@ -0,0 +1,36 @@
<?php
namespace App\Providers;
use Illuminate\Support\Facades\Gate;
use Laravel\Horizon\Horizon;
use Laravel\Horizon\HorizonApplicationServiceProvider;
/**
 * Application-level configuration for the Horizon dashboard.
 */
class HorizonServiceProvider extends HorizonApplicationServiceProvider
{
    /**
     * Bootstrap any application services.
     */
    public function boot(): void
    {
        parent::boot();

        // Horizon::routeSmsNotificationsTo('15556667777');
        // Horizon::routeMailNotificationsTo('example@example.com');
        // Horizon::routeSlackNotificationsTo('slack-webhook-url', '#channel');
    }

    /**
     * Register the Horizon gate.
     *
     * This gate determines who can access Horizon in non-local environments.
     */
    protected function gate(): void
    {
        // Email addresses allowed through the gate; the list is currently empty,
        // so the check below never passes.
        $allowedEmails = [
            //
        ];

        Gate::define('viewHorizon', function ($user = null) use ($allowedEmails) {
            $email = optional($user)->email;

            return in_array($email, $allowedEmails);
        });
    }
}

View File

@@ -0,0 +1,76 @@
<?php
namespace App\Services;
use Spatie\Browsershot\Browsershot;
/**
 * Thin wrapper around Spatie Browsershot used to fetch rendered HTML and to
 * capture WebP screenshots with optional request blocking.
 */
class BrowsershotService
{
    /**
     * Render the page and return its body HTML.
     *
     * @param string $url     Page to load.
     * @param array  $options Optional keys read: timeout, delay,
     *                        wait_until_network_idle, block_ads, block_trackers.
     */
    public function crawlHtml(string $url, array $options = []): string
    {
        return $this->configureBrowsershot($url, $options)->bodyHtml();
    }

    /**
     * Capture a WebP screenshot of the page.
     *
     * @param string $url     Page to load.
     * @param array  $options Same keys as crawlHtml() plus viewport_width
     *                        (default 1920), viewport_height (default 1080)
     *                        and quality (default 90).
     *
     * @return array{data: string, mime_type: string, width: mixed, height: mixed}
     *
     * @throws \RuntimeException When the rendered file cannot be read back.
     */
    public function takeScreenshot(string $url, array $options = []): array
    {
        $browsershot = $this->configureBrowsershot($url, $options);

        // Configure viewport for screenshots
        $width = $options['viewport_width'] ?? 1920;
        $height = $options['viewport_height'] ?? 1080;
        $browsershot->windowSize($width, $height);

        // Always use WebP format
        $quality = $options['quality'] ?? 90;
        $browsershot->setScreenshotType('webp', $quality);

        // A random suffix instead of time(): two screenshots started within the
        // same second would otherwise write to, read from and unlink one file.
        $tempPath = storage_path('temp_screenshot_webp.' . bin2hex(random_bytes(8)) . '.webp');

        try {
            $browsershot->save($tempPath);
            $imageData = file_get_contents($tempPath);
        } finally {
            // Clean up even when rendering or reading throws.
            if (file_exists($tempPath)) {
                unlink($tempPath);
            }
        }

        if ($imageData === false) {
            throw new \RuntimeException("Unable to read screenshot file: {$tempPath}");
        }

        return [
            'data' => $imageData,
            'mime_type' => 'image/webp',
            'width' => $width,
            'height' => $height,
        ];
    }

    /**
     * Build a Browsershot instance with the shared timing and blocking options.
     *
     * The block lists are applied when either `block_ads` or `block_trackers` is
     * enabled (both default to enabled when absent).
     * NOTE(review): `block_cookie_banners` is not read anywhere in this class.
     */
    private function configureBrowsershot(string $url, array $options = []): Browsershot
    {
        $browsershot = Browsershot::url($url);

        // Basic configuration
        if (isset($options['timeout'])) {
            $browsershot->timeout($options['timeout']);
        }

        if (isset($options['delay'])) {
            $browsershot->setDelay($options['delay']);
        }

        if (!empty($options['wait_until_network_idle'])) {
            $browsershot->waitUntilNetworkIdle();
        }

        // Apply ad/tracker blocking
        if (($options['block_ads'] ?? true) || ($options['block_trackers'] ?? true)) {
            $easyListService = new EasyListService();
            $blockedDomains = $easyListService->getBlockedDomains($url);
            $blockedUrls = $easyListService->getBlockedUrls($url);

            if (!empty($blockedDomains)) {
                $browsershot->blockDomains($blockedDomains);
            }

            if (!empty($blockedUrls)) {
                $browsershot->blockUrls($blockedUrls);
            }
        }

        return $browsershot;
    }
}

View File

@@ -0,0 +1,100 @@
<?php
namespace App\Services;
use Illuminate\Support\Facades\Http;
use Illuminate\Support\Facades\Cache;
use Illuminate\Support\Facades\Log;
/**
 * Derives small domain and URL block lists from the public EasyList filter file,
 * falling back to a short built-in list when the download fails.
 */
class EasyListService
{
    private const EASYLIST_URL = 'https://easylist.to/easylist/easylist.txt';
    private const CACHE_KEY = 'easylist_filters';
    private const CACHE_TTL = 86400; // 24 hours
    private const FALLBACK_TTL = 300; // 5 minutes: retry soon after a failed download
    private const MAX_DOMAINS = 100;
    private const MAX_URL_PATTERNS = 50;

    /**
     * Return up to 100 host names taken from `||host^` filter rules.
     *
     * @param string $url Currently unused; kept for interface compatibility.
     *
     * @return string[]
     */
    public function getBlockedDomains(string $url): array
    {
        return $this->extractDomains($this->getFilters());
    }

    /**
     * Return up to 50 URL fragments taken from wildcard filter rules.
     *
     * @param string $url Currently unused; kept for interface compatibility.
     *
     * @return string[]
     */
    public function getBlockedUrls(string $url): array
    {
        return $this->extractUrlPatterns($this->getFilters());
    }

    /**
     * Collect unique, syntactically valid host names from `||host^` rules.
     *
     * @param string[] $filters
     *
     * @return string[]
     */
    private function extractDomains(array $filters): array
    {
        $domains = [];
        $seen = [];

        foreach ($filters as $filter) {
            if (strpos($filter, '||') !== 0 || strpos($filter, '^') === false) {
                continue;
            }

            $domain = trim(str_replace(['||', '^'], '', $filter));

            if (isset($seen[$domain]) || !$this->isValidDomain($domain)) {
                continue;
            }

            $seen[$domain] = true;
            $domains[] = $domain;

            // Stop scanning the (large) list once the cap is reached.
            if (count($domains) >= self::MAX_DOMAINS) {
                break;
            }
        }

        return $domains;
    }

    /**
     * Collect unique URL fragments from wildcard rules with the `*` removed.
     *
     * Domain-anchored (`||`), comment (`!`), exception (`@@`) and cosmetic
     * (`#…`, `##`, `#@#`) rules are skipped: they are not URL block patterns.
     *
     * @param string[] $filters
     *
     * @return string[]
     */
    private function extractUrlPatterns(array $filters): array
    {
        $patterns = [];
        $seen = [];

        foreach ($filters as $filter) {
            $cleanFilter = trim($filter);

            if (
                strpos($cleanFilter, '||') === 0
                || strpos($cleanFilter, '#') === 0
                || strpos($cleanFilter, '!') === 0
                || strpos($cleanFilter, '@@') === 0
                || strpos($cleanFilter, '##') !== false
                || strpos($cleanFilter, '#@#') !== false
            ) {
                continue;
            }

            if (strlen($cleanFilter) <= 3 || strpos($cleanFilter, '*') === false) {
                continue;
            }

            $pattern = str_replace('*', '', $cleanFilter);

            // An empty fragment would substring-match every request URL.
            if ($pattern === '' || isset($seen[$pattern])) {
                continue;
            }

            $seen[$pattern] = true;
            $patterns[] = $pattern;

            if (count($patterns) >= self::MAX_URL_PATTERNS) {
                break;
            }
        }

        return $patterns;
    }

    /**
     * Return the filter lines, using the cache when possible.
     *
     * A successful download is cached for 24 hours. The fallback list is cached
     * only briefly so a transient outage does not pin it for a whole day.
     *
     * @return string[]
     */
    private function getFilters(): array
    {
        $cached = Cache::get(self::CACHE_KEY);

        if (is_array($cached) && $cached !== []) {
            return $cached;
        }

        $filters = $this->fetchFilters();

        if ($filters !== null) {
            Cache::put(self::CACHE_KEY, $filters, self::CACHE_TTL);

            return $filters;
        }

        $fallback = $this->getFallbackFilters();
        Cache::put(self::CACHE_KEY, $fallback, self::FALLBACK_TTL);

        return $fallback;
    }

    /**
     * Download EasyList and return its non-empty, non-comment lines.
     *
     * @return string[]|null null when the download failed or yielded no rules.
     */
    private function fetchFilters(): ?array
    {
        try {
            $response = Http::timeout(30)->get(self::EASYLIST_URL);

            if (!$response->successful()) {
                Log::warning('Failed to fetch EasyList filters');

                return null;
            }

            $filters = [];

            foreach (explode("\n", $response->body()) as $line) {
                $line = trim($line);

                if ($line !== '' && strpos($line, '!') !== 0) {
                    $filters[] = $line;
                }
            }

            if ($filters === []) {
                Log::warning('Failed to fetch EasyList filters');

                return null;
            }

            Log::info('EasyList filters updated', ['count' => count($filters)]);

            return $filters;
        } catch (\Exception $e) {
            Log::error('Error fetching EasyList filters: ' . $e->getMessage());

            return null;
        }
    }

    /**
     * Built-in rules used when EasyList cannot be downloaded.
     *
     * @return string[]
     */
    private function getFallbackFilters(): array
    {
        return [
            '||googletagmanager.com^',
            '||google-analytics.com^',
            '||facebook.com/tr^',
            '||doubleclick.net^',
            '||googlesyndication.com^',
            '||amazon-adsystem.com^',
            '||adsystem.amazon.com^',
            '||googleadservices.com^',
        ];
    }

    /**
     * Whether the value is a syntactically valid host name.
     *
     * FILTER_FLAG_HOSTNAME is required: without it FILTER_VALIDATE_DOMAIN only
     * checks label lengths and accepts values containing `/` or `$`.
     */
    private function isValidDomain(string $domain): bool
    {
        return filter_var($domain, FILTER_VALIDATE_DOMAIN, FILTER_FLAG_HOSTNAME) !== false;
    }
}