Update
This commit is contained in:
@@ -1,6 +1,23 @@
|
||||
# Crawlshot API Documentation
|
||||
|
||||
Crawlshot is a self-hosted web crawling and screenshot service built with Laravel and Spatie Browsershot. This API provides endpoints for capturing web content and generating screenshots with advanced filtering capabilities.
|
||||
Crawlshot is a self-hosted web crawling and screenshot service built with Laravel and Spatie Browsershot. This comprehensive API provides endpoints for capturing web content and generating screenshots with advanced filtering capabilities, webhook notifications, and intelligent retry mechanisms.
|
||||
|
||||
## Overview
|
||||
|
||||
**Core Capabilities:**
|
||||
- **HTML Crawling**: Extract clean HTML content from web pages with ad/tracker blocking
|
||||
- **Screenshot Capture**: Generate high-quality WebP screenshots with adjustable quality settings
|
||||
- **Webhook Notifications**: Real-time status updates with event filtering and progressive retry
|
||||
- **Background Processing**: Asynchronous job processing via Laravel Horizon
|
||||
- **Smart Filtering**: EasyList integration for ad/tracker/cookie banner blocking
|
||||
- **Auto-cleanup**: 24-hour file retention with automated cleanup
|
||||
|
||||
**Perfect for:**
|
||||
- Content extraction and monitoring
|
||||
- Website screenshot automation
|
||||
- Quality assurance and testing
|
||||
- Social media preview generation
|
||||
- Compliance and archival systems
|
||||
|
||||
## Base URL
|
||||
|
||||
@@ -8,21 +25,50 @@ ## Base URL
|
||||
https://crawlshot.test
|
||||
```
|
||||
|
||||
## Authentication
|
||||
Replace `crawlshot.test` with your actual Crawlshot service URL.
|
||||
|
||||
## Quick Start
|
||||
|
||||
### 1. Authentication
|
||||
|
||||
All API endpoints (except health check) require authentication using Laravel Sanctum API tokens.
|
||||
|
||||
### Authentication Header
|
||||
|
||||
**Authentication Header:**
|
||||
```http
|
||||
Authorization: Bearer {your-api-token}
|
||||
```
|
||||
|
||||
### Example API Token
|
||||
**Example API Token:**
|
||||
```
|
||||
1|rrWUM5ZkmLfGipkm1oIusYX45KbukIekUwMjgB3Nd1121a5c
|
||||
```
|
||||
|
||||
### 2. Your First API Call
|
||||
|
||||
**Simple HTML Crawl:**
|
||||
```bash
|
||||
curl -X POST "https://crawlshot.test/api/crawl" \
|
||||
-H "Authorization: Bearer YOUR_TOKEN" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"url": "https://example.com"}'
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"uuid": "b5dc483b-f62d-4e40-8b9e-4715324a8cbb",
|
||||
"status": "queued",
|
||||
"message": "Crawl job initiated successfully"
|
||||
}
|
||||
```
|
||||
|
||||
### 3. Check Job Status
|
||||
|
||||
```bash
|
||||
curl -H "Authorization: Bearer YOUR_TOKEN" \
|
||||
"https://crawlshot.test/api/crawl/b5dc483b-f62d-4e40-8b9e-4715324a8cbb"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Health Check
|
||||
@@ -70,10 +116,12 @@ #### Request Parameters
|
||||
| `block_ads` | boolean | ❌ | true | Block ads using EasyList filters |
|
||||
| `block_cookie_banners` | boolean | ❌ | true | Block cookie consent banners |
|
||||
| `block_trackers` | boolean | ❌ | true | Block tracking scripts |
|
||||
| `wait_until_network_idle` | boolean | ❌ | false | Wait for network activity to cease |
|
||||
| `webhook_url` | string | ❌ | null | URL to receive job status webhooks (max 2048 chars) |
|
||||
| `webhook_events_filter` | array | ❌ | `["queued","processing","completed","failed"]` | Which job statuses trigger webhooks. Empty array `[]` disables webhooks |
|
||||
|
||||
#### Request Example
|
||||
#### Request Examples
|
||||
|
||||
**Basic Crawl:**
|
||||
```bash
|
||||
curl -X POST "https://crawlshot.test/api/crawl" \
|
||||
-H "Accept: application/json" \
|
||||
@@ -85,8 +133,22 @@ #### Request Example
|
||||
"delay": 2000,
|
||||
"block_ads": true,
|
||||
"block_cookie_banners": true,
|
||||
"block_trackers": true,
|
||||
"wait_until_network_idle": true
|
||||
"block_trackers": true
|
||||
}'
|
||||
```
|
||||
|
||||
**With Webhook Notifications:**
|
||||
```bash
|
||||
curl -X POST "https://crawlshot.test/api/crawl" \
|
||||
-H "Accept: application/json" \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer 1|rrWUM5ZkmLfGipkm1oIusYX45KbukIekUwMjgB3Nd1121a5c" \
|
||||
-d '{
|
||||
"url": "https://example.com",
|
||||
"webhook_url": "https://myapp.com/webhooks/crawlshot",
|
||||
"webhook_events_filter": ["completed", "failed"],
|
||||
"block_ads": true,
|
||||
"timeout": 60
|
||||
}'
|
||||
```
|
||||
|
||||
@@ -227,16 +289,18 @@ #### Request Parameters
|
||||
| `url` | string | ✅ | - | Target URL to screenshot (max 2048 chars) |
|
||||
| `viewport_width` | integer | ❌ | 1920 | Viewport width in pixels (320-3840) |
|
||||
| `viewport_height` | integer | ❌ | 1080 | Viewport height in pixels (240-2160) |
|
||||
| `format` | string | ❌ | "jpg" | Image format: "jpg", "png", "webp" |
|
||||
| `quality` | integer | ❌ | 90 | Image quality 1-100 (for JPEG/WebP) |
|
||||
| `quality` | integer | ❌ | 90 | WebP image quality, 1-100 (screenshots are always saved as WebP) |
|
||||
| `timeout` | integer | ❌ | 30 | Request timeout in seconds (5-300) |
|
||||
| `delay` | integer | ❌ | 0 | Wait time before capture in milliseconds (0-30000) |
|
||||
| `block_ads` | boolean | ❌ | true | Block ads using EasyList filters |
|
||||
| `block_cookie_banners` | boolean | ❌ | true | Block cookie consent banners |
|
||||
| `block_trackers` | boolean | ❌ | true | Block tracking scripts |
|
||||
| `webhook_url` | string | ❌ | null | URL to receive job status webhooks (max 2048 chars) |
|
||||
| `webhook_events_filter` | array | ❌ | `["queued","processing","completed","failed"]` | Which job statuses trigger webhooks. Empty array `[]` disables webhooks |
|
||||
|
||||
#### Request Example
|
||||
#### Request Examples
|
||||
|
||||
**Basic Screenshot:**
|
||||
```bash
|
||||
curl -X POST "https://crawlshot.test/api/shot" \
|
||||
-H "Accept: application/json" \
|
||||
@@ -246,7 +310,6 @@ #### Request Example
|
||||
"url": "https://example.com",
|
||||
"viewport_width": 1920,
|
||||
"viewport_height": 1080,
|
||||
"format": "webp",
|
||||
"quality": 90,
|
||||
"timeout": 30,
|
||||
"delay": 2000,
|
||||
@@ -256,6 +319,22 @@ #### Request Example
|
||||
}'
|
||||
```
|
||||
|
||||
**With Webhook Notifications:**
|
||||
```bash
|
||||
curl -X POST "https://crawlshot.test/api/shot" \
|
||||
-H "Accept: application/json" \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer 1|rrWUM5ZkmLfGipkm1oIusYX45KbukIekUwMjgB3Nd1121a5c" \
|
||||
-d '{
|
||||
"url": "https://example.com",
|
||||
"webhook_url": "https://myapp.com/webhooks/crawlshot",
|
||||
"webhook_events_filter": ["completed"],
|
||||
"viewport_width": 1200,
|
||||
"viewport_height": 800,
|
||||
"quality": 85
|
||||
}'
|
||||
```
|
||||
|
||||
#### Response Example
|
||||
|
||||
```json
|
||||
@@ -369,10 +448,8 @@ #### Request Example
|
||||
|
||||
#### Response
|
||||
|
||||
Returns the image file directly with appropriate `Content-Type` headers:
|
||||
- `Content-Type: image/jpeg` for JPEG files
|
||||
- `Content-Type: image/png` for PNG files
|
||||
- `Content-Type: image/webp` for WebP files
|
||||
Returns the WebP image file directly with appropriate headers:
|
||||
- `Content-Type: image/webp`
|
||||
|
||||
---
|
||||
|
||||
@@ -415,6 +492,217 @@ #### Response Example
|
||||
|
||||
---
|
||||
|
||||
## Webhook System
|
||||
|
||||
Crawlshot supports real-time webhook notifications to keep your application informed about job status changes without constant polling.
|
||||
|
||||
### How Webhooks Work
|
||||
|
||||
1. **Configure Webhook**: Include `webhook_url` when creating jobs
|
||||
2. **Filter Events**: Use `webhook_events_filter` to specify which status changes trigger webhooks
|
||||
3. **Receive Notifications**: Your endpoint receives HTTP POST requests with job status data
|
||||
4. **Automatic Retries**: Failed webhooks are automatically retried with progressive backoff
|
||||
|
||||
### Event Filtering
|
||||
|
||||
Control which job status changes trigger webhook calls:
|
||||
|
||||
```json
|
||||
{
|
||||
"webhook_events_filter": ["completed", "failed"]
|
||||
}
|
||||
```
|
||||
|
||||
**Available Events:**
|
||||
- `queued` - Job created and queued for processing
|
||||
- `processing` - Job started processing
|
||||
- `completed` - Job finished successfully
|
||||
- `failed` - Job encountered an error
|
||||
|
||||
**Special Behaviors:**
|
||||
- **Default**: `["queued", "processing", "completed", "failed"]` (all events)
|
||||
- **Disable**: `[]` (empty array disables webhooks entirely)
|
||||
- **Omitted**: Same as default (all events)
|
||||
|
||||
### Webhook Payload
|
||||
|
||||
Webhooks send the **exact same payload** as the status endpoints (`GET /api/crawl/{uuid}` or `GET /api/shot/{uuid}`), ensuring consistency.
|
||||
|
||||
**Crawl Webhook Example:**
|
||||
```json
|
||||
{
|
||||
"uuid": "b5dc483b-f62d-4e40-8b9e-4715324a8cbb",
|
||||
"status": "completed",
|
||||
"url": "https://example.com",
|
||||
"created_at": "2025-08-10T10:00:42.000000Z",
|
||||
"started_at": "2025-08-10T10:00:45.000000Z",
|
||||
"completed_at": "2025-08-10T10:01:12.000000Z",
|
||||
"result": {
|
||||
"html": {
|
||||
"url": "https://crawlshot.test/api/crawl/b5dc483b-f62d-4e40-8b9e-4715324a8cbb.html",
|
||||
"raw": "<!doctype html>\n<html>..."
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Screenshot Webhook Example:**
|
||||
```json
|
||||
{
|
||||
"uuid": "fe37d511-99cb-4295-853b-6d484900a851",
|
||||
"status": "completed",
|
||||
"url": "https://example.com",
|
||||
"created_at": "2025-08-10T10:05:42.000000Z",
|
||||
"started_at": "2025-08-10T10:05:45.000000Z",
|
||||
"completed_at": "2025-08-10T10:06:12.000000Z",
|
||||
"result": {
|
||||
"image": {
|
||||
"url": "https://crawlshot.test/api/shot/fe37d511-99cb-4295-853b-6d484900a851.webp",
|
||||
"raw": "iVBORw0KGgoAAAANSUhEUgAAAHg..."
|
||||
},
|
||||
"mime_type": "image/webp",
|
||||
"format": "webp",
|
||||
"width": 1920,
|
||||
"height": 1080,
|
||||
"size": 45678
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Progressive Retry System
|
||||
|
||||
Failed webhook deliveries are automatically retried with exponential backoff:
|
||||
|
||||
- **1st retry**: 1 minute after failure
|
||||
- **2nd retry**: 2 minutes after failure
|
||||
- **3rd retry**: 4 minutes after failure
|
||||
- **4th retry**: 8 minutes after failure
|
||||
- **5th retry**: 16 minutes after failure
|
||||
- **6th retry**: 32 minutes after failure
|
||||
- **After 6 failures**: Stops retrying, webhook marked as failed
|
||||
|
||||
**Total retry window**: ~63 minutes (1+2+4+8+16+32), because each delay is counted from the previous failed attempt
|
||||
|
||||
### Webhook Requirements
|
||||
|
||||
**Your webhook endpoint should:**
|
||||
- Accept HTTP POST requests
|
||||
- Return HTTP 2xx status codes for successful processing
|
||||
- Respond within 5 seconds (webhook timeout)
|
||||
- Handle duplicate deliveries gracefully (use job UUID for idempotency)
|
||||
|
||||
**Example webhook handler (PHP):**
|
||||
```php
|
||||
Route::post('/webhooks/crawlshot', function (Request $request) {
|
||||
$jobData = $request->all();
|
||||
|
||||
// Process the job status update
|
||||
if ($jobData['status'] === 'completed') {
|
||||
// Handle successful completion
|
||||
$result = $jobData['result'];
|
||||
} elseif ($jobData['status'] === 'failed') {
|
||||
// Handle failure
|
||||
$error = $jobData['error'];
|
||||
}
|
||||
|
||||
return response('OK', 200);
|
||||
});
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Webhook Error Management
|
||||
|
||||
When webhooks fail, you can manage them through dedicated endpoints.
|
||||
|
||||
### GET `/api/webhook-errors`
|
||||
|
||||
List all jobs with failed webhook deliveries.
|
||||
|
||||
**Authentication:** Required
|
||||
|
||||
#### Request Example
|
||||
|
||||
```bash
|
||||
curl -X GET "https://crawlshot.test/api/webhook-errors" \
|
||||
-H "Accept: application/json" \
|
||||
-H "Authorization: Bearer 1|rrWUM5ZkmLfGipkm1oIusYX45KbukIekUwMjgB3Nd1121a5c"
|
||||
```
|
||||
|
||||
#### Response Example
|
||||
|
||||
```json
|
||||
{
|
||||
"jobs": [
|
||||
{
|
||||
"uuid": "b5dc483b-f62d-4e40-8b9e-4715324a8cbb",
|
||||
"type": "crawl",
|
||||
"url": "https://example.com",
|
||||
"status": "completed",
|
||||
"webhook_url": "https://myapp.com/webhook",
|
||||
"webhook_attempts": 6,
|
||||
"webhook_last_error": "Connection timeout",
|
||||
"webhook_next_retry_at": null,
|
||||
"created_at": "2025-08-10T10:00:42.000000Z"
|
||||
}
|
||||
],
|
||||
"pagination": {
|
||||
"current_page": 1,
|
||||
"total_pages": 1,
|
||||
"total_items": 1,
|
||||
"per_page": 20
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### POST `/api/webhook-errors/{uuid}/retry`
|
||||
|
||||
Manually retry a failed webhook immediately.
|
||||
|
||||
**Authentication:** Required
|
||||
|
||||
#### Request Example
|
||||
|
||||
```bash
|
||||
curl -X POST "https://crawlshot.test/api/webhook-errors/b5dc483b-f62d-4e40-8b9e-4715324a8cbb/retry" \
|
||||
-H "Accept: application/json" \
|
||||
-H "Authorization: Bearer 1|rrWUM5ZkmLfGipkm1oIusYX45KbukIekUwMjgB3Nd1121a5c"
|
||||
```
|
||||
|
||||
#### Response Example
|
||||
|
||||
```json
|
||||
{
|
||||
"uuid": "b5dc483b-f62d-4e40-8b9e-4715324a8cbb",
|
||||
"message": "Webhook retry attempted"
|
||||
}
|
||||
```
|
||||
|
||||
### DELETE `/api/webhook-errors/{uuid}/clear`
|
||||
|
||||
Clear webhook error status without retrying.
|
||||
|
||||
**Authentication:** Required
|
||||
|
||||
#### Request Example
|
||||
|
||||
```bash
|
||||
curl -X DELETE "https://crawlshot.test/api/webhook-errors/b5dc483b-f62d-4e40-8b9e-4715324a8cbb/clear" \
|
||||
-H "Accept: application/json" \
|
||||
-H "Authorization: Bearer 1|rrWUM5ZkmLfGipkm1oIusYX45KbukIekUwMjgB3Nd1121a5c"
|
||||
```
|
||||
|
||||
#### Response Example
|
||||
|
||||
```json
|
||||
{
|
||||
"uuid": "b5dc483b-f62d-4e40-8b9e-4715324a8cbb",
|
||||
"message": "Webhook error cleared"
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Job Status Flow
|
||||
|
||||
Both crawl and screenshot jobs follow the same status progression:
|
||||
@@ -464,9 +752,9 @@ ### Ad & Tracker Blocking
|
||||
- **Custom Domain Blocking**: Blocks common advertising and tracking domains
|
||||
|
||||
### Image Processing
|
||||
- **Multiple Formats**: Support for JPEG, PNG, and WebP
|
||||
- **WebP Format**: High-quality WebP screenshots with optimizable compression
|
||||
- **Quality Control**: Adjustable compression quality (1-100)
|
||||
- **Imagick Integration**: High-quality image processing and format conversion
|
||||
- **Efficient Processing**: Optimized WebP encoding for fast delivery
|
||||
- **Responsive Sizing**: Custom viewport dimensions up to 4K resolution
|
||||
|
||||
### Storage & Cleanup
|
||||
|
||||
23
CLAUDE.md
23
CLAUDE.md
@@ -57,8 +57,7 @@ ### Supported Parameters (mapped to Browsershot capabilities)
|
||||
- `url`: Target URL to screenshot
|
||||
- `viewport_width`: Viewport width (via `windowSize()` method)
|
||||
- `viewport_height`: Viewport height (via `windowSize()` method)
|
||||
- `format`: jpg, png, webp (via Imagick post-processing)
|
||||
- `quality`: Image quality 1-100 for JPEG (via `setScreenshotType('jpeg', quality)`)
|
||||
- `quality`: WebP image quality 1-100 (via `setScreenshotType('webp', quality)`)
|
||||
- `block_ads`: true/false - Uses EasyList filter for ad blocking
|
||||
- `block_cookie_banners`: true/false - Uses cookie banner blocking patterns
|
||||
- `block_trackers`: true/false - Uses tracker blocking patterns
|
||||
@@ -164,7 +163,7 @@ ### Directory Structure
|
||||
|
||||
storage/app/crawlshot/ # Temporary result storage (24h TTL)
|
||||
├── html/ # HTML crawl results
|
||||
└── images/ # Screenshot files (JPEG/PNG/WebP)
|
||||
└── images/ # Screenshot files (.webp)
|
||||
|
||||
routes/
|
||||
└── api.php # /crawl endpoints with Sanctum auth
|
||||
@@ -173,10 +172,10 @@ ### Directory Structure
|
||||
### Browsershot Configuration
|
||||
|
||||
```php
|
||||
// Basic screenshot configuration with EasyList ad blocking
|
||||
// Basic screenshot configuration with EasyList ad blocking
|
||||
$browsershot = Browsershot::url($url)
|
||||
->windowSize($width, $height)
|
||||
->setScreenshotType('png') // Save as PNG first for Imagick processing
|
||||
->setScreenshotType('webp', $quality) // Always WebP format
|
||||
->setDelay($delayInMs)
|
||||
->waitUntilNetworkIdle()
|
||||
->timeout($timeoutInSeconds);
|
||||
@@ -188,17 +187,9 @@ ### Browsershot Configuration
|
||||
$browsershot->blockDomains($blockedDomains)->blockUrls($blockedUrls);
|
||||
}
|
||||
|
||||
$tempPath = storage_path('temp_screenshot.png');
|
||||
$tempPath = storage_path('temp_screenshot.webp');
|
||||
$browsershot->save($tempPath);
|
||||
|
||||
// Convert to desired format using Imagick if needed
|
||||
if ($format === 'webp') {
|
||||
$imagick = new Imagick($tempPath);
|
||||
$imagick->setImageFormat('webp');
|
||||
$imagick->writeImage($finalPath);
|
||||
unlink($tempPath);
|
||||
}
|
||||
|
||||
// HTML crawling configuration with EasyList filtering
|
||||
$browsershot = Browsershot::url($url)
|
||||
->setDelay($delayInMs)
|
||||
@@ -225,7 +216,7 @@ ### Job States
|
||||
### Storage Strategy
|
||||
|
||||
- HTML results: `storage/app/crawlshot/html/{uuid}.html`
|
||||
- Image results: `storage/app/crawlshot/images/{uuid}.jpg`, `.png`, or `.webp`
|
||||
- Image results: `storage/app/crawlshot/images/{uuid}.webp` (WebP format only)
|
||||
- Auto-cleanup scheduled job removes files after 24 hours
|
||||
- Database tracks job metadata and file paths
|
||||
|
||||
@@ -238,7 +229,7 @@ ### Authentication & Security
|
||||
|
||||
### System Requirements
|
||||
|
||||
- PHP 8.3+ with extensions: gd, imagick (required for WebP format)
|
||||
- PHP 8.3+ with extensions: gd (WebP support built into Puppeteer)
|
||||
- Node.js and npm for Puppeteer
|
||||
- Chrome/Chromium browser (headless)
|
||||
- Sufficient disk space for temporary file storage
|
||||
|
||||
737
CLIENT_DOCUMENTATION.md
Normal file
737
CLIENT_DOCUMENTATION.md
Normal file
@@ -0,0 +1,737 @@
|
||||
# Crawlshot PHP Client Library Documentation
|
||||
|
||||
The Crawlshot PHP Client Library provides a clean, fluent interface for interacting with Crawlshot API services. Designed specifically for Laravel applications, it offers typed responses, method chaining, and comprehensive webhook support.
|
||||
|
||||
## Installation & Setup
|
||||
|
||||
### 1. Install via Composer
|
||||
|
||||
```bash
|
||||
composer require crawlshot/laravel
|
||||
```
|
||||
|
||||
### 2. Configuration
|
||||
|
||||
**Option A: Direct instantiation**
|
||||
```php
|
||||
use Crawlshot\Laravel\CrawlshotClient;
|
||||
|
||||
$client = new CrawlshotClient('https://crawlshot.test', 'your-api-token');
|
||||
```
|
||||
|
||||
**Option B: Environment variables (recommended)**
|
||||
```php
|
||||
# .env
|
||||
CRAWLSHOT_BASE_URL=https://crawlshot.test
|
||||
CRAWLSHOT_TOKEN=1|rrWUM5ZkmLfGipkm1oIusYX45KbukIekUwMjgB3Nd1121a5c
|
||||
|
||||
# In your code
|
||||
$client = new CrawlshotClient(
|
||||
env('CRAWLSHOT_BASE_URL'),
|
||||
env('CRAWLSHOT_TOKEN')
|
||||
);
|
||||
```
|
||||
|
||||
### 3. Service Provider (Optional)
|
||||
|
||||
For application-wide configuration, create a service provider:
|
||||
|
||||
```php
|
||||
// app/Providers/CrawlshotServiceProvider.php
|
||||
class CrawlshotServiceProvider extends ServiceProvider
|
||||
{
|
||||
public function register()
|
||||
{
|
||||
$this->app->singleton(CrawlshotClient::class, function ($app) {
|
||||
return new CrawlshotClient(
|
||||
config('services.crawlshot.base_url'),
|
||||
config('services.crawlshot.token')
|
||||
);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// config/services.php
|
||||
'crawlshot' => [
|
||||
'base_url' => env('CRAWLSHOT_BASE_URL'),
|
||||
'token' => env('CRAWLSHOT_TOKEN'),
|
||||
],
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Basic Usage
|
||||
|
||||
### Simple HTML Crawling
|
||||
|
||||
```php
|
||||
use Crawlshot\Laravel\CrawlshotClient;
|
||||
|
||||
$client = new CrawlshotClient('https://crawlshot.test', 'your-token');
|
||||
|
||||
// Create crawl job
|
||||
$response = $client->createCrawl('https://example.com');
|
||||
echo "Job UUID: " . $response['uuid']; // Raw array response
|
||||
|
||||
// Check status
|
||||
$status = $client->getCrawlStatus($response['uuid']);
|
||||
echo "Status: " . $status->getStatus(); // Typed response object
|
||||
|
||||
if ($status->isCompleted()) {
|
||||
$html = $status->getResultRaw();
|
||||
echo "HTML content: " . substr($html, 0, 200) . "...";
|
||||
}
|
||||
```
|
||||
|
||||
### Simple Screenshot Capture
|
||||
|
||||
```php
|
||||
// Create screenshot job
|
||||
$response = $client->createShot('https://example.com');
|
||||
|
||||
// Check status
|
||||
$status = $client->getShotStatus($response['uuid']);
|
||||
|
||||
if ($status->isCompleted()) {
|
||||
echo "Format: " . $status->getFormat(); // webp
|
||||
echo "Size: " . implode('x', $status->getDimensions()); // e.g. "Size: 1920x1080"
|
||||
|
||||
// Get image data
|
||||
$imageData = $status->getImageData(); // base64
|
||||
$imageFile = $status->downloadImage(); // binary data
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Fluent Interface
|
||||
|
||||
The client provides a powerful fluent interface for building complex requests with method chaining.
|
||||
|
||||
### Fluent HTML Crawling
|
||||
|
||||
```php
|
||||
$crawl = $client->crawl('https://example.com')
|
||||
->timeout(60)
|
||||
->delay(2000)
|
||||
->blockAds(true)
|
||||
->blockCookieBanners(true)
|
||||
->blockTrackers(true)
|
||||
->waitUntilNetworkIdle(true)
|
||||
->webhookUrl('https://myapp.com/webhooks/crawlshot')
|
||||
->webhookEventsFilter(['completed', 'failed'])
|
||||
->create(); // Returns CrawlResponse
|
||||
|
||||
echo "Job created: " . $crawl->getUuid();
|
||||
echo "Status: " . $crawl->getStatus();
|
||||
|
||||
// Wait for completion
|
||||
while ($crawl->isProcessing() || $crawl->isQueued()) {
|
||||
sleep(2);
|
||||
$crawl->refresh(); // Updates from API
|
||||
}
|
||||
|
||||
if ($crawl->isCompleted()) {
|
||||
$html = $crawl->getResultRaw();
|
||||
file_put_contents('page.html', $html);
|
||||
}
|
||||
```
|
||||
|
||||
### Fluent Screenshot Capture
|
||||
|
||||
```php
|
||||
$screenshot = $client->shot('https://example.com')
|
||||
->viewportSize(1200, 800)
|
||||
->quality(85)
|
||||
->timeout(30)
|
||||
->delay(1000)
|
||||
->blockAds(true)
|
||||
->webhookUrl('https://myapp.com/webhooks/crawlshot')
|
||||
->webhookEventsFilter(['completed'])
|
||||
->create(); // Returns ShotResponse
|
||||
|
||||
echo "Screenshot job: " . $screenshot->getUuid();
|
||||
|
||||
// Poll until complete
|
||||
while (!$screenshot->isCompleted() && !$screenshot->isFailed()) {
|
||||
sleep(3);
|
||||
$screenshot->refresh();
|
||||
}
|
||||
|
||||
if ($screenshot->isCompleted()) {
|
||||
// Save image
|
||||
$imageData = $screenshot->downloadImage();
|
||||
file_put_contents('screenshot.webp', $imageData);
|
||||
|
||||
echo "Saved {$screenshot->getWidth()}x{$screenshot->getHeight()} image";
|
||||
}
|
||||
```
|
||||
|
||||
### Available Fluent Methods
|
||||
|
||||
#### CrawlJobBuilder Methods
|
||||
```php
|
||||
$client->crawl($url)
|
||||
->webhookUrl(string $url) // Webhook notification URL
|
||||
->webhookEventsFilter(array $events) // ['queued', 'processing', 'completed', 'failed']
|
||||
->timeout(int $seconds) // Request timeout (5-300)
|
||||
->delay(int $milliseconds) // Delay before capture (0-30000)
|
||||
->blockAds(bool $block = true) // Block ads via EasyList
|
||||
->blockCookieBanners(bool $block = true) // Block cookie banners
|
||||
->blockTrackers(bool $block = true) // Block tracking scripts
|
||||
->waitUntilNetworkIdle(bool $wait = true) // Wait for network idle
|
||||
->create(); // Execute and return CrawlResponse
|
||||
```
|
||||
|
||||
#### ShotJobBuilder Methods
|
||||
```php
|
||||
$client->shot($url)
|
||||
->webhookUrl(string $url) // Webhook notification URL
|
||||
->webhookEventsFilter(array $events) // ['queued', 'processing', 'completed', 'failed']
|
||||
->viewportSize(int $width, int $height) // Viewport dimensions
|
||||
->quality(int $quality) // Image quality 1-100
|
||||
->timeout(int $seconds) // Request timeout (5-300)
|
||||
->delay(int $milliseconds) // Delay before capture (0-30000)
|
||||
->blockAds(bool $block = true) // Block ads via EasyList
|
||||
->blockCookieBanners(bool $block = true) // Block cookie banners
|
||||
->blockTrackers(bool $block = true) // Block tracking scripts
|
||||
->create(); // Execute and return ShotResponse
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Response Objects
|
||||
|
||||
The client library provides typed response objects that make it easy to work with job results.
|
||||
|
||||
### Common Methods (Both CrawlResponse & ShotResponse)
|
||||
|
||||
```php
|
||||
// Job information
|
||||
$response->getUuid(): string // Job UUID
|
||||
$response->getStatus(): string // queued|processing|completed|failed
|
||||
$response->getUrl(): string // Original URL
|
||||
$response->getCreatedAt(): \DateTime // Job creation time
|
||||
$response->getStartedAt(): ?\DateTime // Processing start time (null if not started)
|
||||
$response->getCompletedAt(): ?\DateTime // Completion time (null if not completed)
|
||||
$response->getError(): ?string // Error message (null if no error)
|
||||
|
||||
// Status checks
|
||||
$response->isQueued(): bool // Job waiting to start
|
||||
$response->isProcessing(): bool // Job currently running
|
||||
$response->isCompleted(): bool // Job finished successfully
|
||||
$response->isFailed(): bool // Job encountered error
|
||||
|
||||
// Utility methods
|
||||
$response->refresh(): static // Refresh from API
|
||||
$response->getRawResponse(): array // Original API response
|
||||
$response->getResult(): ?array // Result data (null if not completed)
|
||||
```
|
||||
|
||||
### CrawlResponse Specific Methods
|
||||
|
||||
```php
|
||||
// HTML content access
|
||||
$crawl->getResultRaw(): ?string // Raw HTML content
|
||||
$crawl->getResultUrl(): ?string // Download URL (/api/crawl/{uuid}.html)
|
||||
$crawl->downloadHtml(): ?string // Direct download HTML content
|
||||
|
||||
// Example usage
|
||||
if ($crawl->isCompleted()) {
|
||||
$html = $crawl->getResultRaw();
|
||||
$downloadUrl = $crawl->getResultUrl();
|
||||
|
||||
// Or download directly
|
||||
$htmlContent = $crawl->downloadHtml();
|
||||
file_put_contents('page.html', $htmlContent);
|
||||
}
|
||||
```
|
||||
|
||||
### ShotResponse Specific Methods
|
||||
|
||||
```php
|
||||
// Image data access
|
||||
$shot->getImageData(): ?string // Base64 encoded image
|
||||
$shot->getImageUrl(): ?string // Download URL (/api/shot/{uuid}.webp)
|
||||
$shot->downloadImage(): ?string // Direct download binary data
|
||||
|
||||
// Image metadata
|
||||
$shot->getMimeType(): ?string // image/webp
|
||||
$shot->getFormat(): ?string // webp
|
||||
$shot->getWidth(): ?int // Image width in pixels
|
||||
$shot->getHeight(): ?int // Image height in pixels
|
||||
$shot->getSize(): ?int // File size in bytes
|
||||
$shot->getDimensions(): ?array // [width, height] or null
|
||||
|
||||
// Example usage
|
||||
if ($shot->isCompleted()) {
|
||||
$imageData = $shot->getImageData(); // Base64
|
||||
$imageBinary = $shot->downloadImage(); // Binary
|
||||
$dimensions = $shot->getDimensions(); // [1920, 1080]
|
||||
|
||||
echo "Format: {$shot->getFormat()}"; // webp
|
||||
echo "Size: {$dimensions[0]}x{$dimensions[1]}"; // 1920x1080
|
||||
echo "File size: {$shot->getSize()} bytes"; // 45678 bytes
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Webhook Integration
|
||||
|
||||
Webhooks provide real-time notifications when job statuses change, eliminating the need for constant polling.
|
||||
|
||||
### Basic Webhook Setup
|
||||
|
||||
```php
|
||||
// Configure webhook when creating jobs
|
||||
$crawl = $client->crawl('https://example.com')
|
||||
->webhookUrl('https://myapp.com/webhooks/crawlshot')
|
||||
->webhookEventsFilter(['completed', 'failed'])
|
||||
->create();
|
||||
|
||||
// Your webhook endpoint receives the same data as status APIs
|
||||
```
|
||||
|
||||
### Webhook Event Filtering
|
||||
|
||||
Control which status changes trigger webhooks:
|
||||
|
||||
```php
|
||||
// Only notify on completion
|
||||
->webhookEventsFilter(['completed'])
|
||||
|
||||
// Only notify on completion or failure
|
||||
->webhookEventsFilter(['completed', 'failed'])
|
||||
|
||||
// Notify on all status changes (default)
|
||||
->webhookEventsFilter(['queued', 'processing', 'completed', 'failed'])
|
||||
|
||||
// Disable webhooks entirely
|
||||
->webhookEventsFilter([])
|
||||
```
|
||||
|
||||
### Webhook Handler Example
|
||||
|
||||
```php
|
||||
// routes/web.php or routes/api.php
|
||||
Route::post('/webhooks/crawlshot', function (Request $request) {
|
||||
$jobData = $request->all();
|
||||
|
||||
// The webhook payload is identical to the GET /api/crawl/{uuid} or GET /api/shot/{uuid} status response
|
||||
$uuid = $jobData['uuid'];
|
||||
$status = $jobData['status'];
|
||||
$url = $jobData['url'];
|
||||
|
||||
switch ($status) {
|
||||
case 'completed':
|
||||
if (isset($jobData['result']['html'])) {
|
||||
// Handle crawl completion
|
||||
$html = $jobData['result']['html']['raw'];
|
||||
// Process HTML content...
|
||||
} elseif (isset($jobData['result']['image'])) {
|
||||
// Handle screenshot completion
|
||||
$imageUrl = $jobData['result']['image']['url'];
|
||||
$dimensions = [$jobData['result']['width'], $jobData['result']['height']];
|
||||
// Process screenshot...
|
||||
}
|
||||
break;
|
||||
|
||||
case 'failed':
|
||||
$error = $jobData['error'];
|
||||
Log::error("Crawlshot job {$uuid} failed: {$error}");
|
||||
break;
|
||||
|
||||
case 'processing':
|
||||
Log::info("Crawlshot job {$uuid} started processing");
|
||||
break;
|
||||
}
|
||||
|
||||
return response('OK', 200);
|
||||
});
|
||||
```
|
||||
|
||||
### Webhook Error Management
|
||||
|
||||
When webhooks fail, you can manage them through the client:
|
||||
|
||||
```php
|
||||
// List all jobs with failed webhooks
|
||||
$errors = $client->listWebhookErrors();
|
||||
|
||||
foreach ($errors['jobs'] as $job) {
|
||||
echo "Job {$job['uuid']} webhook failed: {$job['webhook_last_error']}\n";
|
||||
echo "Attempts: {$job['webhook_attempts']}\n";
|
||||
|
||||
// Retry immediately
|
||||
$client->retryWebhook($job['uuid']);
|
||||
|
||||
// Or clear the error without retrying
|
||||
// $client->clearWebhookError($job['uuid']);
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Advanced Configuration
|
||||
|
||||
### Custom Options
|
||||
|
||||
```php
|
||||
// Advanced crawling options
|
||||
$crawl = $client->crawl('https://spa-website.com')
|
||||
->timeout(120) // Long timeout for slow sites
|
||||
->delay(3000) // Wait 3 seconds for JS
|
||||
->waitUntilNetworkIdle(true) // Wait for AJAX requests
|
||||
->blockAds(false) // Allow ads for testing
|
||||
->blockCookieBanners(true) // But block cookie banners
|
||||
->webhookUrl('https://myapp.com/webhook')
|
||||
->create();
|
||||
|
||||
// High-quality screenshots
|
||||
$shot = $client->shot('https://dashboard.example.com')
|
||||
->viewportSize(2560, 1440) // High resolution
|
||||
->quality(95) // High quality
|
||||
->delay(5000) // Wait for dashboard to load
|
||||
->blockAds(true) // Clean screenshot
|
||||
->create();
|
||||
```
|
||||
|
||||
### Batch Processing
|
||||
|
||||
```php
|
||||
$urls = ['https://site1.com', 'https://site2.com', 'https://site3.com'];
|
||||
$jobs = [];
|
||||
|
||||
// Create multiple jobs
|
||||
foreach ($urls as $url) {
|
||||
$job = $client->crawl($url)
|
||||
->webhookUrl('https://myapp.com/webhook')
|
||||
->create();
|
||||
|
||||
$jobs[] = $job;
|
||||
echo "Created job: {$job->getUuid()}\n";
|
||||
}
|
||||
|
||||
// Monitor all jobs
|
||||
while (true) {
|
||||
$completed = 0;
|
||||
$failed = 0;
|
||||
|
||||
foreach ($jobs as $job) {
|
||||
$job->refresh();
|
||||
|
||||
if ($job->isCompleted()) $completed++;
|
||||
if ($job->isFailed()) $failed++;
|
||||
}
|
||||
|
||||
echo "Progress: {$completed} completed, {$failed} failed\n";
|
||||
|
||||
if ($completed + $failed === count($jobs)) {
|
||||
break; // All jobs done
|
||||
}
|
||||
|
||||
sleep(5);
|
||||
}
|
||||
|
||||
// Process results
|
||||
foreach ($jobs as $job) {
|
||||
if ($job->isCompleted()) {
|
||||
$html = $job->getResultRaw();
|
||||
// Process HTML...
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Error Handling
|
||||
|
||||
### Exception Handling
|
||||
|
||||
```php
|
||||
use Crawlshot\Laravel\CrawlshotClient;
|
||||
|
||||
try {
|
||||
$client = new CrawlshotClient('https://crawlshot.test', 'invalid-token');
|
||||
$response = $client->createCrawl('https://example.com');
|
||||
|
||||
} catch (\Exception $e) {
|
||||
if (str_contains($e->getMessage(), 'Unauthenticated')) {
|
||||
echo "Invalid API token\n";
|
||||
} elseif (str_contains($e->getMessage(), '422')) {
|
||||
echo "Validation error: " . $e->getMessage();
|
||||
} else {
|
||||
echo "API error: " . $e->getMessage();
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Response Validation
|
||||
|
||||
```php
|
||||
$shot = $client->getShotStatus($uuid);
|
||||
|
||||
// Always check status before accessing results
|
||||
if ($shot->isCompleted()) {
|
||||
$imageData = $shot->getImageData();
|
||||
|
||||
if ($imageData) {
|
||||
file_put_contents('screenshot.webp', base64_decode($imageData));
|
||||
} else {
|
||||
echo "No image data available\n";
|
||||
}
|
||||
|
||||
} elseif ($shot->isFailed()) {
|
||||
echo "Screenshot failed: " . $shot->getError();
|
||||
|
||||
} else {
|
||||
echo "Still processing... Status: " . $shot->getStatus();
|
||||
}
|
||||
```
|
||||
|
||||
### Common Issues & Solutions
|
||||
|
||||
**1. Connection Timeout**
|
||||
```php
|
||||
// Increase timeout for slow networks
|
||||
$crawl = $client->crawl($url)->timeout(300)->create(); // 5 minutes
|
||||
```
|
||||
|
||||
**2. Invalid URLs**
|
||||
```php
|
||||
// Validate URLs before sending
|
||||
if (filter_var($url, FILTER_VALIDATE_URL)) {
|
||||
$crawl = $client->crawl($url)->create();
|
||||
} else {
|
||||
echo "Invalid URL: {$url}";
|
||||
}
|
||||
```
|
||||
|
||||
**3. Large Files**
|
||||
```php
|
||||
// Handle large responses
|
||||
$shot = $client->getShotStatus($uuid);
|
||||
if ($shot->isCompleted()) {
|
||||
$size = $shot->getSize();
|
||||
if ($size > 10 * 1024 * 1024) { // 10MB
|
||||
echo "Large file ({$size} bytes), downloading directly...";
|
||||
$imageData = $shot->downloadImage(); // More memory efficient
|
||||
} else {
|
||||
$imageData = $shot->getImageData(); // Base64
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Best Practices
|
||||
|
||||
### 1. Use Webhooks for Production
|
||||
|
||||
```php
|
||||
// ❌ Polling (inefficient)
|
||||
do {
|
||||
sleep(5);
|
||||
$status = $client->getCrawlStatus($uuid);
|
||||
} while ($status->isProcessing());
|
||||
|
||||
// ✅ Webhooks (efficient)
|
||||
$crawl = $client->crawl($url)
|
||||
->webhookUrl('https://myapp.com/webhook')
|
||||
->create();
|
||||
```
|
||||
|
||||
### 2. Handle Failures Gracefully
|
||||
|
||||
```php
|
||||
$crawl = $client->crawl($url)
|
||||
->timeout(60)
|
||||
->webhookEventsFilter(['completed', 'failed']) // Include 'failed' events
|
||||
->create();
|
||||
|
||||
// In webhook handler
|
||||
if ($jobData['status'] === 'failed') {
|
||||
// Log error and potentially retry with different settings
|
||||
Log::error("Crawl failed for {$jobData['url']}: {$jobData['error']}");
|
||||
|
||||
// Maybe retry with longer timeout
|
||||
$retry = $client->crawl($jobData['url'])
|
||||
->timeout(120)
|
||||
->create();
|
||||
}
|
||||
```
|
||||
|
||||
### 3. Use Environment-Specific Configuration
|
||||
|
||||
```env
|
||||
# .env.production
|
||||
CRAWLSHOT_BASE_URL=https://crawlshot.production.com
|
||||
CRAWLSHOT_TOKEN=prod_token_here
|
||||
|
||||
# .env.development
|
||||
CRAWLSHOT_BASE_URL=https://crawlshot.test
|
||||
CRAWLSHOT_TOKEN=dev_token_here
|
||||
|
||||
# .env.testing
|
||||
CRAWLSHOT_BASE_URL=https://crawlshot.staging.com
|
||||
CRAWLSHOT_TOKEN=test_token_here
|
||||
```
|
||||
|
||||
### 4. Implement Proper Error Logging
|
||||
|
||||
```php
|
||||
try {
|
||||
$crawl = $client->crawl($url)->create();
|
||||
} catch (\Exception $e) {
|
||||
Log::channel('crawlshot')->error('Crawl creation failed', [
|
||||
'url' => $url,
|
||||
'error' => $e->getMessage(),
|
||||
'trace' => $e->getTraceAsString()
|
||||
]);
|
||||
|
||||
throw $e; // Re-throw if needed
|
||||
}
|
||||
```
|
||||
|
||||
### 5. Monitor Webhook Failures
|
||||
|
||||
```php
|
||||
// Scheduled job to check webhook failures
|
||||
Schedule::call(function () {
|
||||
$client = app(CrawlshotClient::class);
|
||||
$errors = $client->listWebhookErrors();
|
||||
|
||||
if ($errors['pagination']['total_items'] > 0) {
|
||||
Log::warning('Webhook failures detected', [
|
||||
'count' => $errors['pagination']['total_items']
|
||||
]);
|
||||
|
||||
// Optionally retry recent failures
|
||||
foreach ($errors['jobs'] as $job) {
|
||||
if ($job['webhook_attempts'] < 3) { // Don't retry too many times
|
||||
$client->retryWebhook($job['uuid']);
|
||||
}
|
||||
}
|
||||
}
|
||||
})->hourly();
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Complete Examples
|
||||
|
||||
### Content Monitoring System
|
||||
|
||||
```php
|
||||
/**
 * Example service that crawls a page through Crawlshot and detects content
 * changes by comparing a hash of the HTML delivered to the webhook.
 */
class ContentMonitor
{
    public function __construct(private CrawlshotClient $client)
    {
    }

    /**
     * Queue a crawl for the given URL and record the job locally so the
     * webhook callback can be matched to it later.
     */
    public function monitorWebsite(string $url): void
    {
        $response = $this->client->crawl($url)
            ->blockAds(true)
            ->blockCookieBanners(true)
            ->timeout(60)
            ->webhookUrl(route('webhook.crawlshot'))
            ->webhookEventsFilter(['completed', 'failed'])
            ->create();

        // Store job info for later processing
        $attributes = [
            'uuid' => $response->getUuid(),
            'url' => $url,
            'status' => 'queued',
            'created_at' => now()
        ];

        MonitorJob::create($attributes);
    }

    /**
     * Handle a Crawlshot webhook payload: sync the stored status and, for
     * completed jobs, notify when the page content differs from the last run.
     */
    public function handleWebhook(array $data): void
    {
        $record = MonitorJob::where('uuid', $data['uuid'])->first();

        // Unknown job: nothing to update.
        if (!$record) {
            return;
        }

        $record->update(['status' => $data['status']]);

        // Only completed jobs carry HTML to compare.
        if ($data['status'] !== 'completed') {
            return;
        }

        $html = $data['result']['html']['raw'];
        $storedHash = $record->content_hash;
        $freshHash = md5($html);

        // A missing stored hash means this is the first run, so there is
        // nothing to compare against yet.
        $contentChanged = $storedHash && $storedHash !== $freshHash;

        if ($contentChanged) {
            Mail::to('admin@example.com')->send(
                new ContentChangedNotification($record->url, $html)
            );
        }

        $record->update(['content_hash' => $freshHash]);
    }
}
|
||||
```
|
||||
|
||||
### Screenshot Gallery Generator
|
||||
|
||||
```php
|
||||
/**
 * Example service that creates screenshot jobs for a list of URLs and stores
 * the images delivered by the Crawlshot webhook.
 */
class ScreenshotGallery
{
    private CrawlshotClient $client;

    /**
     * The client must be injected: the typed property has no default, so
     * reading it before assignment would throw an Error.
     */
    public function __construct(CrawlshotClient $client)
    {
        $this->client = $client;
    }

    /**
     * Create one screenshot job per URL.
     *
     * @param array $urls URLs to capture
     * @return array List of ['uuid' => ..., 'url' => ..., 'response' => ...] entries, one per created job
     */
    public function generateGallery(array $urls): array
    {
        $jobs = [];

        // Create all screenshot jobs
        foreach ($urls as $url) {
            $shot = $this->client->shot($url)
                ->viewportSize(1200, 800)
                ->quality(80)
                ->blockAds(true)
                ->delay(2000)
                ->webhookUrl(route('webhook.screenshot'))
                ->create();

            $jobs[] = [
                'uuid' => $shot->getUuid(),
                'url' => $url,
                'response' => $shot
            ];
        }

        return $jobs;
    }

    /**
     * Persist a completed screenshot from a webhook payload; payloads with
     * any other status are ignored.
     */
    public function handleScreenshotWebhook(array $data): void
    {
        if ($data['status'] === 'completed') {
            // Save screenshot to permanent storage
            $imageData = base64_decode($data['result']['image']['raw']);
            $filename = $data['uuid'] . '.webp';

            Storage::disk('public')->put("screenshots/{$filename}", $imageData);

            // Update database
            Screenshot::updateOrCreate(['uuid' => $data['uuid']], [
                'url' => $data['url'],
                'filename' => $filename,
                'width' => $data['result']['width'],
                'height' => $data['result']['height'],
                'size' => $data['result']['size'],
                'completed_at' => now()
            ]);
        }
    }
}
|
||||
```
|
||||
|
||||
The Crawlshot PHP Client Library provides a comprehensive, developer-friendly interface for all your web crawling and screenshot needs. With its fluent interface, typed responses, and robust webhook support, it's designed to make integration as smooth as possible while maintaining full access to all advanced features.
|
||||
@@ -5,7 +5,7 @@ meta {
|
||||
}
|
||||
|
||||
get {
|
||||
url: {{base_url}}/api/crawl/:uuid
|
||||
url: {{base_url}}/api/crawl/534d02c4-bef6-4f25-8250-3277cc21ca45
|
||||
body: none
|
||||
auth: bearer
|
||||
}
|
||||
|
||||
327
README.md
327
README.md
@@ -1,198 +1,213 @@
|
||||
# Crawlshot
|
||||
|
||||
A Laravel web crawling and screenshot service with dual deployment options:
|
||||
[License: MIT](https://opensource.org/licenses/MIT)
|
||||
[Laravel 12](https://laravel.com)
|
||||
[PHP 8.3+](https://php.net)
|
||||
|
||||
1. **Standalone API Service** - Full Laravel application with REST API endpoints
|
||||
2. **Laravel Package** - HTTP client package for use in other Laravel applications
|
||||
**High-performance web crawling and screenshot service** built with Laravel, featuring intelligent ad blocking, webhook notifications, and a powerful fluent PHP client.
|
||||
|
||||
## Architecture Overview
|
||||
🎯 **Perfect for:** Content monitoring • Screenshot automation • QA testing • Social media previews • Compliance archival
|
||||
|
||||
### Standalone API Service
|
||||
The main Laravel application provides a complete web crawling and screenshot service:
|
||||
## ✨ Key Features
|
||||
|
||||
- **Spatie Browsershot Integration** - Uses Puppeteer for browser automation
|
||||
- **EasyList Ad Blocking** - Automatic ad/tracker blocking using EasyList filters
|
||||
- **Queue Processing** - Laravel Horizon for async job processing
|
||||
- **24-hour Cleanup** - Automatic file and database cleanup
|
||||
- **Sanctum Authentication** - API token-based authentication
|
||||
- **SQLite Database** - Stores job metadata and processing status
|
||||
- 🚀 **Dual Deployment**: Standalone API service or Laravel package
|
||||
- 🔗 **Webhook Notifications**: Real-time updates with progressive retry
|
||||
- 🎨 **Fluent Interface**: `$client->crawl($url)->webhookUrl($webhook)->create()`
|
||||
- 📦 **Typed Responses**: `$result->isCompleted()`, `$shot->getDimensions()`
|
||||
- 🛡️ **Smart Blocking**: EasyList ad/tracker/cookie banner filtering
|
||||
- ⚡ **Background Processing**: Laravel Horizon queue management
|
||||
- 🔄 **Auto-cleanup**: 24-hour file retention with scheduled cleanup
|
||||
- 🔐 **Secure**: Laravel Sanctum API authentication
|
||||
|
||||
### Laravel Package
|
||||
Simple HTTP client package that provides a clean interface to the API:
|
||||
## 📚 Documentation
|
||||
|
||||
- **8 Methods for 8 APIs** - Direct 1:1 mapping to REST endpoints
|
||||
- **Facade Support** - Clean Laravel integration
|
||||
- **Auto-discovery** - Automatic service provider registration
|
||||
- 📖 **[API Documentation](API_DOCUMENTATION.md)** - Complete REST API reference with webhook system
|
||||
- 🔧 **[Client Documentation](CLIENT_DOCUMENTATION.md)** - PHP client library guide with fluent interface
|
||||
- ⚙️ **[Setup Guide](SETUP.md)** - Detailed installation and configuration
|
||||
|
||||
## Deployment Options
|
||||
## 🚀 Quick Start
|
||||
|
||||
### Option 1: Standalone API Service
|
||||
|
||||
Deploy as a complete Laravel application:
|
||||
Deploy your own Crawlshot API server:
|
||||
|
||||
```bash
|
||||
git clone [repository]
|
||||
cd crawlshot
|
||||
composer install
|
||||
npm install puppeteer
|
||||
php artisan migrate
|
||||
php artisan serve
|
||||
```
|
||||
|
||||
**API Endpoints:**
|
||||
- `POST /api/crawl` - Create HTML crawl job
|
||||
- `GET /api/crawl/{uuid}` - Get crawl status/result
|
||||
- `GET /api/crawl` - List all crawl jobs
|
||||
- `POST /api/shot` - Create screenshot job
|
||||
- `GET /api/shot/{uuid}` - Get screenshot status/result
|
||||
- `GET /api/shot/{uuid}/download` - Download screenshot file
|
||||
- `GET /api/shot` - List all screenshot jobs
|
||||
- `GET /api/health` - Health check
|
||||
|
||||
**Example API Usage:**
|
||||
```bash
|
||||
# Create crawl job
|
||||
curl -X POST "https://crawlshot.test/api/crawl" \
|
||||
-H "Authorization: Bearer {token}" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"url": "https://example.com", "block_ads": true}'
|
||||
|
||||
# Check status
|
||||
curl -H "Authorization: Bearer {token}" \
|
||||
"https://crawlshot.test/api/crawl/{uuid}"
|
||||
composer install && npm install puppeteer
|
||||
php artisan migrate && php artisan serve
|
||||
```
|
||||
|
||||
### Option 2: Laravel Package
|
||||
|
||||
Install as a package in your Laravel application:
|
||||
Use as a client library in your Laravel app:
|
||||
|
||||
```bash
|
||||
composer require crawlshot/laravel
|
||||
php artisan vendor:publish --tag=crawlshot-config
|
||||
```
|
||||
|
||||
**Configuration:**
|
||||
```env
|
||||
CRAWLSHOT_BASE_URL=https://your-crawlshot-api.com
|
||||
CRAWLSHOT_TOKEN=your-sanctum-token
|
||||
```
|
||||
|
||||
**Package Usage:**
|
||||
```php
|
||||
use Crawlshot\Laravel\Facades\Crawlshot;
|
||||
|
||||
// Create crawl job
|
||||
$response = Crawlshot::createCrawl('https://example.com', [
|
||||
'block_ads' => true,
|
||||
'timeout' => 30
|
||||
]);
|
||||
|
||||
// Check status
|
||||
$status = Crawlshot::getCrawlStatus($response['uuid']);
|
||||
|
||||
// Create screenshot
|
||||
$response = Crawlshot::createShot('https://example.com', [
|
||||
'format' => 'jpg',
|
||||
'width' => 1920,
|
||||
'height' => 1080
|
||||
]);
|
||||
|
||||
// Download screenshot
|
||||
$imageData = Crawlshot::downloadShot($response['uuid']);
|
||||
file_put_contents('screenshot.jpg', $imageData);
|
||||
```
|
||||
|
||||
## API Reference
|
||||
|
||||
### Available Methods (Package)
|
||||
|
||||
| Method | API Endpoint | Description |
|
||||
|--------|--------------|-------------|
|
||||
| `createCrawl(string $url, array $options = [])` | `POST /api/crawl` | Create crawl job |
|
||||
| `getCrawlStatus(string $uuid)` | `GET /api/crawl/{uuid}` | Get crawl status |
|
||||
| `listCrawls()` | `GET /api/crawl` | List all crawl jobs |
|
||||
| `createShot(string $url, array $options = [])` | `POST /api/shot` | Create screenshot job |
|
||||
| `getShotStatus(string $uuid)` | `GET /api/shot/{uuid}` | Get screenshot status |
|
||||
| `downloadShot(string $uuid)` | `GET /api/shot/{uuid}/download` | Download screenshot file |
|
||||
| `listShots()` | `GET /api/shot` | List all screenshot jobs |
|
||||
| `health()` | `GET /api/health` | Health check |
|
||||
|
||||
### Crawl Options
|
||||
|
||||
```php
|
||||
[
|
||||
'block_ads' => true, // Block ads using EasyList
|
||||
'block_trackers' => true, // Block tracking scripts
|
||||
'timeout' => 30, // Request timeout in seconds
|
||||
'user_agent' => 'Custom UA', // Custom user agent
|
||||
'wait_until' => 'networkidle0' // Wait condition
|
||||
]
|
||||
$client = new CrawlshotClient('https://crawlshot.test', 'your-token');
|
||||
```
|
||||
|
||||
### Screenshot Options
|
||||
## ⚡ Modern Usage Examples
|
||||
|
||||
### Fluent Interface with Webhooks
|
||||
|
||||
```php
|
||||
[
|
||||
'format' => 'jpg', // jpg, png, webp
|
||||
'quality' => 90, // 1-100 for jpg/webp
|
||||
'width' => 1920, // Viewport width
|
||||
'height' => 1080, // Viewport height
|
||||
'full_page' => true, // Capture full page
|
||||
'block_ads' => true, // Block ads
|
||||
'timeout' => 30 // Request timeout
|
||||
]
|
||||
use Crawlshot\Laravel\CrawlshotClient;
|
||||
|
||||
$client = new CrawlshotClient('https://crawlshot.test', 'your-token');
|
||||
|
||||
// HTML Crawling with webhook notifications
|
||||
$crawl = $client->crawl('https://example.com')
|
||||
->webhookUrl('https://myapp.com/webhook')
|
||||
->webhookEventsFilter(['completed', 'failed'])
|
||||
->blockAds(true)
|
||||
->timeout(60)
|
||||
->create();
|
||||
|
||||
echo "Job: {$crawl->getUuid()} - Status: {$crawl->getStatus()}";
|
||||
|
||||
// Screenshot with custom dimensions
|
||||
$shot = $client->shot('https://dashboard.example.com')
|
||||
->viewportSize(1920, 1080)
|
||||
->quality(90)
|
||||
->webhookUrl('https://myapp.com/webhook')
|
||||
->create();
|
||||
|
||||
if ($shot->isCompleted()) {
|
||||
$dimensions = $shot->getDimensions(); // [1920, 1080]
|
||||
$imageData = $shot->downloadImage(); // Binary data
|
||||
}
|
||||
```
|
||||
|
||||
## Features
|
||||
### Webhook Handler Example
|
||||
|
||||
### Core Functionality
|
||||
- **HTML Crawling** - Extract clean HTML content from web pages
|
||||
- **Screenshot Capture** - Generate high-quality screenshots (JPG, PNG, WebP)
|
||||
- **Ad Blocking** - Built-in EasyList integration for ad/tracker blocking
|
||||
- **Queue Processing** - Async job processing with Laravel Horizon
|
||||
- **File Management** - Automatic cleanup after 24 hours
|
||||
```php
|
||||
Route::post('/webhook', function (Request $request) {
|
||||
$job = $request->all();
|
||||
|
||||
if ($job['status'] === 'completed') {
|
||||
if (isset($job['result']['html'])) {
|
||||
// Process HTML crawl result
|
||||
$html = $job['result']['html']['raw'];
|
||||
} elseif (isset($job['result']['image'])) {
|
||||
// Process screenshot result
|
||||
$imageUrl = $job['result']['image']['url'];
|
||||
}
|
||||
}
|
||||
|
||||
return response('OK', 200);
|
||||
});
|
||||
```
|
||||
|
||||
### Technical Features
|
||||
- **Laravel 12** support with PHP 8.3+
|
||||
- **Puppeteer Integration** via Spatie Browsershot
|
||||
- **Sanctum Authentication** for API security
|
||||
- **SQLite Database** with migrations
|
||||
- **Auto-discovery** for package installation
|
||||
- **Environment Configuration** via .env variables
|
||||
### Direct API Usage
|
||||
|
||||
## Development
|
||||
```bash
|
||||
# HTML crawl with webhook
|
||||
curl -X POST "https://crawlshot.test/api/crawl" \
|
||||
-H "Authorization: Bearer YOUR_TOKEN" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"url": "https://example.com",
|
||||
"webhook_url": "https://myapp.com/webhook",
|
||||
"webhook_events_filter": ["completed"],
|
||||
"block_ads": true
|
||||
}'
|
||||
|
||||
### Requirements
|
||||
- PHP 8.3+
|
||||
- Laravel 12.0+
|
||||
- Node.js with Puppeteer
|
||||
- SQLite (or other database)
|
||||
- ImageMagick extension
|
||||
# Screenshot with custom viewport
|
||||
curl -X POST "https://crawlshot.test/api/shot" \
|
||||
-H "Authorization: Bearer YOUR_TOKEN" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"url": "https://example.com",
|
||||
"viewport_width": 1200,
|
||||
"viewport_height": 800,
|
||||
"webhook_url": "https://myapp.com/webhook"
|
||||
}'
|
||||
```
|
||||
|
||||
## 🎯 Core APIs
|
||||
|
||||
### HTML Crawling
|
||||
- `POST /api/crawl` - Create HTML crawl job with ad blocking
|
||||
- `GET /api/crawl/{uuid}` - Get crawl status and results
|
||||
- `GET /api/crawl/{uuid}.html` - Download HTML file directly
|
||||
|
||||
### Screenshot Capture
|
||||
- `POST /api/shot` - Create screenshot job (always WebP format)
|
||||
- `GET /api/shot/{uuid}` - Get screenshot status and results
|
||||
- `GET /api/shot/{uuid}.webp` - Download image file directly
|
||||
|
||||
### Webhook Management
|
||||
- `GET /api/webhook-errors` - List failed webhook deliveries
|
||||
- `POST /api/webhook-errors/{uuid}/retry` - Retry failed webhook
|
||||
- `DELETE /api/webhook-errors/{uuid}/clear` - Clear webhook error
|
||||
|
||||
### Client Library Methods
|
||||
|
||||
| Method | Returns | Description |
|
||||
|--------|---------|-------------|
|
||||
| `$client->crawl($url)->create()` | `CrawlResponse` | Fluent crawl job creation |
|
||||
| `$client->getCrawlStatus($uuid)` | `CrawlResponse` | Typed crawl status |
|
||||
| `$client->shot($url)->create()` | `ShotResponse` | Fluent screenshot creation |
|
||||
| `$client->getShotStatus($uuid)` | `ShotResponse` | Typed screenshot status |
|
||||
| `$client->listWebhookErrors()` | `array` | Failed webhook list |
|
||||
|
||||
## 🔧 Architecture & Features
|
||||
|
||||
### Webhook System
|
||||
- **Event Filtering** - Choose which status changes trigger webhooks (`queued`, `processing`, `completed`, `failed`)
|
||||
- **Progressive Retry** - Automatic retry with exponential backoff (1, 2, 4, 8, 16, 32 minutes)
|
||||
- **Error Management** - List, retry, and clear failed webhook deliveries
|
||||
- **Consistent Payload** - Webhook data matches status API responses exactly
|
||||
|
||||
### Smart Filtering
|
||||
- **EasyList Integration** - Automatic ad/tracker/cookie banner blocking
|
||||
- **Custom Blocking** - Fine-grained control over content filtering
|
||||
- **Performance Optimized** - Cached filter lists with 24-hour updates
|
||||
|
||||
### Developer Experience
|
||||
- **Fluent Interface** - Method chaining for clean, readable code
|
||||
- **Typed Responses** - `CrawlResponse` and `ShotResponse` classes with helpful methods
|
||||
- **Laravel Integration** - Service providers, facades, auto-discovery
|
||||
- **Comprehensive Docs** - Complete API and client documentation
|
||||
|
||||
## 🛠️ Requirements & Setup
|
||||
|
||||
### System Requirements
|
||||
- **PHP 8.3+** with ImageMagick extension
|
||||
- **Laravel 12.0+** framework
|
||||
- **Node.js** with Puppeteer for browser automation
|
||||
- **Database** (SQLite included, MySQL/PostgreSQL supported)
|
||||
|
||||
### Quick Setup
|
||||
```bash
|
||||
# Clone and install
|
||||
git clone [repository] && cd crawlshot
|
||||
composer install && npm install puppeteer
|
||||
|
||||
# Configure and run
|
||||
cp .env.example .env
|
||||
php artisan key:generate
|
||||
php artisan migrate
|
||||
php artisan serve
|
||||
|
||||
# Start queue processing (separate terminal)
|
||||
php artisan horizon
|
||||
```
|
||||
|
||||
### Key Dependencies
|
||||
- `spatie/browsershot` - Browser automation
|
||||
- `protonlabs/php-adblock-parser` - EasyList parsing
|
||||
- `laravel/horizon` - Queue monitoring (standalone)
|
||||
- `laravel/sanctum` - API authentication (standalone)
|
||||
- **[Spatie Browsershot](https://github.com/spatie/browsershot)** - Puppeteer wrapper for browser automation
|
||||
- **[Laravel Horizon](https://laravel.com/docs/horizon)** - Queue monitoring and management
|
||||
- **[Laravel Sanctum](https://laravel.com/docs/sanctum)** - API authentication
|
||||
- **ProtonMail AdBlock Parser** - EasyList filter processing
|
||||
|
||||
### File Structure
|
||||
## 📄 License
|
||||
|
||||
```
|
||||
├── app/ # Laravel application (standalone)
|
||||
│ ├── Http/Controllers/Api/ # API controllers
|
||||
│ ├── Jobs/ # Queue jobs
|
||||
│ ├── Models/ # Eloquent models
|
||||
│ └── Services/ # Core services
|
||||
├── src/ # Package source (both modes)
|
||||
│ ├── CrawlshotClient.php # HTTP client (package mode)
|
||||
│ ├── CrawlshotServiceProvider.php
|
||||
│ ├── Facades/Crawlshot.php
|
||||
│ └── config/crawlshot.php
|
||||
├── routes/api.php # API routes (standalone)
|
||||
├── database/migrations/ # Database schema
|
||||
└── composer.json # Package definition
|
||||
```
|
||||
MIT License - see [LICENSE](LICENSE) file for details.
|
||||
|
||||
## License
|
||||
---
|
||||
|
||||
MIT
|
||||
**[Get Started →](CLIENT_DOCUMENTATION.md)** | **[View API Docs →](API_DOCUMENTATION.md)** | **[Setup Guide →](SETUP.md)**
|
||||
@@ -23,7 +23,9 @@ public function crawl(Request $request): JsonResponse
|
||||
'block_ads' => 'boolean',
|
||||
'block_cookie_banners' => 'boolean',
|
||||
'block_trackers' => 'boolean',
|
||||
'wait_until_network_idle' => 'boolean'
|
||||
'webhook_url' => 'nullable|url|max:2048',
|
||||
'webhook_events_filter' => 'nullable|array',
|
||||
'webhook_events_filter.*' => 'in:queued,processing,completed,failed'
|
||||
]);
|
||||
|
||||
$uuid = Str::uuid()->toString();
|
||||
@@ -33,13 +35,15 @@ public function crawl(Request $request): JsonResponse
|
||||
'type' => 'crawl',
|
||||
'url' => $validated['url'],
|
||||
'status' => 'queued',
|
||||
'webhook_url' => $validated['webhook_url'] ?? null,
|
||||
'webhook_events_filter' => isset($validated['webhook_events_filter']) ? $validated['webhook_events_filter'] : ['queued', 'processing', 'completed', 'failed'],
|
||||
'parameters' => array_filter([
|
||||
'timeout' => $validated['timeout'] ?? 30,
|
||||
'delay' => $validated['delay'] ?? 0,
|
||||
'block_ads' => $validated['block_ads'] ?? true,
|
||||
'block_cookie_banners' => $validated['block_cookie_banners'] ?? true,
|
||||
'block_trackers' => $validated['block_trackers'] ?? true,
|
||||
'wait_until_network_idle' => $validated['wait_until_network_idle'] ?? false
|
||||
'block_trackers' => $validated['block_trackers'] ?? true
|
||||
// wait_until_network_idle is always enabled in BrowsershotService
|
||||
])
|
||||
]);
|
||||
|
||||
|
||||
@@ -24,7 +24,10 @@ public function shot(Request $request): JsonResponse
|
||||
'delay' => 'integer|min:0|max:30000',
|
||||
'block_ads' => 'boolean',
|
||||
'block_cookie_banners' => 'boolean',
|
||||
'block_trackers' => 'boolean'
|
||||
'block_trackers' => 'boolean',
|
||||
'webhook_url' => 'nullable|url|max:2048',
|
||||
'webhook_events_filter' => 'nullable|array',
|
||||
'webhook_events_filter.*' => 'in:queued,processing,completed,failed'
|
||||
]);
|
||||
|
||||
$uuid = Str::uuid()->toString();
|
||||
@@ -34,6 +37,8 @@ public function shot(Request $request): JsonResponse
|
||||
'type' => 'shot',
|
||||
'url' => $validated['url'],
|
||||
'status' => 'queued',
|
||||
'webhook_url' => $validated['webhook_url'] ?? null,
|
||||
'webhook_events_filter' => isset($validated['webhook_events_filter']) ? $validated['webhook_events_filter'] : ['queued', 'processing', 'completed', 'failed'],
|
||||
'parameters' => array_filter([
|
||||
'viewport_width' => $validated['viewport_width'] ?? 1920,
|
||||
'viewport_height' => $validated['viewport_height'] ?? 1080,
|
||||
|
||||
72
app/Http/Controllers/Api/WebhookErrorController.php
Normal file
72
app/Http/Controllers/Api/WebhookErrorController.php
Normal file
@@ -0,0 +1,72 @@
|
||||
<?php
|
||||
|
||||
namespace App\Http\Controllers\Api;
|
||||
|
||||
use App\Http\Controllers\Controller;
|
||||
use App\Models\CrawlShotJob;
|
||||
use App\Services\WebhookService;
|
||||
use Illuminate\Http\JsonResponse;
|
||||
|
||||
/**
 * API endpoints for listing jobs with recorded webhook attempts, re-sending
 * a job's webhook, and resetting its webhook error state.
 */
class WebhookErrorController extends Controller
{
    /**
     * Paginated list (20 per page, most recently updated first) of jobs that
     * have a webhook URL and a webhook attempt count above zero.
     */
    public function index(): JsonResponse
    {
        $page = CrawlShotJob::where('webhook_attempts', '>', 0)
            ->whereNotNull('webhook_url')
            ->orderBy('updated_at', 'desc')
            ->paginate(20);

        return response()->json([
            'jobs' => $page->items(),
            'pagination' => [
                'current_page' => $page->currentPage(),
                'total_pages' => $page->lastPage(),
                'total_items' => $page->total(),
                'per_page' => $page->perPage()
            ]
        ]);
    }

    /**
     * Send the job's webhook again right away.
     *
     * Responds 404 when the job does not exist and 400 when it has no
     * webhook URL.
     */
    public function retry(string $uuid): JsonResponse
    {
        $job = $this->findJob($uuid);

        if (!$job) {
            return $this->errorResponse('Job not found', 404);
        }

        if (!$job->webhook_url) {
            return $this->errorResponse('Job has no webhook URL', 400);
        }

        // Attempt webhook immediately
        WebhookService::send($job);

        return response()->json([
            'uuid' => $job->uuid,
            'message' => 'Webhook retry attempted'
        ]);
    }

    /**
     * Reset the job's webhook attempt counter, last error and next retry time.
     *
     * Responds 404 when the job does not exist.
     */
    public function clear(string $uuid): JsonResponse
    {
        $job = $this->findJob($uuid);

        if (!$job) {
            return $this->errorResponse('Job not found', 404);
        }

        $job->update([
            'webhook_attempts' => 0,
            'webhook_last_error' => null,
            'webhook_next_retry_at' => null
        ]);

        return response()->json([
            'uuid' => $job->uuid,
            'message' => 'Webhook error cleared'
        ]);
    }

    /**
     * Look up a job by UUID; null when no such job exists.
     */
    private function findJob(string $uuid): ?CrawlShotJob
    {
        return CrawlShotJob::where('uuid', $uuid)->first();
    }

    /**
     * Build a JSON error body of the form {"error": "..."} with the given HTTP status.
     */
    private function errorResponse(string $message, int $status): JsonResponse
    {
        return response()->json(['error' => $message], $status);
    }
}
|
||||
40
app/Jobs/RetryWebhookJob.php
Normal file
40
app/Jobs/RetryWebhookJob.php
Normal file
@@ -0,0 +1,40 @@
|
||||
<?php
|
||||
|
||||
namespace App\Jobs;
|
||||
|
||||
use App\Models\CrawlShotJob;
|
||||
use App\Services\WebhookService;
|
||||
use Illuminate\Bus\Queueable;
|
||||
use Illuminate\Contracts\Queue\ShouldQueue;
|
||||
use Illuminate\Foundation\Bus\Dispatchable;
|
||||
use Illuminate\Queue\InteractsWithQueue;
|
||||
use Illuminate\Queue\SerializesModels;
|
||||
|
||||
/**
 * Queued job that re-sends the webhook for a crawl/shot job once its
 * scheduled retry time has been reached.
 */
class RetryWebhookJob implements ShouldQueue
{
    use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;

    /**
     * @param string $jobUuid UUID of the CrawlShotJob whose webhook should be retried
     */
    public function __construct(protected string $jobUuid)
    {
    }

    /**
     * Re-send the webhook if the job still exists, still has a webhook URL,
     * and its retry time is set and no longer in the future.
     */
    public function handle(): void
    {
        $job = CrawlShotJob::where('uuid', $this->jobUuid)->first();

        $hasWebhook = $job && $job->webhook_url;
        if (!$hasWebhook) {
            return;
        }

        // Check if job still needs retry (in case it was manually cleared)
        $retryAt = $job->webhook_next_retry_at;
        $retryIsDue = $retryAt && !$retryAt->isFuture();
        if (!$retryIsDue) {
            return;
        }

        // Attempt webhook again
        WebhookService::send($job);
    }
}
|
||||
@@ -4,6 +4,7 @@
|
||||
|
||||
use Illuminate\Database\Eloquent\Factories\HasFactory;
|
||||
use Illuminate\Database\Eloquent\Model;
|
||||
use Illuminate\Support\Facades\Storage;
|
||||
|
||||
class CrawlShotJob extends Model
|
||||
{
|
||||
@@ -18,17 +19,72 @@ class CrawlShotJob extends Model
|
||||
'file_path',
|
||||
'error_message',
|
||||
'started_at',
|
||||
'completed_at'
|
||||
'completed_at',
|
||||
'webhook_url',
|
||||
'webhook_events_filter',
|
||||
'webhook_attempts',
|
||||
'webhook_last_error',
|
||||
'webhook_next_retry_at'
|
||||
];
|
||||
|
||||
protected $casts = [
|
||||
'parameters' => 'array',
|
||||
'webhook_events_filter' => 'array',
|
||||
'started_at' => 'datetime',
|
||||
'completed_at' => 'datetime'
|
||||
'completed_at' => 'datetime',
|
||||
'webhook_next_retry_at' => 'datetime'
|
||||
];
|
||||
|
||||
public function getRouteKeyName()
|
||||
{
|
||||
return 'uuid';
|
||||
}
|
||||
|
||||
/**
 * Build the status payload for this job.
 *
 * Always contains uuid, status, url and created_at. started_at and
 * completed_at are added when set. A completed job with a stored file gets
 * a `result` entry (HTML for crawls, base64 WebP image plus metadata for
 * shots), and a failed job with an error message gets an `error` entry.
 *
 * @return array
 */
public function buildStatusResponse(): array
{
    $response = [
        'uuid' => $this->uuid,
        'status' => $this->status,
        'url' => $this->url,
        'created_at' => $this->created_at->toISOString()
    ];

    if ($this->started_at) {
        $response['started_at'] = $this->started_at->toISOString();
    }

    if ($this->completed_at) {
        $response['completed_at'] = $this->completed_at->toISOString();
    }

    if ($this->status === 'completed' && $this->file_path) {
        if ($this->type === 'crawl') {
            $response['result'] = [
                'html' => [
                    'url' => url("/api/crawl/{$this->uuid}.html"),
                    'raw' => Storage::get($this->file_path)
                ]
            ];
        } elseif ($this->type === 'shot') {
            // Storage::get() may return null when the file cannot be read.
            // Passing null to base64_encode()/strlen() is deprecated since
            // PHP 8.1, so fall back to an empty string; the resulting payload
            // (raw "" and size 0) is the same as before, without the notice.
            $imageData = Storage::get($this->file_path) ?? '';

            $response['result'] = [
                'image' => [
                    'url' => url("/api/shot/{$this->uuid}.webp"),
                    'raw' => base64_encode($imageData),
                ],
                'mime_type' => 'image/webp',
                'format' => 'webp',
                // Reported dimensions are the requested viewport size.
                'width' => $this->parameters['viewport_width'] ?? 1920,
                'height' => $this->parameters['viewport_height'] ?? 1080,
                // Size in bytes of the stored image.
                'size' => strlen($imageData)
            ];
        }
    }

    if ($this->status === 'failed' && $this->error_message) {
        $response['error'] = $this->error_message;
    }

    return $response;
}
|
||||
}
|
||||
27
app/Observers/CrawlShotJobObserver.php
Normal file
27
app/Observers/CrawlShotJobObserver.php
Normal file
@@ -0,0 +1,27 @@
|
||||
<?php
|
||||
|
||||
namespace App\Observers;
|
||||
|
||||
use App\Models\CrawlShotJob;
|
||||
use App\Services\WebhookService;
|
||||
|
||||
/**
 * Observes CrawlShotJob updates and triggers webhook delivery on status changes.
 */
class CrawlShotJobObserver
{
    /**
     * React to an updated job: send a webhook when the status changed, a
     * webhook URL is configured, and the new status passes the events filter.
     */
    public function updated(CrawlShotJob $crawlShotJob): void
    {
        // Nothing to notify unless the status changed and a target URL exists.
        if (!$crawlShotJob->isDirty('status') || !$crawlShotJob->webhook_url) {
            return;
        }

        // No stored filter means every status is reported.
        $allowedEvents = $crawlShotJob->webhook_events_filter
            ?? ['queued', 'processing', 'completed', 'failed'];

        // An explicitly empty filter silences all webhooks.
        if (empty($allowedEvents)) {
            return;
        }

        if (!in_array($crawlShotJob->status, $allowedEvents)) {
            return;
        }

        WebhookService::send($crawlShotJob);
    }
}
|
||||
@@ -2,6 +2,8 @@
|
||||
|
||||
namespace App\Providers;
|
||||
|
||||
use App\Models\CrawlShotJob;
|
||||
use App\Observers\CrawlShotJobObserver;
|
||||
use Illuminate\Support\ServiceProvider;
|
||||
|
||||
class AppServiceProvider extends ServiceProvider
|
||||
@@ -19,6 +21,6 @@ public function register(): void
|
||||
*/
|
||||
public function boot(): void
{
    // Register the observer so CrawlShotJob model events reach CrawlShotJobObserver.
    CrawlShotJob::observe(CrawlShotJobObserver::class);
}
|
||||
}
|
||||
|
||||
@@ -44,7 +44,7 @@ public function takeScreenshot(string $url, array $options = []): array
|
||||
private function configureBrowsershot(string $url, array $options = []): Browsershot
|
||||
{
|
||||
$browsershot = Browsershot::url($url)
|
||||
->waitUntilNetworkIdle()
|
||||
->waitUntilNetworkIdle() // Always enabled for production to ensure proper rendering
|
||||
->preventUnsuccessfulResponse();
|
||||
|
||||
|
||||
@@ -52,18 +52,17 @@ private function configureBrowsershot(string $url, array $options = []): Browser
|
||||
$browsershot->noSandbox();
|
||||
}
|
||||
|
||||
// Basic configuration
|
||||
if (isset($options['timeout'])) {
|
||||
$browsershot->timeout($options['timeout']);
|
||||
}
|
||||
// Basic configuration with maximum timeout safeguard
|
||||
$timeout = $options['timeout'] ?? 30;
|
||||
$maxTimeout = 300; // 5 minutes maximum to prevent indefinite waiting
|
||||
$browsershot->timeout(min($timeout, $maxTimeout));
|
||||
|
||||
if (isset($options['delay'])) {
|
||||
$browsershot->setDelay($options['delay']);
|
||||
}
|
||||
|
||||
if (isset($options['wait_until_network_idle']) && $options['wait_until_network_idle']) {
|
||||
$browsershot->waitUntilNetworkIdle();
|
||||
}
|
||||
// waitUntilNetworkIdle() is always enabled (configured above on line 47)
|
||||
// Removed conditional logic as network idle waiting is required for production
|
||||
|
||||
// Apply ad/tracker blocking
|
||||
if (($options['block_ads'] ?? true) || ($options['block_trackers'] ?? true)) {
|
||||
|
||||
@@ -26,7 +26,8 @@ public function getBlockedDomains(string $url): array
|
||||
}
|
||||
}
|
||||
|
||||
return array_slice(array_unique($domains), 0, 100); // Limit to 100 domains
|
||||
// Limit to 50 most common ad domains to reduce timeout risk
|
||||
return array_slice(array_unique($domains), 0, 50);
|
||||
}
|
||||
|
||||
public function getBlockedUrls(string $url): array
|
||||
@@ -43,7 +44,8 @@ public function getBlockedUrls(string $url): array
|
||||
}
|
||||
}
|
||||
|
||||
return array_slice(array_unique($urls), 0, 50); // Limit to 50 URL patterns
|
||||
// Limit to 25 URL patterns to reduce blocking overhead
|
||||
return array_slice(array_unique($urls), 0, 25);
|
||||
}
|
||||
|
||||
private function getFilters(): array
|
||||
|
||||
62
app/Services/WebhookService.php
Normal file
62
app/Services/WebhookService.php
Normal file
@@ -0,0 +1,62 @@
|
||||
<?php
|
||||
|
||||
namespace App\Services;
|
||||
|
||||
use App\Models\CrawlShotJob;
|
||||
use App\Jobs\RetryWebhookJob;
|
||||
use Illuminate\Support\Facades\Http;
|
||||
use Illuminate\Support\Facades\Log;
|
||||
|
||||
/**
 * Delivers job status webhooks and schedules retries for failed deliveries.
 */
class WebhookService
{
    /**
     * Minutes to wait before each successive retry. The number of entries is
     * also the number of retries that will be scheduled.
     */
    private const RETRY_DELAYS = [1, 2, 4, 8, 16, 32];

    /**
     * POST the job's status payload to its webhook URL.
     *
     * On a successful response the webhook bookkeeping fields are reset. Any
     * other response, or an exception raised while sending, is recorded on the
     * job and a retry is scheduled while retries remain.
     */
    public static function send(CrawlShotJob $job): void
    {
        try {
            $payload = $job->buildStatusResponse();

            $response = Http::timeout(5)->post($job->webhook_url, $payload);

            if (!$response->successful()) {
                throw new \Exception("HTTP {$response->status()}: {$response->body()}");
            }

            // Reset webhook error fields on success.
            self::persistQuietly($job, [
                'webhook_attempts' => 0,
                'webhook_last_error' => null,
                'webhook_next_retry_at' => null,
            ]);
        } catch (\Exception $e) {
            self::handleWebhookFailure($job, $e->getMessage());
        }
    }

    /**
     * Record a failed delivery and, while retries remain, schedule the next one.
     *
     * @param CrawlShotJob $job   Job whose webhook delivery failed.
     * @param string       $error Message stored in webhook_last_error.
     */
    private static function handleWebhookFailure(CrawlShotJob $job, string $error): void
    {
        $currentAttempts = $job->webhook_attempts ?? 0;

        // The retry budget is derived from RETRY_DELAYS, so the table and the
        // limit cannot drift apart; null means the budget is exhausted.
        $delayMinutes = self::RETRY_DELAYS[$currentAttempts] ?? null;
        $nextRetryAt = $delayMinutes !== null ? now()->addMinutes($delayMinutes) : null;

        self::persistQuietly($job, [
            'webhook_attempts' => $currentAttempts + 1,
            'webhook_last_error' => $error,
            'webhook_next_retry_at' => $nextRetryAt,
        ]);

        if ($nextRetryAt !== null) {
            // Schedule retry job
            RetryWebhookJob::dispatch($job->uuid)->delay($nextRetryAt);
        }
    }

    /**
     * Persist webhook bookkeeping without firing model events.
     *
     * send() is invoked from the model's "updated" observer while the status
     * attribute is still dirty; a regular update() would fire "updated" again
     * and re-send the webhook recursively.
     *
     * @param array<string, mixed> $attributes
     */
    private static function persistQuietly(CrawlShotJob $job, array $attributes): void
    {
        $job->fill($attributes)->saveQuietly();
    }
}
|
||||
@@ -0,0 +1,38 @@
|
||||
<?php
|
||||
|
||||
use Illuminate\Database\Migrations\Migration;
|
||||
use Illuminate\Database\Schema\Blueprint;
|
||||
use Illuminate\Support\Facades\Schema;
|
||||
|
||||
return new class extends Migration
{
    /**
     * Run the migrations: add the webhook configuration and delivery-tracking
     * columns to crawl_shot_jobs.
     */
    public function up(): void
    {
        Schema::table('crawl_shot_jobs', function (Blueprint $table) {
            // Explicit length: the default 255-character string is too short for
            // webhook URLs that carry long query strings or signed tokens.
            $table->string('webhook_url', 2048)->nullable()->after('parameters');
            $table->json('webhook_events_filter')->nullable()->after('webhook_url');
            $table->integer('webhook_attempts')->default(0)->after('webhook_events_filter');
            $table->text('webhook_last_error')->nullable()->after('webhook_attempts');
            $table->timestamp('webhook_next_retry_at')->nullable()->after('webhook_last_error');
        });
    }

    /**
     * Reverse the migrations: drop every column added by up().
     */
    public function down(): void
    {
        Schema::table('crawl_shot_jobs', function (Blueprint $table) {
            $table->dropColumn([
                'webhook_url',
                'webhook_events_filter',
                'webhook_attempts',
                'webhook_last_error',
                'webhook_next_retry_at',
            ]);
        });
    }
};
|
||||
73
example_usage.php
Normal file
73
example_usage.php
Normal file
@@ -0,0 +1,73 @@
|
||||
<?php
|
||||
|
||||
require_once 'vendor/autoload.php';

use Crawlshot\Laravel\CrawlshotClient;

// Walkthrough of the Crawlshot client: fluent job builders, typed status
// responses, webhook error management and raw response access.

// Initialize client
$client = new CrawlshotClient('https://crawlshot.test', 'your-api-token');

// Example 1: Fluent interface with webhook
$crawlJob = $client->crawl('https://example.com')
    ->webhookUrl('https://myapp.com/webhook')
    ->webhookEventsFilter(['completed', 'failed'])
    ->blockAds(true)
    ->timeout(30)
    ->create();

echo "Crawl job created: " . $crawlJob->getUuid() . "\n";
echo "Status: " . $crawlJob->getStatus() . "\n";
echo "Is queued: " . ($crawlJob->isQueued() ? 'yes' : 'no') . "\n";

// Example 2: Check status with typed response
$crawlStatus = $client->getCrawlStatus($crawlJob->getUuid());

if ($crawlStatus->isCompleted()) {
    // getResultRaw() is nullable; default to '' so substr() never receives null.
    echo "HTML content: " . substr($crawlStatus->getResultRaw() ?? '', 0, 100) . "...\n";
    echo "Download URL: " . $crawlStatus->getResultUrl() . "\n";

    // Direct download
    $htmlContent = $crawlStatus->downloadHtml();
} elseif ($crawlStatus->isFailed()) {
    echo "Error: " . $crawlStatus->getError() . "\n";
} else {
    echo "Still processing...\n";

    // Refresh status
    $crawlStatus->refresh();
}

// Example 3: Screenshot with fluent interface
$shotJob = $client->shot('https://example.com')
    ->webhookUrl('https://myapp.com/webhook')
    ->webhookEventsFilter(['completed'])
    ->viewportSize(1920, 1080)
    ->quality(90)
    ->create();

$shotStatus = $client->getShotStatus($shotJob->getUuid());

if ($shotStatus->isCompleted()) {
    echo "Image format: " . $shotStatus->getFormat() . "\n";
    // getDimensions() is nullable; implode() requires an array.
    echo "Dimensions: " . implode('x', $shotStatus->getDimensions() ?? []) . "\n";
    echo "File size: " . $shotStatus->getSize() . " bytes\n";

    // Get base64 data or download directly
    $imageData = $shotStatus->getImageData();
    $imageFile = $shotStatus->downloadImage();
}

// Example 4: Webhook error management
$errors = $client->listWebhookErrors();

// Guard against a response without a 'jobs' key.
foreach ($errors['jobs'] ?? [] as $errorJob) {
    echo "Failed webhook for job: " . $errorJob['uuid'] . "\n";

    // Retry or clear
    $client->retryWebhook($errorJob['uuid']);
    // OR
    // $client->clearWebhookError($errorJob['uuid']);
}

// Example 5: Raw response access (backward compatibility)
$rawResponse = $crawlStatus->getRawResponse();
echo "Raw response: " . json_encode($rawResponse, JSON_PRETTY_PRINT) . "\n";
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
use App\Http\Controllers\Api\CrawlController;
|
||||
use App\Http\Controllers\Api\ShotController;
|
||||
use App\Http\Controllers\Api\WebhookErrorController;
|
||||
use Illuminate\Http\Request;
|
||||
use Illuminate\Support\Facades\Route;
|
||||
|
||||
@@ -37,6 +38,13 @@
|
||||
Route::get('/', [ShotController::class, 'index'])->name('api.shot.index'); // Optional: list all screenshot jobs
|
||||
});
|
||||
|
||||
// Webhook error endpoints
|
||||
Route::prefix('webhook-errors')->group(function () {
|
||||
Route::get('/', [WebhookErrorController::class, 'index'])->name('api.webhook-errors.index');
|
||||
Route::post('/{uuid}/retry', [WebhookErrorController::class, 'retry'])->name('api.webhook-errors.retry');
|
||||
Route::delete('/{uuid}/clear', [WebhookErrorController::class, 'clear'])->name('api.webhook-errors.clear');
|
||||
});
|
||||
|
||||
});
|
||||
|
||||
// Health check endpoint (no auth required)
|
||||
|
||||
70
src/CrawlJobBuilder.php
Normal file
70
src/CrawlJobBuilder.php
Normal file
@@ -0,0 +1,70 @@
|
||||
<?php
|
||||
|
||||
namespace Crawlshot\Laravel;
|
||||
|
||||
/**
 * Fluent builder that collects crawl options and submits the job through the client.
 */
class CrawlJobBuilder
{
    protected CrawlshotClient $client;
    protected string $url;
    protected array $options = [];

    public function __construct(CrawlshotClient $client, string $url)
    {
        $this->url = $url;
        $this->client = $client;
    }

    /** URL that receives status notifications for this job. */
    public function webhookUrl(string $webhookUrl): self
    {
        return $this->option('webhook_url', $webhookUrl);
    }

    /** Statuses for which the webhook should be sent. */
    public function webhookEventsFilter(array $events): self
    {
        return $this->option('webhook_events_filter', $events);
    }

    /** Request timeout, in seconds. */
    public function timeout(int $seconds): self
    {
        return $this->option('timeout', $seconds);
    }

    /** Delay option, in milliseconds. */
    public function delay(int $milliseconds): self
    {
        return $this->option('delay', $milliseconds);
    }

    /** Toggle ad blocking. */
    public function blockAds(bool $block = true): self
    {
        return $this->option('block_ads', $block);
    }

    /** Toggle cookie-banner blocking. */
    public function blockCookieBanners(bool $block = true): self
    {
        return $this->option('block_cookie_banners', $block);
    }

    /** Toggle tracker blocking. */
    public function blockTrackers(bool $block = true): self
    {
        return $this->option('block_trackers', $block);
    }

    /** Toggle the wait_until_network_idle option. */
    public function waitUntilNetworkIdle(bool $wait = true): self
    {
        return $this->option('wait_until_network_idle', $wait);
    }

    /**
     * Submit the crawl job with the collected options and wrap the API reply.
     */
    public function create(): CrawlResponse
    {
        return new CrawlResponse(
            $this->client->createCrawl($this->url, $this->options),
            $this->client
        );
    }

    /**
     * Store a single option and return the builder for chaining.
     *
     * @param mixed $value
     */
    private function option(string $key, $value): self
    {
        $this->options[$key] = $value;

        return $this;
    }
}
|
||||
35
src/CrawlResponse.php
Normal file
35
src/CrawlResponse.php
Normal file
@@ -0,0 +1,35 @@
|
||||
<?php
|
||||
|
||||
namespace Crawlshot\Laravel;
|
||||
|
||||
/**
 * Typed view over a crawl job's API response.
 */
class CrawlResponse extends JobResponse
{
    /**
     * Result section of the response, or null while the job is not completed.
     */
    public function getResult(): ?array
    {
        if (!$this->isCompleted()) {
            return null;
        }

        return $this->data['result'] ?? null;
    }

    /**
     * Raw HTML carried inline in the result, if any.
     */
    public function getResultRaw(): ?string
    {
        return $this->getResult()['html']['raw'] ?? null;
    }

    /**
     * URL of the stored HTML file, if any.
     */
    public function getResultUrl(): ?string
    {
        return $this->getResult()['html']['url'] ?? null;
    }

    /**
     * Download the HTML content directly; returns null until the job is completed.
     */
    public function downloadHtml(): ?string
    {
        return $this->isCompleted()
            ? $this->client->downloadCrawl($this->getUuid())
            : null;
    }
}
|
||||
@@ -26,9 +26,10 @@ public function createCrawl(string $url, array $options = []): array
|
||||
/**
 * GET /api/crawl/{uuid} - Get crawl status
 *
 * @param string $uuid Identifier of the crawl job.
 * @return CrawlResponse Typed wrapper around the decoded API response.
 */
public function getCrawlStatus(string $uuid): CrawlResponse
{
    $data = $this->makeRequest('GET', "/api/crawl/{$uuid}");

    return new CrawlResponse($data, $this);
}
|
||||
|
||||
/**
|
||||
@@ -50,9 +51,10 @@ public function createShot(string $url, array $options = []): array
|
||||
/**
 * GET /api/shot/{uuid} - Get screenshot status
 *
 * @param string $uuid Identifier of the screenshot job.
 * @return ShotResponse Typed wrapper around the decoded API response.
 */
public function getShotStatus(string $uuid): ShotResponse
{
    $data = $this->makeRequest('GET', "/api/shot/{$uuid}");

    return new ShotResponse($data, $this);
}
|
||||
|
||||
/**
|
||||
@@ -99,10 +101,50 @@ public function health(): array
|
||||
return $this->makeRequest('GET', '/api/health', [], false);
|
||||
}
|
||||
|
||||
/**
 * List webhook failures via GET /api/webhook-errors.
 *
 * @return array Decoded API response.
 */
public function listWebhookErrors(): array
{
    $failures = $this->makeRequest('GET', '/api/webhook-errors');

    return $failures;
}
|
||||
|
||||
/**
 * Retry a job's webhook via POST /api/webhook-errors/{uuid}/retry.
 *
 * @param string $uuid Identifier of the job whose webhook should be retried.
 * @return array Decoded API response.
 */
public function retryWebhook(string $uuid): array
{
    $endpoint = "/api/webhook-errors/{$uuid}/retry";

    return $this->makeRequest('POST', $endpoint);
}
|
||||
|
||||
/**
 * Clear a job's webhook error via DELETE /api/webhook-errors/{uuid}/clear.
 *
 * @param string $uuid Identifier of the job whose webhook error should be cleared.
 * @return array Decoded API response.
 */
public function clearWebhookError(string $uuid): array
{
    $endpoint = "/api/webhook-errors/{$uuid}/clear";

    return $this->makeRequest('DELETE', $endpoint);
}
|
||||
|
||||
/**
 * Start a fluent builder for a crawl job targeting the given URL.
 *
 * @param string $url Page to crawl.
 */
public function crawl(string $url): CrawlJobBuilder
{
    $builder = new CrawlJobBuilder($this, $url);

    return $builder;
}
|
||||
|
||||
/**
 * Start a fluent builder for a screenshot job targeting the given URL.
 *
 * @param string $url Page to capture.
 */
public function shot(string $url): ShotJobBuilder
{
    $builder = new ShotJobBuilder($this, $url);

    return $builder;
}
|
||||
|
||||
/**
|
||||
* Make HTTP request to API
|
||||
*/
|
||||
protected function makeRequest(string $method, string $endpoint, array $data = [], bool $requiresAuth = true): array
|
||||
public function makeRequest(string $method, string $endpoint, array $data = [], bool $requiresAuth = true): array
|
||||
{
|
||||
$http = Http::when($requiresAuth && $this->token, function ($http) {
|
||||
return $http->withToken($this->token);
|
||||
@@ -111,6 +153,7 @@ protected function makeRequest(string $method, string $endpoint, array $data = [
|
||||
$response = match (strtoupper($method)) {
|
||||
'GET' => $http->get($this->baseUrl . $endpoint),
|
||||
'POST' => $http->post($this->baseUrl . $endpoint, $data),
|
||||
'DELETE' => $http->delete($this->baseUrl . $endpoint),
|
||||
default => throw new \InvalidArgumentException("Unsupported HTTP method: {$method}")
|
||||
};
|
||||
|
||||
|
||||
97
src/JobResponse.php
Normal file
97
src/JobResponse.php
Normal file
@@ -0,0 +1,97 @@
|
||||
<?php
|
||||
|
||||
namespace Crawlshot\Laravel;
|
||||
|
||||
/**
 * Base wrapper around a job's decoded API response, shared by crawl and shot jobs.
 */
abstract class JobResponse
{
    protected array $data;
    protected CrawlshotClient $client;

    public function __construct(array $data, CrawlshotClient $client)
    {
        $this->client = $client;
        $this->data = $data;
    }

    public function getUuid(): string
    {
        return $this->data['uuid'];
    }

    public function getStatus(): string
    {
        return $this->data['status'];
    }

    public function getUrl(): string
    {
        return $this->data['url'];
    }

    public function getCreatedAt(): \DateTime
    {
        return new \DateTime($this->data['created_at']);
    }

    public function getStartedAt(): ?\DateTime
    {
        return $this->optionalDate('started_at');
    }

    public function getCompletedAt(): ?\DateTime
    {
        return $this->optionalDate('completed_at');
    }

    public function getError(): ?string
    {
        return $this->data['error'] ?? null;
    }

    public function isCompleted(): bool
    {
        return $this->statusIs('completed');
    }

    public function isFailed(): bool
    {
        return $this->statusIs('failed');
    }

    public function isProcessing(): bool
    {
        return $this->statusIs('processing');
    }

    public function isQueued(): bool
    {
        return $this->statusIs('queued');
    }

    /**
     * The decoded API response exactly as received.
     */
    public function getRawResponse(): array
    {
        return $this->data;
    }

    /**
     * Re-fetch this job from the API and replace the wrapped data in place.
     *
     * @throws \RuntimeException When the concrete response class is not recognised.
     */
    public function refresh(): static
    {
        // The status endpoint depends on which concrete response this is.
        $endpoint = match (true) {
            $this instanceof CrawlResponse => "/api/crawl/{$this->getUuid()}",
            $this instanceof ShotResponse => "/api/shot/{$this->getUuid()}",
            default => throw new \RuntimeException('Unknown response type'),
        };

        $this->data = $this->client->makeRequest('GET', $endpoint);

        return $this;
    }

    /**
     * Get the result data if completed, otherwise return null
     */
    abstract public function getResult(): ?array;

    /**
     * Parse an optional timestamp field; null when the field is absent or null.
     */
    private function optionalDate(string $field): ?\DateTime
    {
        if (!isset($this->data[$field])) {
            return null;
        }

        return new \DateTime($this->data[$field]);
    }

    /**
     * Whether the job's status equals the given value.
     */
    private function statusIs(string $expected): bool
    {
        return $this->data['status'] === $expected;
    }
}
|
||||
77
src/ShotJobBuilder.php
Normal file
77
src/ShotJobBuilder.php
Normal file
@@ -0,0 +1,77 @@
|
||||
<?php
|
||||
|
||||
namespace Crawlshot\Laravel;
|
||||
|
||||
/**
 * Fluent builder that collects screenshot options and submits the job through the client.
 */
class ShotJobBuilder
{
    protected CrawlshotClient $client;
    protected string $url;
    protected array $options = [];

    public function __construct(CrawlshotClient $client, string $url)
    {
        $this->url = $url;
        $this->client = $client;
    }

    /** URL that receives status notifications for this job. */
    public function webhookUrl(string $webhookUrl): self
    {
        return $this->option('webhook_url', $webhookUrl);
    }

    /** Statuses for which the webhook should be sent. */
    public function webhookEventsFilter(array $events): self
    {
        return $this->option('webhook_events_filter', $events);
    }

    /** Viewport width and height used for the capture. */
    public function viewportSize(int $width, int $height): self
    {
        return $this
            ->option('viewport_width', $width)
            ->option('viewport_height', $height);
    }

    /** Image quality setting. */
    public function quality(int $quality): self
    {
        return $this->option('quality', $quality);
    }

    /** Request timeout, in seconds. */
    public function timeout(int $seconds): self
    {
        return $this->option('timeout', $seconds);
    }

    /** Delay option, in milliseconds. */
    public function delay(int $milliseconds): self
    {
        return $this->option('delay', $milliseconds);
    }

    /** Toggle ad blocking. */
    public function blockAds(bool $block = true): self
    {
        return $this->option('block_ads', $block);
    }

    /** Toggle cookie-banner blocking. */
    public function blockCookieBanners(bool $block = true): self
    {
        return $this->option('block_cookie_banners', $block);
    }

    /** Toggle tracker blocking. */
    public function blockTrackers(bool $block = true): self
    {
        return $this->option('block_trackers', $block);
    }

    /**
     * Submit the screenshot job with the collected options and wrap the API reply.
     */
    public function create(): ShotResponse
    {
        return new ShotResponse(
            $this->client->createShot($this->url, $this->options),
            $this->client
        );
    }

    /**
     * Store a single option and return the builder for chaining.
     *
     * @param mixed $value
     */
    private function option(string $key, $value): self
    {
        $this->options[$key] = $value;

        return $this;
    }
}
|
||||
80
src/ShotResponse.php
Normal file
80
src/ShotResponse.php
Normal file
@@ -0,0 +1,80 @@
|
||||
<?php
|
||||
|
||||
namespace Crawlshot\Laravel;
|
||||
|
||||
/**
 * Typed view over a screenshot job's API response.
 */
class ShotResponse extends JobResponse
{
    /**
     * Result section of the response, or null while the job is not completed.
     */
    public function getResult(): ?array
    {
        if (!$this->isCompleted()) {
            return null;
        }

        return $this->data['result'] ?? null;
    }

    /** Inline image data from the result, if any. */
    public function getImageData(): ?string
    {
        return $this->getResult()['image']['raw'] ?? null;
    }

    /** URL of the stored image, if any. */
    public function getImageUrl(): ?string
    {
        return $this->getResult()['image']['url'] ?? null;
    }

    public function getMimeType(): ?string
    {
        return $this->resultField('mime_type');
    }

    public function getFormat(): ?string
    {
        return $this->resultField('format');
    }

    public function getWidth(): ?int
    {
        return $this->resultField('width');
    }

    public function getHeight(): ?int
    {
        return $this->resultField('height');
    }

    public function getSize(): ?int
    {
        return $this->resultField('size');
    }

    /**
     * Download the image content directly; returns null until the job is completed.
     */
    public function downloadImage(): ?string
    {
        return $this->isCompleted()
            ? $this->client->downloadShot($this->getUuid())
            : null;
    }

    /**
     * Get image dimensions as an array [width, height], or null when either is missing or zero.
     */
    public function getDimensions(): ?array
    {
        $w = $this->getWidth();
        $h = $this->getHeight();

        return ($w && $h) ? [$w, $h] : null;
    }

    /**
     * Read a top-level key of the result, or null when the result or the key is absent.
     *
     * @return mixed
     */
    private function resultField(string $key)
    {
        return $this->getResult()[$key] ?? null;
    }
}
|
||||
Reference in New Issue
Block a user