This commit is contained in:
ct
2025-08-10 21:10:33 +08:00
parent 480bd9055d
commit 583a804073
43 changed files with 7623 additions and 270 deletions

View File

@@ -1,8 +1,8 @@
APP_NAME=Laravel
APP_NAME=Crawlshot
APP_ENV=local
APP_KEY=
APP_DEBUG=true
APP_URL=http://localhost
APP_URL=https://crawlshot.test
APP_LOCALE=en
APP_FALLBACK_LOCALE=en

489
API_DOCUMENTATION.md Normal file
View File

@@ -0,0 +1,489 @@
# Crawlshot API Documentation
Crawlshot is a self-hosted web crawling and screenshot service built with Laravel and Spatie Browsershot. This API provides endpoints for capturing web content and generating screenshots with advanced filtering capabilities.
## Base URL
```
https://crawlshot.test
```
## Authentication
All API endpoints (except health check) require authentication using Laravel Sanctum API tokens.
### Authentication Header
```http
Authorization: Bearer {your-api-token}
```
### Example API Token
```
1|rrWUM5ZkmLfGipkm1oIusYX45KbukIekUwMjgB3Nd1121a5c
```
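From another Laravel application, the same header can be attached with the framework's HTTP client. A minimal sketch (the token value is a placeholder, not a real credential):
```php
use Illuminate\Support\Facades\Http;

// Attach the Sanctum token as a Bearer header on every request.
$health = Http::withToken('1|your-api-token')
    ->acceptJson()
    ->get('https://crawlshot.test/api/health')
    ->json();

// ['status' => 'healthy', 'timestamp' => '...', 'service' => 'crawlshot']
```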
---
## Health Check
### GET `/api/health`
Check if the Crawlshot service is running and healthy.
**Authentication:** Not required
#### Request Example
```bash
curl -X GET "https://crawlshot.test/api/health" \
-H "Accept: application/json"
```
#### Response Example
```json
{
"status": "healthy",
"timestamp": "2025-08-10T09:54:52.195383Z",
"service": "crawlshot"
}
```
---
## Web Crawling APIs
### POST `/api/crawl`
Initiate a web crawling job to extract HTML content from a URL.
**Authentication:** Required
#### Request Parameters
| Parameter | Type | Required | Default | Description |
|-----------|------|----------|---------|-------------|
| `url` | string | ✅ | - | Target URL to crawl (max 2048 chars) |
| `timeout` | integer | ❌ | 30 | Request timeout in seconds (5-300) |
| `delay` | integer | ❌ | 0 | Wait time before capture in milliseconds (0-30000) |
| `block_ads` | boolean | ❌ | true | Block ads using EasyList filters |
| `block_cookie_banners` | boolean | ❌ | true | Block cookie consent banners |
| `block_trackers` | boolean | ❌ | true | Block tracking scripts |
| `wait_until_network_idle` | boolean | ❌ | false | Wait for network activity to cease |
#### Request Example
```bash
curl -X POST "https://crawlshot.test/api/crawl" \
-H "Accept: application/json" \
-H "Content-Type: application/json" \
-H "Authorization: Bearer 1|rrWUM5ZkmLfGipkm1oIusYX45KbukIekUwMjgB3Nd1121a5c" \
-d '{
"url": "https://example.com",
"timeout": 30,
"delay": 2000,
"block_ads": true,
"block_cookie_banners": true,
"block_trackers": true,
"wait_until_network_idle": true
}'
```
#### Response Example
```json
{
"uuid": "b5dc483b-f62d-4e40-8b9e-4715324a8cbb",
"status": "queued",
"message": "Crawl job initiated successfully"
}
```
---
### GET `/api/crawl/{uuid}`
Check the status and retrieve results of a crawl job.
**Authentication:** Required
#### Path Parameters
| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
| `uuid` | string | ✅ | Job UUID returned from crawl initiation |
#### Request Example
```bash
curl -X GET "https://crawlshot.test/api/crawl/b5dc483b-f62d-4e40-8b9e-4715324a8cbb" \
-H "Accept: application/json" \
-H "Authorization: Bearer 1|rrWUM5ZkmLfGipkm1oIusYX45KbukIekUwMjgB3Nd1121a5c"
```
#### Response Examples
**Queued Status:**
```json
{
"uuid": "b5dc483b-f62d-4e40-8b9e-4715324a8cbb",
"status": "queued",
"url": "https://example.com",
"created_at": "2025-08-10T10:00:42.000000Z"
}
```
**Processing Status:**
```json
{
"uuid": "b5dc483b-f62d-4e40-8b9e-4715324a8cbb",
"status": "processing",
"url": "https://example.com",
"created_at": "2025-08-10T10:00:42.000000Z",
"started_at": "2025-08-10T10:00:45.000000Z"
}
```
**Completed Status:**
```json
{
"uuid": "b5dc483b-f62d-4e40-8b9e-4715324a8cbb",
"status": "completed",
"url": "https://example.com",
"created_at": "2025-08-10T10:00:42.000000Z",
"started_at": "2025-08-10T10:00:45.000000Z",
"completed_at": "2025-08-10T10:01:12.000000Z",
"result": "<!doctype html>\n<html>\n<head>\n <title>Example Domain</title>\n</head>\n<body>\n <h1>Example Domain</h1>\n <p>This domain is for use in illustrative examples...</p>\n</body>\n</html>"
}
```
**Failed Status:**
```json
{
"uuid": "b5dc483b-f62d-4e40-8b9e-4715324a8cbb",
"status": "failed",
"url": "https://example.com",
"created_at": "2025-08-10T10:00:42.000000Z",
"started_at": "2025-08-10T10:00:45.000000Z",
"completed_at": "2025-08-10T10:00:50.000000Z",
"error": "Timeout: Navigation failed after 30 seconds"
}
```
---
### GET `/api/crawl`
List all crawl jobs with pagination (optional endpoint for debugging).
**Authentication:** Required
#### Request Example
```bash
curl -X GET "https://crawlshot.test/api/crawl" \
-H "Accept: application/json" \
-H "Authorization: Bearer 1|rrWUM5ZkmLfGipkm1oIusYX45KbukIekUwMjgB3Nd1121a5c"
```
#### Response Example
```json
{
"jobs": [
{
"uuid": "b5dc483b-f62d-4e40-8b9e-4715324a8cbb",
"type": "crawl",
"url": "https://example.com",
"status": "completed",
"created_at": "2025-08-10T10:00:42.000000Z",
"completed_at": "2025-08-10T10:01:12.000000Z"
}
],
"pagination": {
"current_page": 1,
"total_pages": 5,
"total_items": 100,
"per_page": 20
}
}
```
---
## Screenshot APIs
### POST `/api/shot`
Initiate a screenshot job to capture an image of a webpage.
**Authentication:** Required
#### Request Parameters
| Parameter | Type | Required | Default | Description |
|-----------|------|----------|---------|-------------|
| `url` | string | ✅ | - | Target URL to screenshot (max 2048 chars) |
| `viewport_width` | integer | ❌ | 1920 | Viewport width in pixels (320-3840) |
| `viewport_height` | integer | ❌ | 1080 | Viewport height in pixels (240-2160) |
| `format` | string | ❌ | "jpg" | Image format: "jpg", "png", "webp" |
| `quality` | integer | ❌ | 90 | Image quality 1-100 (for JPEG/WebP) |
| `timeout` | integer | ❌ | 30 | Request timeout in seconds (5-300) |
| `delay` | integer | ❌ | 0 | Wait time before capture in milliseconds (0-30000) |
| `block_ads` | boolean | ❌ | true | Block ads using EasyList filters |
| `block_cookie_banners` | boolean | ❌ | true | Block cookie consent banners |
| `block_trackers` | boolean | ❌ | true | Block tracking scripts |
#### Request Example
```bash
curl -X POST "https://crawlshot.test/api/shot" \
-H "Accept: application/json" \
-H "Content-Type: application/json" \
-H "Authorization: Bearer 1|rrWUM5ZkmLfGipkm1oIusYX45KbukIekUwMjgB3Nd1121a5c" \
-d '{
"url": "https://example.com",
"viewport_width": 1920,
"viewport_height": 1080,
"format": "webp",
"quality": 90,
"timeout": 30,
"delay": 2000,
"block_ads": true,
"block_cookie_banners": true,
"block_trackers": true
}'
```
#### Response Example
```json
{
"uuid": "fe37d511-99cb-4295-853b-6d484900a851",
"status": "queued",
"message": "Screenshot job initiated successfully"
}
```
---
### GET `/api/shot/{uuid}`
Check the status and retrieve results of a screenshot job. When completed, returns base64 image data and download URL.
**Authentication:** Required
#### Path Parameters
| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
| `uuid` | string | ✅ | Job UUID returned from screenshot initiation |
#### Request Example
```bash
curl -X GET "https://crawlshot.test/api/shot/fe37d511-99cb-4295-853b-6d484900a851" \
-H "Accept: application/json" \
-H "Authorization: Bearer 1|rrWUM5ZkmLfGipkm1oIusYX45KbukIekUwMjgB3Nd1121a5c"
```
#### Response Examples
**Queued Status:**
```json
{
"uuid": "fe37d511-99cb-4295-853b-6d484900a851",
"status": "queued",
"url": "https://example.com",
"created_at": "2025-08-10T10:05:42.000000Z"
}
```
**Processing Status:**
```json
{
"uuid": "fe37d511-99cb-4295-853b-6d484900a851",
"status": "processing",
"url": "https://example.com",
"created_at": "2025-08-10T10:05:42.000000Z",
"started_at": "2025-08-10T10:05:45.000000Z"
}
```
**Completed Status:**
```json
{
"uuid": "fe37d511-99cb-4295-853b-6d484900a851",
"status": "completed",
"url": "https://example.com",
"created_at": "2025-08-10T10:05:42.000000Z",
"started_at": "2025-08-10T10:05:45.000000Z",
"completed_at": "2025-08-10T10:06:12.000000Z",
"result": {
"image_data": "iVBORw0KGgoAAAANSUhEUgAAAHgAAAAyCAYAAACXpx/Y...",
"download_url": "https://crawlshot.test/api/shot/fe37d511-99cb-4295-853b-6d484900a851/download",
"mime_type": "image/webp",
"format": "webp",
"width": 1920,
"height": 1080,
"size": 45678
}
}
```
**Failed Status:**
```json
{
"uuid": "fe37d511-99cb-4295-853b-6d484900a851",
"status": "failed",
"url": "https://example.com",
"created_at": "2025-08-10T10:05:42.000000Z",
"started_at": "2025-08-10T10:05:45.000000Z",
"completed_at": "2025-08-10T10:05:50.000000Z",
"error": "Timeout: Navigation failed after 30 seconds"
}
```
---
### GET `/api/shot/{uuid}/download`
Download the screenshot file directly. Returns the actual image file with appropriate headers.
**Authentication:** Required
#### Path Parameters
| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
| `uuid` | string | ✅ | Job UUID of a completed screenshot job |
#### Request Example
```bash
curl -X GET "https://crawlshot.test/api/shot/fe37d511-99cb-4295-853b-6d484900a851/download" \
-H "Authorization: Bearer 1|rrWUM5ZkmLfGipkm1oIusYX45KbukIekUwMjgB3Nd1121a5c" \
--output screenshot.webp
```
#### Response
Returns the image file directly with appropriate `Content-Type` headers:
- `Content-Type: image/jpeg` for JPEG files
- `Content-Type: image/png` for PNG files
- `Content-Type: image/webp` for WebP files
---
### GET `/api/shot`
List all screenshot jobs with pagination (optional endpoint for debugging).
**Authentication:** Required
#### Request Example
```bash
curl -X GET "https://crawlshot.test/api/shot" \
-H "Accept: application/json" \
-H "Authorization: Bearer 1|rrWUM5ZkmLfGipkm1oIusYX45KbukIekUwMjgB3Nd1121a5c"
```
#### Response Example
```json
{
"jobs": [
{
"uuid": "fe37d511-99cb-4295-853b-6d484900a851",
"type": "shot",
"url": "https://example.com",
"status": "completed",
"created_at": "2025-08-10T10:05:42.000000Z",
"completed_at": "2025-08-10T10:06:12.000000Z"
}
],
"pagination": {
"current_page": 1,
"total_pages": 3,
"total_items": 50,
"per_page": 20
}
}
```
---
## Job Status Flow
Both crawl and screenshot jobs follow the same status progression:
1. **`queued`** - Job created and waiting for processing
2. **`processing`** - Job is currently being executed by a worker
3. **`completed`** - Job finished successfully, results available
4. **`failed`** - Job encountered an error and could not complete
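Because jobs are processed asynchronously, clients should poll the status endpoint until the job reaches a terminal state (`completed` or `failed`). A minimal polling sketch using Laravel's HTTP client (token and base URL are placeholders):
```php
use Illuminate\Support\Facades\Http;

function waitForJob(string $uuid, string $token, int $maxAttempts = 30): array
{
    for ($attempt = 0; $attempt < $maxAttempts; $attempt++) {
        $job = Http::withToken($token)
            ->acceptJson()
            ->get("https://crawlshot.test/api/crawl/{$uuid}")
            ->json();

        // Stop polling once the job leaves the queued/processing states.
        if (in_array($job['status'], ['completed', 'failed'], true)) {
            return $job;
        }

        sleep(2); // simple fixed back-off between polls
    }

    throw new RuntimeException("Job {$uuid} did not finish within the polling window");
}
```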
## Error Responses
### 401 Unauthorized
```json
{
"message": "Unauthenticated."
}
```
### 404 Not Found
```json
{
"error": "Job not found"
}
```
### 422 Validation Error
```json
{
"message": "The given data was invalid.",
"errors": {
"url": [
"The url field is required."
],
"timeout": [
"The timeout must be between 5 and 300."
]
}
}
```
## Features
### Ad & Tracker Blocking
- **EasyList Integration**: Automatically downloads and applies EasyList filters
- **Cookie Banner Blocking**: Removes cookie consent prompts
- **Tracker Blocking**: Blocks Google Analytics, Facebook Pixel, and other tracking scripts
- **Custom Domain Blocking**: Blocks common advertising and tracking domains
### Image Processing
- **Multiple Formats**: Support for JPEG, PNG, and WebP
- **Quality Control**: Adjustable compression quality (1-100)
- **Imagick Integration**: High-quality image processing and format conversion
- **Responsive Sizing**: Custom viewport dimensions up to 4K resolution
### Storage & Cleanup
- **24-Hour TTL**: All files automatically deleted after 24 hours
- **Scheduled Cleanup**: Daily automated cleanup of expired files (see the scheduling sketch below)
- **Manual Cleanup**: `php artisan crawlshot:prune-storage` command available
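A minimal sketch of how the daily cleanup could be registered with the Laravel scheduler (assuming it lives in `routes/console.php`, which is not shown in this commit):
```php
use Illuminate\Support\Facades\Schedule;

// Prune HTML and image results older than 24 hours once per day.
Schedule::command('crawlshot:prune-storage --hours=24')->daily();
```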
### Performance
- **Background Processing**: All jobs processed asynchronously via Laravel Horizon
- **Queue Management**: Built-in retry logic and failure handling
- **Caching**: EasyList filters cached for optimal performance
- **Monitoring**: Horizon dashboard for real-time job monitoring at `/horizon`
## Rate Limiting
API endpoints include rate limiting to prevent abuse. Contact your system administrator for current rate limit settings.
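The exact limits are deployment-specific. A reasonable client-side pattern is to back off when the API answers with HTTP 429; a sketch under that assumption:
```php
use Illuminate\Support\Facades\Http;

$token = '1|your-api-token'; // placeholder

$response = Http::withToken($token)
    ->acceptJson()
    ->post('https://crawlshot.test/api/crawl', ['url' => 'https://example.com']);

if ($response->status() === 429) {
    // Respect the Retry-After header when present; otherwise wait a default 10 seconds.
    sleep((int) ($response->header('Retry-After') ?: 10));
    // ...then retry the request.
}
```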
## Support
For technical support or questions about the Crawlshot API, please refer to the system documentation or contact your administrator.

262
CLAUDE.md Normal file
View File

@@ -0,0 +1,262 @@
# CLAUDE.md
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
## Project Overview
Crawlshot is a self-hosted API service built on Laravel 12 that provides web crawling and screenshot capabilities using Spatie Browsershot. It's designed as a self-hosted alternative to services like ScreenshotOne, offering browser automation through a REST API with authentication and job processing.
### Core Features
- **Web Crawling**: HTML extraction using headless Chrome via Spatie Browsershot
- **Screenshots**: Image capture using Imagick with customizable dimensions
- **Ad/Tracker Blocking**: Built-in blocking of ads, cookie banners, and trackers
- **Authentication**: Laravel Sanctum API token authentication
- **Job Processing**: Laravel Horizon for background job management
- **Temporary Storage**: 24-hour auto-deletion of crawl results
- **Status Tracking**: UUID-based job status monitoring
### Technology Stack
- **Backend**: PHP 8.3+ with Laravel 12 framework
- **Browser Automation**: Spatie Browsershot (Puppeteer/Chrome headless)
- **Queue System**: Laravel Horizon for job processing
- **Authentication**: Laravel Sanctum for API tokens
- **Testing**: Pest PHP testing framework
- **Database**: SQLite (development) for job tracking and API tokens
## API Endpoints
### Core API Routes
```
POST /api/crawl
- Initiates an HTML crawl job
- Parameters: url, timeout, delay, block_ads, block_cookie_banners, block_trackers, wait_until_network_idle
- Returns: {"uuid": "job-uuid", "status": "queued"}

GET /api/crawl/{uuid}
- Checks crawl job status and retrieves results
- Returns: {"status": "queued|processing|completed|failed", "result": "html content"}

POST /api/shot
- Initiates a screenshot job
- Parameters: url, viewport_width, viewport_height, format, quality, timeout, delay, block_ads, block_cookie_banners, block_trackers
- Returns: {"uuid": "job-uuid", "status": "queued"}

GET /api/shot/{uuid}
- Checks screenshot job status and retrieves results (base64 image data and download URL)
```
### Supported Parameters (mapped to Browsershot capabilities)
**HTML Crawling**:
- `url`: Target URL to crawl
- `timeout`: Request timeout in seconds (via `timeout()` method)
- `block_ads`: true/false - Uses EasyList filter (https://easylist.to/easylist/easylist.txt)
- `block_cookie_banners`: true/false - Uses cookie banner blocking patterns
- `block_trackers`: true/false - Uses tracker blocking patterns
- `delay`: Wait time before capture in milliseconds (via `setDelay()`)
- `wait_until_network_idle`: Wait for network activity to cease (via `waitUntilNetworkIdle()`)
**Screenshot Capture**:
- `url`: Target URL to screenshot
- `viewport_width`: Viewport width (via `windowSize()` method)
- `viewport_height`: Viewport height (via `windowSize()` method)
- `format`: jpg, png, webp (via Imagick post-processing)
- `quality`: Image quality 1-100 for JPEG (via `setScreenshotType('jpeg', quality)`)
- `block_ads`: true/false - Uses EasyList filter for ad blocking
- `block_cookie_banners`: true/false - Uses cookie banner blocking patterns
- `block_trackers`: true/false - Uses tracker blocking patterns
- `timeout`: Request timeout in seconds (via `timeout()` method)
- `delay`: Wait time before capture in milliseconds (via `setDelay()`)
## Development Commands
### Starting the Development Environment
The user will start the development environment. Do not start it yourself; prompt the user to start it instead.
### Queue Management with Horizon
The user will start Horizon. Do not start it yourself; prompt the user to start it instead.
```bash
# Horizon dashboard available at: /horizon
# Monitor job queues, failed jobs, and metrics
```
### Individual Services
Do not start them yourself; prompt the user to start them instead.
### Testing
```bash
# Run all tests using Pest
composer run test
# Run API endpoint tests
php artisan test --filter=Api
# Test browsershot functionality
php artisan test tests/Feature/BrowsershotTest.php
```
### Database Operations
Never run database migrations yourself; prompt the user to run them instead.
### API Token Management
```bash
# Generate API tokens via Tinker
php artisan tinker
# User::find(1)->createToken('client-name')->plainTextToken
# Prune expired tokens
php artisan sanctum:prune-expired --hours=24
```
### Storage Management
```bash
# Prune expired crawl results (HTML and images older than 24 hours)
php artisan crawlshot:prune-storage
# Run storage cleanup via scheduled job
php artisan schedule:run
```
### Browsershot Setup Requirements
```bash
# Install Node.js and Puppeteer dependencies
npm install puppeteer
# For production servers, ensure Chrome/Chromium is installed
# Ubuntu/Debian: apt-get install chromium-browser
# Alpine: apk add chromium
# Or use Puppeteer's bundled Chromium
```
## Architecture Overview
### Job Processing Flow
1. **Crawl API Request** → `/api/crawl` with URL and parameters
2. **Screenshot API Request** → `/api/shot` with URL and parameters
3. **Job Creation** → Queue job with UUID, store in database
4. **Processing** → Horizon worker uses Browsershot to capture content
5. **Storage** → Save HTML/image to storage with 24h expiry
6. **Status Check** → `/api/crawl/{uuid}` or `/api/shot/{uuid}` returns the result when ready
### Directory Structure
```
app/
├── Http/Controllers/Api/
│   ├── CrawlController.php       # Main API endpoints (/crawl, /crawl/{uuid})
│   └── ShotController.php        # Main API endpoints (/shot, /shot/{uuid})
├── Jobs/
│   ├── ProcessCrawlShotJob.php   # Browsershot integration
│   └── CleanupOldResults.php     # Auto-delete expired files
├── Models/
│   ├── CrawlShotJob.php          # Job tracking model
│   └── User.php                  # API token authentication
└── Services/
    ├── BrowsershotService.php    # Browsershot wrapper with filtering
    └── EasyListService.php       # ProtonMail php-adblock-parser wrapper

storage/app/crawlshot/            # Temporary result storage (24h TTL)
├── html/                         # HTML crawl results
└── images/                       # Screenshot files (JPEG/PNG/WebP)

routes/
└── api.php                       # /crawl endpoints with Sanctum auth
```
### Browsershot Configuration
```php
// Basic screenshot configuration with EasyList ad blocking
$browsershot = Browsershot::url($url)
->windowSize($width, $height)
->setScreenshotType('png') // Save as PNG first for Imagick processing
->setDelay($delayInMs)
->waitUntilNetworkIdle()
->timeout($timeoutInSeconds);
// Apply EasyList filters if block_ads is true
if ($blockAds) {
$blockedDomains = EasyListService::getBlockedDomains($url);
$blockedUrls = EasyListService::getBlockedUrls($url);
$browsershot->blockDomains($blockedDomains)->blockUrls($blockedUrls);
}
$tempPath = storage_path('temp_screenshot.png');
$browsershot->save($tempPath);
// Convert to desired format using Imagick if needed
if ($format === 'webp') {
$imagick = new Imagick($tempPath);
$imagick->setImageFormat('webp');
$imagick->writeImage($finalPath);
unlink($tempPath);
}
// HTML crawling configuration with EasyList filtering
$browsershot = Browsershot::url($url)
->setDelay($delayInMs)
->waitUntilNetworkIdle()
->timeout($timeoutInSeconds);
// Apply EasyList filters if block_ads is true
if ($blockAds) {
$blockedDomains = EasyListService::getBlockedDomains($url);
$blockedUrls = EasyListService::getBlockedUrls($url);
$browsershot->blockDomains($blockedDomains)->blockUrls($blockedUrls);
}
$html = $browsershot->bodyHtml();
```
### Job States
- **queued**: Job created, waiting for processing
- **processing**: Horizon worker running Browsershot
- **completed**: Result stored, available via status endpoint
- **failed**: Browsershot error, timeout, or invalid URL
### Storage Strategy
- HTML results: `storage/app/crawlshot/html/{uuid}.html`
- Image results: `storage/app/crawlshot/images/{uuid}.jpg`, `.png`, or `.webp`
- Auto-cleanup scheduled job removes files after 24 hours
- Database tracks job metadata and file paths
### Authentication & Security
- All API endpoints protected by Sanctum middleware (see the route sketch below)
- Bearer token required in Authorization header
- Rate limiting on crawl endpoints to prevent abuse
- Input validation for URLs and parameters
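`routes/api.php` itself is not reproduced in this file, but a minimal sketch consistent with the endpoints and middleware described above could look like the following (the `throttle:60,1` limit is an assumed placeholder, not the project's actual setting):
```php
use App\Http\Controllers\Api\CrawlController;
use App\Http\Controllers\Api\ShotController;
use Illuminate\Support\Facades\Route;

// Unauthenticated health check
Route::get('/health', fn () => response()->json([
    'status' => 'healthy',
    'timestamp' => now()->toISOString(),
    'service' => 'crawlshot',
]));

// Everything else requires a Sanctum bearer token and is rate limited
Route::middleware(['auth:sanctum', 'throttle:60,1'])->group(function () {
    Route::post('/crawl', [CrawlController::class, 'crawl']);
    Route::get('/crawl/{uuid}', [CrawlController::class, 'status']);
    Route::get('/crawl', [CrawlController::class, 'index']);

    Route::post('/shot', [ShotController::class, 'shot']);
    Route::get('/shot/{uuid}/download', [ShotController::class, 'serve']);
    Route::get('/shot/{uuid}', [ShotController::class, 'status']);
    Route::get('/shot', [ShotController::class, 'index']);
});
```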
### System Requirements
- PHP 8.3+ with extensions: gd, imagick (required for WebP format)
- Node.js and npm for Puppeteer
- Chrome/Chromium browser (headless)
- Sufficient disk space for temporary file storage
- Memory for concurrent Browsershot processes
### EasyList Integration
- Uses ProtonMail's php-adblock-parser (https://github.com/ProtonMail/php-adblock-parser)
- Service downloads and caches EasyList filters from https://easylist.to/easylist/easylist.txt
- php-adblock-parser handles filter parsing and URL matching
- Filters converted to domains/URLs for `blockDomains()` and `blockUrls()` methods
- Cache updated periodically to maintain current ad blocking effectiveness
- Cookie banner and tracker blocking use additional filter lists (EasyList Cookie, Fanboy's Annoyance)
### Development Notes
- Horizon required for proper queue processing
- Chrome/Chromium must be accessible to PHP process
- Consider Docker for consistent browser environment
- Monitor disk usage due to temporary file storage
- EasyList filters cached locally for performance using php-adblock-parser
- Test with various websites for ad/tracker blocking effectiveness

View File

@@ -0,0 +1,19 @@
meta {
name: Health Check
type: http
seq: 1
}
get {
url: {{base_url}}/api/health
body: none
auth: bearer
}
headers {
Accept: application/json
}
auth:bearer {
token: {{api_token}}
}

View File

@@ -0,0 +1,15 @@
meta {
name: Download Screenshot File
type: http
seq: 3
}
get {
url: {{base_url}}/api/shot/:uuid/download
body: none
auth: bearer
}
auth:bearer {
token: {{api_token}}
}

View File

@@ -0,0 +1,19 @@
meta {
name: Get Screenshot Status - Results
type: http
seq: 2
}
get {
url: {{base_url}}/api/shot/:uuid
body: none
auth: bearer
}
headers {
Accept: application/json
}
auth:bearer {
token: {{api_token}}
}

View File

@@ -0,0 +1,35 @@
meta {
name: Initiate Screenshot Job
type: http
seq: 1
}
post {
url: {{base_url}}/api/shot
body: json
auth: bearer
}
headers {
Accept: application/json
Content-Type: application/json
}
auth:bearer {
token: {{api_token}}
}
body:json {
{
"url": "https://example.com",
"viewport_width": 1920,
"viewport_height": 1080,
"format": "webp",
"quality": 90,
"timeout": 30,
"delay": 2000,
"block_ads": true,
"block_cookie_banners": true,
"block_trackers": true
}
}

View File

@@ -0,0 +1,19 @@
meta {
name: List Screenshot Jobs
type: http
seq: 4
}
get {
url: {{base_url}}/api/shot
body: none
auth: bearer
}
headers {
Accept: application/json
}
auth:bearer {
token: {{api_token}}
}

View File

@@ -0,0 +1,19 @@
meta {
name: Get Crawl Status - Results
type: http
seq: 2
}
get {
url: {{base_url}}/api/crawl/:uuid
body: none
auth: bearer
}
headers {
Accept: application/json
}
auth:bearer {
token: {{api_token}}
}

View File

@@ -0,0 +1,32 @@
meta {
name: Initiate Crawl Job
type: http
seq: 1
}
post {
url: {{base_url}}/api/crawl
body: json
auth: bearer
}
headers {
Accept: application/json
Content-Type: application/json
}
auth:bearer {
token: {{api_token}}
}
body:json {
{
"url": "https://example.com",
"timeout": 30,
"delay": 2000,
"block_ads": true,
"block_cookie_banners": true,
"block_trackers": true,
"wait_until_network_idle": true
}
}

View File

@@ -0,0 +1,19 @@
meta {
name: List Crawl Jobs
type: http
seq: 3
}
get {
url: {{base_url}}/api/crawl
body: none
auth: bearer
}
headers {
Accept: application/json
}
auth:bearer {
token: {{api_token}}
}

9
Crawlshot API/bruno.json Normal file
View File

@@ -0,0 +1,9 @@
{
"version": "1",
"name": "Crawlshot API",
"type": "collection",
"ignore": [
"node_modules",
".git"
]
}

View File

@@ -0,0 +1,6 @@
vars {
base_url: https://crawlshot.test
}
vars:secret [
api_token
]

215
README.md
View File

@@ -1,61 +1,198 @@
<p align="center"><a href="https://laravel.com" target="_blank"><img src="https://raw.githubusercontent.com/laravel/art/master/logo-lockup/5%20SVG/2%20CMYK/1%20Full%20Color/laravel-logolockup-cmyk-red.svg" width="400" alt="Laravel Logo"></a></p>
# Crawlshot
<p align="center">
<a href="https://github.com/laravel/framework/actions"><img src="https://github.com/laravel/framework/workflows/tests/badge.svg" alt="Build Status"></a>
<a href="https://packagist.org/packages/laravel/framework"><img src="https://img.shields.io/packagist/dt/laravel/framework" alt="Total Downloads"></a>
<a href="https://packagist.org/packages/laravel/framework"><img src="https://img.shields.io/packagist/v/laravel/framework" alt="Latest Stable Version"></a>
<a href="https://packagist.org/packages/laravel/framework"><img src="https://img.shields.io/packagist/l/laravel/framework" alt="License"></a>
</p>
A Laravel web crawling and screenshot service with dual deployment options:
## About Laravel
1. **Standalone API Service** - Full Laravel application with REST API endpoints
2. **Laravel Package** - HTTP client package for use in other Laravel applications
Laravel is a web application framework with expressive, elegant syntax. We believe development must be an enjoyable and creative experience to be truly fulfilling. Laravel takes the pain out of development by easing common tasks used in many web projects, such as:
## Architecture Overview
- [Simple, fast routing engine](https://laravel.com/docs/routing).
- [Powerful dependency injection container](https://laravel.com/docs/container).
- Multiple back-ends for [session](https://laravel.com/docs/session) and [cache](https://laravel.com/docs/cache) storage.
- Expressive, intuitive [database ORM](https://laravel.com/docs/eloquent).
- Database agnostic [schema migrations](https://laravel.com/docs/migrations).
- [Robust background job processing](https://laravel.com/docs/queues).
- [Real-time event broadcasting](https://laravel.com/docs/broadcasting).
### Standalone API Service
The main Laravel application provides a complete web crawling and screenshot service:
Laravel is accessible, powerful, and provides tools required for large, robust applications.
- **Spatie Browsershot Integration** - Uses Puppeteer for browser automation
- **EasyList Ad Blocking** - Automatic ad/tracker blocking using EasyList filters
- **Queue Processing** - Laravel Horizon for async job processing
- **24-hour Cleanup** - Automatic file and database cleanup
- **Sanctum Authentication** - API token-based authentication
- **SQLite Database** - Stores job metadata and processing status
## Learning Laravel
### Laravel Package
Simple HTTP client package that provides a clean interface to the API:
Laravel has the most extensive and thorough [documentation](https://laravel.com/docs) and video tutorial library of all modern web application frameworks, making it a breeze to get started with the framework.
- **8 Methods for 8 APIs** - Direct 1:1 mapping to REST endpoints
- **Facade Support** - Clean Laravel integration
- **Auto-discovery** - Automatic service provider registration
You may also try the [Laravel Bootcamp](https://bootcamp.laravel.com), where you will be guided through building a modern Laravel application from scratch.
## Deployment Options
If you don't feel like reading, [Laracasts](https://laracasts.com) can help. Laracasts contains thousands of video tutorials on a range of topics including Laravel, modern PHP, unit testing, and JavaScript. Boost your skills by digging into our comprehensive video library.
### Option 1: Standalone API Service
## Laravel Sponsors
Deploy as a complete Laravel application:
We would like to extend our thanks to the following sponsors for funding Laravel development. If you are interested in becoming a sponsor, please visit the [Laravel Partners program](https://partners.laravel.com).
```bash
git clone [repository]
cd crawlshot
composer install
npm install puppeteer
php artisan migrate
php artisan serve
```
### Premium Partners
**API Endpoints:**
- `POST /api/crawl` - Create HTML crawl job
- `GET /api/crawl/{uuid}` - Get crawl status/result
- `GET /api/crawl` - List all crawl jobs
- `POST /api/shot` - Create screenshot job
- `GET /api/shot/{uuid}` - Get screenshot status/result
- `GET /api/shot/{uuid}/download` - Download screenshot file
- `GET /api/shot` - List all screenshot jobs
- `GET /api/health` - Health check
- **[Vehikl](https://vehikl.com)**
- **[Tighten Co.](https://tighten.co)**
- **[Kirschbaum Development Group](https://kirschbaumdevelopment.com)**
- **[64 Robots](https://64robots.com)**
- **[Curotec](https://www.curotec.com/services/technologies/laravel)**
- **[DevSquad](https://devsquad.com/hire-laravel-developers)**
- **[Redberry](https://redberry.international/laravel-development)**
- **[Active Logic](https://activelogic.com)**
**Example API Usage:**
```bash
# Create crawl job
curl -X POST "https://crawlshot.test/api/crawl" \
-H "Authorization: Bearer {token}" \
-H "Content-Type: application/json" \
-d '{"url": "https://example.com", "block_ads": true}'
## Contributing
# Check status
curl -H "Authorization: Bearer {token}" \
"https://crawlshot.test/api/crawl/{uuid}"
```
Thank you for considering contributing to the Laravel framework! The contribution guide can be found in the [Laravel documentation](https://laravel.com/docs/contributions).
### Option 2: Laravel Package
## Code of Conduct
Install as a package in your Laravel application:
In order to ensure that the Laravel community is welcoming to all, please review and abide by the [Code of Conduct](https://laravel.com/docs/contributions#code-of-conduct).
```bash
composer require crawlshot/laravel
php artisan vendor:publish --tag=crawlshot-config
```
## Security Vulnerabilities
**Configuration:**
```env
CRAWLSHOT_BASE_URL=https://your-crawlshot-api.com
CRAWLSHOT_TOKEN=your-sanctum-token
```
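The published `src/config/crawlshot.php` presumably just maps these variables onto config keys; a minimal sketch (the key names are assumptions, since the file itself is not shown here):
```php
<?php

// config/crawlshot.php (published via vendor:publish --tag=crawlshot-config)
return [
    'base_url' => env('CRAWLSHOT_BASE_URL', 'https://crawlshot.test'),
    'token'    => env('CRAWLSHOT_TOKEN'),
];
```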
If you discover a security vulnerability within Laravel, please send an e-mail to Taylor Otwell via [taylor@laravel.com](mailto:taylor@laravel.com). All security vulnerabilities will be promptly addressed.
**Package Usage:**
```php
use Crawlshot\Laravel\Facades\Crawlshot;
// Create crawl job
$response = Crawlshot::createCrawl('https://example.com', [
'block_ads' => true,
'timeout' => 30
]);
// Check status
$status = Crawlshot::getCrawlStatus($response['uuid']);
// Create screenshot
$response = Crawlshot::createShot('https://example.com', [
'format' => 'jpg',
'width' => 1920,
'height' => 1080
]);
// Download screenshot
$imageData = Crawlshot::downloadShot($response['uuid']);
file_put_contents('screenshot.jpg', $imageData);
```
## API Reference
### Available Methods (Package)
| Method | API Endpoint | Description |
|--------|--------------|-------------|
| `createCrawl(string $url, array $options = [])` | `POST /api/crawl` | Create crawl job |
| `getCrawlStatus(string $uuid)` | `GET /api/crawl/{uuid}` | Get crawl status |
| `listCrawls()` | `GET /api/crawl` | List all crawl jobs |
| `createShot(string $url, array $options = [])` | `POST /api/shot` | Create screenshot job |
| `getShotStatus(string $uuid)` | `GET /api/shot/{uuid}` | Get screenshot status |
| `downloadShot(string $uuid)` | `GET /api/shot/{uuid}/download` | Download screenshot file |
| `listShots()` | `GET /api/shot` | List all screenshot jobs |
| `health()` | `GET /api/health` | Health check |
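The remaining methods follow the same pattern; a short sketch using the names from the table above (return shapes mirror the API responses documented in `API_DOCUMENTATION.md`):
```php
use Crawlshot\Laravel\Facades\Crawlshot;

// Service availability and job listings
$health = Crawlshot::health();        // ['status' => 'healthy', ...]
$crawls = Crawlshot::listCrawls();    // paginated crawl jobs
$shots  = Crawlshot::listShots();     // paginated screenshot jobs

// Status of a previously created screenshot job
// ($uuid comes from an earlier createShot()/createCrawl() response)
$status = Crawlshot::getShotStatus($uuid);
```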
### Crawl Options
```php
[
'block_ads' => true, // Block ads using EasyList
'block_trackers' => true, // Block tracking scripts
'timeout' => 30, // Request timeout in seconds
'user_agent' => 'Custom UA', // Custom user agent
'wait_until' => 'networkidle0' // Wait condition
]
```
### Screenshot Options
```php
[
'format' => 'jpg', // jpg, png, webp
'quality' => 90, // 1-100 for jpg/webp
'width' => 1920, // Viewport width
'height' => 1080, // Viewport height
'full_page' => true, // Capture full page
'block_ads' => true, // Block ads
'timeout' => 30 // Request timeout
]
```
## Features
### Core Functionality
- **HTML Crawling** - Extract clean HTML content from web pages
- **Screenshot Capture** - Generate high-quality screenshots (JPG, PNG, WebP)
- **Ad Blocking** - Built-in EasyList integration for ad/tracker blocking
- **Queue Processing** - Async job processing with Laravel Horizon
- **File Management** - Automatic cleanup after 24 hours
### Technical Features
- **Laravel 12** support with PHP 8.3+
- **Puppeteer Integration** via Spatie Browsershot
- **Sanctum Authentication** for API security
- **SQLite Database** with migrations
- **Auto-discovery** for package installation
- **Environment Configuration** via .env variables
## Development
### Requirements
- PHP 8.3+
- Laravel 12.0+
- Node.js with Puppeteer
- SQLite (or other database)
- ImageMagick extension
### Key Dependencies
- `spatie/browsershot` - Browser automation
- `protonlabs/php-adblock-parser` - EasyList parsing
- `laravel/horizon` - Queue monitoring (standalone)
- `laravel/sanctum` - API authentication (standalone)
### File Structure
```
├── app/                            # Laravel application (standalone)
│   ├── Http/Controllers/Api/       # API controllers
│   ├── Jobs/                       # Queue jobs
│   ├── Models/                     # Eloquent models
│   └── Services/                   # Core services
├── src/                            # Package source (both modes)
│   ├── CrawlshotClient.php         # HTTP client (package mode)
│   ├── CrawlshotServiceProvider.php
│   ├── Facades/Crawlshot.php
│   └── config/crawlshot.php
├── routes/api.php                  # API routes (standalone)
├── database/migrations/            # Database schema
└── composer.json                   # Package definition
```
## License
The Laravel framework is open-sourced software licensed under the [MIT license](https://opensource.org/licenses/MIT).
MIT

View File

@@ -0,0 +1,31 @@
<?php
namespace App\Console\Commands;
use App\Models\User;
use Illuminate\Console\Command;
class CreateApiToken extends Command
{
protected $signature = 'crawlshot:create-token {name=API User} {email=api@crawlshot.test}';
protected $description = 'Create an API token for Crawlshot';
public function handle()
{
$name = $this->argument('name');
$email = $this->argument('email');
$user = User::firstOrCreate(['email' => $email], [
'name' => $name,
'password' => bcrypt('password')
]);
$token = $user->createToken('crawlshot-api')->plainTextToken;
$this->info("API Token created successfully!");
$this->line("Token: {$token}");
$this->line("Use this in your Authorization header: Bearer {$token}");
return 0;
}
}

View File

@@ -0,0 +1,100 @@
<?php
namespace App\Console\Commands;
use App\Jobs\CleanupOldResults;
use App\Models\CrawlShotJob;
use Carbon\Carbon;
use Illuminate\Console\Command;
class PruneStorage extends Command
{
/**
* The name and signature of the console command.
*
* @var string
*/
protected $signature = 'crawlshot:prune-storage
{--hours=24 : How many hours old files should be to be pruned}
{--dry-run : Show what would be deleted without actually deleting}';
/**
* The console command description.
*
* @var string
*/
protected $description = 'Prune expired crawlshot HTML and image results older than specified hours (default: 24)';
/**
* Execute the console command.
*/
public function handle()
{
$hours = (int) $this->option('hours');
$dryRun = $this->option('dry-run');
$this->info("Pruning crawlshot storage files older than {$hours} hours...");
if ($dryRun) {
$this->warn('DRY RUN MODE - No files will actually be deleted');
}
// Find jobs older than specified hours
$cutoffTime = Carbon::now()->subHours($hours);
$oldJobs = CrawlShotJob::where('created_at', '<', $cutoffTime)
->whereNotNull('file_path')
->get();
if ($oldJobs->isEmpty()) {
$this->info('No files found to prune.');
return Command::SUCCESS;
}
$this->info("Found {$oldJobs->count()} files to prune:");
$deletedFiles = 0;
$deletedRecords = 0;
$errors = 0;
foreach ($oldJobs as $job) {
$this->line("- {$job->type} job {$job->uuid} ({$job->file_path})");
if (!$dryRun) {
// Delete the file if it exists
if ($job->file_path && file_exists($job->file_path)) {
if (unlink($job->file_path)) {
$deletedFiles++;
} else {
$this->error(" Failed to delete file: {$job->file_path}");
$errors++;
}
}
// Delete the database record
try {
$job->delete();
$deletedRecords++;
} catch (\Exception $e) {
$this->error(" Failed to delete database record: {$e->getMessage()}");
$errors++;
}
}
}
if (!$dryRun) {
$this->info("Cleanup completed:");
$this->line(" - Files deleted: {$deletedFiles}");
$this->line(" - Database records deleted: {$deletedRecords}");
if ($errors > 0) {
$this->error(" - Errors encountered: {$errors}");
return Command::FAILURE;
}
} else {
$this->info("Would have deleted {$oldJobs->count()} files and records");
}
return Command::SUCCESS;
}
}

View File

@@ -0,0 +1,128 @@
<?php
namespace App\Http\Controllers\Api;
use App\Http\Controllers\Controller;
use App\Models\CrawlShotJob;
use App\Jobs\ProcessCrawlShotJob;
use Illuminate\Http\Request;
use Illuminate\Http\JsonResponse;
use Illuminate\Http\Response;
use Illuminate\Support\Str;
use Illuminate\Support\Facades\Storage;
use Illuminate\Validation\Rule;
class CrawlController extends Controller
{
public function crawl(Request $request): JsonResponse
{
$validated = $request->validate([
'url' => 'required|url|max:2048',
'timeout' => 'integer|min:5|max:300',
'delay' => 'integer|min:0|max:30000',
'block_ads' => 'boolean',
'block_cookie_banners' => 'boolean',
'block_trackers' => 'boolean',
'wait_until_network_idle' => 'boolean'
]);
$uuid = Str::uuid()->toString();
$job = CrawlShotJob::create([
'uuid' => $uuid,
'type' => 'crawl',
'url' => $validated['url'],
'status' => 'queued',
'parameters' => array_filter([
'timeout' => $validated['timeout'] ?? 30,
'delay' => $validated['delay'] ?? 0,
'block_ads' => $validated['block_ads'] ?? true,
'block_cookie_banners' => $validated['block_cookie_banners'] ?? true,
'block_trackers' => $validated['block_trackers'] ?? true,
'wait_until_network_idle' => $validated['wait_until_network_idle'] ?? false
])
]);
ProcessCrawlShotJob::dispatch($uuid);
return response()->json([
'uuid' => $uuid,
'status' => 'queued',
'message' => 'Crawl job initiated successfully'
], 201);
}
public function status(string $uuid): JsonResponse
{
$job = CrawlShotJob::where('uuid', $uuid)->first();
if (!$job) {
return response()->json(['error' => 'Job not found'], 404);
}
$response = [
'uuid' => $job->uuid,
'status' => $job->status,
'url' => $job->url,
'created_at' => $job->created_at->toISOString()
];
if ($job->started_at) {
$response['started_at'] = $job->started_at->toISOString();
}
if ($job->completed_at) {
$response['completed_at'] = $job->completed_at->toISOString();
}
if ($job->status === 'completed' && $job->file_path) {
$response['result'] = [
'html' => [
'url' => url("/api/crawl/{$job->uuid}.html"),
'raw' => Storage::get($job->file_path)
]
];
}
if ($job->status === 'failed' && $job->error_message) {
$response['error'] = $job->error_message;
}
return response()->json($response);
}
public function index(): JsonResponse
{
$jobs = CrawlShotJob::where('type', 'crawl')
->orderBy('created_at', 'desc')
->paginate(20);
$response = [
'jobs' => $jobs->items(),
'pagination' => [
'current_page' => $jobs->currentPage(),
'total_pages' => $jobs->lastPage(),
'total_items' => $jobs->total(),
'per_page' => $jobs->perPage()
]
];
return response()->json($response);
}
public function serve(string $uuid): Response
{
$job = CrawlShotJob::where('uuid', $uuid)->where('type', 'crawl')->first();
if (!$job || $job->status !== 'completed') {
return response('HTML file not found or not ready', 404);
}
if (!$job->file_path || !Storage::exists($job->file_path)) {
return response('HTML file not found', 404);
}
return response(Storage::get($job->file_path))
->header('Content-Type', 'text/html; charset=utf-8');
}
}

View File

@@ -0,0 +1,151 @@
<?php
namespace App\Http\Controllers\Api;
use App\Http\Controllers\Controller;
use App\Models\CrawlShotJob;
use App\Jobs\ProcessCrawlShotJob;
use Illuminate\Http\Request;
use Illuminate\Http\JsonResponse;
use Illuminate\Http\Response;
use Illuminate\Support\Str;
use Illuminate\Support\Facades\Storage;
class ShotController extends Controller
{
public function shot(Request $request): JsonResponse
{
$validated = $request->validate([
'url' => 'required|url|max:2048',
'viewport_width' => 'integer|min:320|max:3840',
'viewport_height' => 'integer|min:240|max:2160',
'quality' => 'integer|min:1|max:100',
'timeout' => 'integer|min:5|max:300',
'delay' => 'integer|min:0|max:30000',
'block_ads' => 'boolean',
'block_cookie_banners' => 'boolean',
'block_trackers' => 'boolean'
]);
$uuid = Str::uuid()->toString();
$job = CrawlShotJob::create([
'uuid' => $uuid,
'type' => 'shot',
'url' => $validated['url'],
'status' => 'queued',
'parameters' => array_filter([
'viewport_width' => $validated['viewport_width'] ?? 1920,
'viewport_height' => $validated['viewport_height'] ?? 1080,
'format' => 'webp', // Force WebP for all screenshots
'quality' => $validated['quality'] ?? 90,
'timeout' => $validated['timeout'] ?? 30,
'delay' => $validated['delay'] ?? 0,
'block_ads' => $validated['block_ads'] ?? true,
'block_cookie_banners' => $validated['block_cookie_banners'] ?? true,
'block_trackers' => $validated['block_trackers'] ?? true
])
]);
ProcessCrawlShotJob::dispatch($uuid);
return response()->json([
'uuid' => $uuid,
'status' => 'queued',
'message' => 'Screenshot job initiated successfully'
], 201);
}
public function status(string $uuid): JsonResponse
{
$job = CrawlShotJob::where('uuid', $uuid)->first();
if (!$job) {
return response()->json(['error' => 'Job not found'], 404);
}
$response = [
'uuid' => $job->uuid,
'status' => $job->status,
'url' => $job->url,
'created_at' => $job->created_at->toISOString()
];
if ($job->started_at) {
$response['started_at'] = $job->started_at->toISOString();
}
if ($job->completed_at) {
$response['completed_at'] = $job->completed_at->toISOString();
}
if ($job->status === 'completed' && $job->file_path) {
$imageData = Storage::get($job->file_path);
$response['result'] = [
'image' => [
'url' => url("/api/shot/{$job->uuid}.webp"),
'raw' => base64_encode($imageData),
],
'mime_type' => 'image/webp',
'format' => 'webp',
'width' => $job->parameters['viewport_width'] ?? 1920,
'height' => $job->parameters['viewport_height'] ?? 1080,
'size' => strlen($imageData)
];
}
if ($job->status === 'failed' && $job->error_message) {
$response['error'] = $job->error_message;
}
return response()->json($response);
}
public function serve(string $uuid): Response
{
$job = CrawlShotJob::where('uuid', $uuid)->where('type', 'shot')->first();
if (!$job || $job->status !== 'completed') {
return response('Screenshot not found or not ready', 404);
}
if (!$job->file_path || !Storage::exists($job->file_path)) {
return response('Screenshot file not found', 404);
}
// Always serve as WebP
return response(Storage::get($job->file_path))
->header('Content-Type', 'image/webp');
}
public function index(): JsonResponse
{
$jobs = CrawlShotJob::where('type', 'shot')
->orderBy('created_at', 'desc')
->paginate(20);
$response = [
'jobs' => $jobs->items(),
'pagination' => [
'current_page' => $jobs->currentPage(),
'total_pages' => $jobs->lastPage(),
'total_items' => $jobs->total(),
'per_page' => $jobs->perPage()
]
];
return response()->json($response);
}
private function getMimeType(string $format): string
{
$mimeTypes = [
'jpg' => 'image/jpeg',
'png' => 'image/png',
'webp' => 'image/webp'
];
return $mimeTypes[$format] ?? 'image/webp';
}
}

View File

@@ -0,0 +1,43 @@
<?php
namespace App\Jobs;
use App\Models\CrawlShotJob;
use Illuminate\Bus\Queueable;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Bus\Dispatchable;
use Illuminate\Queue\InteractsWithQueue;
use Illuminate\Queue\SerializesModels;
use Illuminate\Support\Facades\Storage;
use Illuminate\Support\Facades\Log;
class CleanupOldResults implements ShouldQueue
{
use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;
public function handle(): void
{
$cutoffTime = now()->subHours(24);
$oldJobs = CrawlShotJob::where('created_at', '<', $cutoffTime)->get();
$deletedFiles = 0;
$deletedJobs = 0;
foreach ($oldJobs as $job) {
if ($job->file_path && Storage::exists($job->file_path)) {
Storage::delete($job->file_path);
$deletedFiles++;
}
$job->delete();
$deletedJobs++;
}
Log::info("Cleanup completed", [
'deleted_files' => $deletedFiles,
'deleted_jobs' => $deletedJobs,
'cutoff_time' => $cutoffTime->toISOString()
]);
}
}

View File

@@ -0,0 +1,88 @@
<?php
namespace App\Jobs;
use App\Models\CrawlShotJob;
use App\Services\BrowsershotService;
use Illuminate\Bus\Queueable;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Bus\Dispatchable;
use Illuminate\Queue\InteractsWithQueue;
use Illuminate\Queue\SerializesModels;
use Illuminate\Support\Facades\Log;
use Illuminate\Support\Facades\Storage;
class ProcessCrawlShotJob implements ShouldQueue
{
use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;
protected string $jobUuid;
public function __construct(string $jobUuid)
{
$this->jobUuid = $jobUuid;
}
public function handle(): void
{
$job = CrawlShotJob::where('uuid', $this->jobUuid)->first();
if (!$job) {
Log::error("CrawlShotJob not found: {$this->jobUuid}");
return;
}
try {
$job->update([
'status' => 'processing',
'started_at' => now()
]);
$browsershot = new BrowsershotService();
if ($job->type === 'crawl') {
$result = $browsershot->crawlHtml($job->url, $job->parameters ?? []);
$this->saveCrawlResult($job, $result);
} elseif ($job->type === 'shot') {
$result = $browsershot->takeScreenshot($job->url, $job->parameters ?? []);
$this->saveScreenshotResult($job, $result);
}
$job->update([
'status' => 'completed',
'completed_at' => now()
]);
} catch (\Exception $e) {
Log::error("Job {$this->jobUuid} failed: " . $e->getMessage());
$job->update([
'status' => 'failed',
'error_message' => $e->getMessage(),
'completed_at' => now()
]);
}
}
private function saveCrawlResult(CrawlShotJob $job, string $html): void
{
$filename = "{$job->uuid}.html";
$path = "crawlshot/html/{$filename}";
Storage::put($path, $html);
$job->update(['file_path' => $path]);
}
private function saveScreenshotResult(CrawlShotJob $job, array $result): void
{
$parameters = $job->parameters ?? [];
$format = $parameters['format'] ?? 'jpg';
$filename = "{$job->uuid}.{$format}";
$path = "crawlshot/images/{$filename}";
Storage::put($path, $result['data']);
$job->update(['file_path' => $path]);
}
}

View File

@@ -0,0 +1,34 @@
<?php
namespace App\Models;
use Illuminate\Database\Eloquent\Factories\HasFactory;
use Illuminate\Database\Eloquent\Model;
class CrawlShotJob extends Model
{
use HasFactory;
protected $fillable = [
'uuid',
'type',
'url',
'status',
'parameters',
'file_path',
'error_message',
'started_at',
'completed_at'
];
protected $casts = [
'parameters' => 'array',
'started_at' => 'datetime',
'completed_at' => 'datetime'
];
public function getRouteKeyName()
{
return 'uuid';
}
}

View File

@@ -6,11 +6,12 @@
use Illuminate\Database\Eloquent\Factories\HasFactory;
use Illuminate\Foundation\Auth\User as Authenticatable;
use Illuminate\Notifications\Notifiable;
use Laravel\Sanctum\HasApiTokens;
class User extends Authenticatable
{
/** @use HasFactory<\Database\Factories\UserFactory> */
use HasFactory, Notifiable;
use HasFactory, Notifiable, HasApiTokens;
/**
* The attributes that are mass assignable.

View File

@@ -0,0 +1,36 @@
<?php
namespace App\Providers;
use Illuminate\Support\Facades\Gate;
use Laravel\Horizon\Horizon;
use Laravel\Horizon\HorizonApplicationServiceProvider;
class HorizonServiceProvider extends HorizonApplicationServiceProvider
{
/**
* Bootstrap any application services.
*/
public function boot(): void
{
parent::boot();
// Horizon::routeSmsNotificationsTo('15556667777');
// Horizon::routeMailNotificationsTo('example@example.com');
// Horizon::routeSlackNotificationsTo('slack-webhook-url', '#channel');
}
/**
* Register the Horizon gate.
*
* This gate determines who can access Horizon in non-local environments.
*/
protected function gate(): void
{
Gate::define('viewHorizon', function ($user = null) {
return in_array(optional($user)->email, [
//
]);
});
}
}

View File

@@ -0,0 +1,76 @@
<?php
namespace App\Services;
use Spatie\Browsershot\Browsershot;
class BrowsershotService
{
public function crawlHtml(string $url, array $options = []): string
{
$browsershot = $this->configureBrowsershot($url, $options);
return $browsershot->bodyHtml();
}
public function takeScreenshot(string $url, array $options = []): array
{
$browsershot = $this->configureBrowsershot($url, $options);
// Configure viewport for screenshots
$width = $options['viewport_width'] ?? 1920;
$height = $options['viewport_height'] ?? 1080;
$browsershot->windowSize($width, $height);
// Always use WebP format
$quality = $options['quality'] ?? 90;
$browsershot->setScreenshotType('webp', $quality);
$tempPath = storage_path("temp_screenshot_webp." . time() . '.webp');
$browsershot->save($tempPath);
$imageData = file_get_contents($tempPath);
unlink($tempPath);
return [
'data' => $imageData,
'mime_type' => 'image/webp',
'width' => $width,
'height' => $height
];
}
private function configureBrowsershot(string $url, array $options = []): Browsershot
{
$browsershot = Browsershot::url($url);
// Basic configuration
if (isset($options['timeout'])) {
$browsershot->timeout($options['timeout']);
}
if (isset($options['delay'])) {
$browsershot->setDelay($options['delay']);
}
if (isset($options['wait_until_network_idle']) && $options['wait_until_network_idle']) {
$browsershot->waitUntilNetworkIdle();
}
// Apply ad/tracker blocking
if (($options['block_ads'] ?? true) || ($options['block_trackers'] ?? true)) {
$easyListService = new EasyListService();
$blockedDomains = $easyListService->getBlockedDomains($url);
$blockedUrls = $easyListService->getBlockedUrls($url);
if (!empty($blockedDomains)) {
$browsershot->blockDomains($blockedDomains);
}
if (!empty($blockedUrls)) {
$browsershot->blockUrls($blockedUrls);
}
}
return $browsershot;
}
}

View File

@@ -0,0 +1,100 @@
<?php
namespace App\Services;
use Illuminate\Support\Facades\Http;
use Illuminate\Support\Facades\Cache;
use Illuminate\Support\Facades\Log;
class EasyListService
{
private const EASYLIST_URL = 'https://easylist.to/easylist/easylist.txt';
private const CACHE_KEY = 'easylist_filters';
private const CACHE_TTL = 86400; // 24 hours
public function getBlockedDomains(string $url): array
{
$filters = $this->getFilters();
$domains = [];
foreach ($filters as $filter) {
if (strpos($filter, '||') === 0 && strpos($filter, '^') !== false) {
$domain = trim(str_replace(['||', '^'], '', $filter));
if ($this->isValidDomain($domain)) {
$domains[] = $domain;
}
}
}
return array_slice(array_unique($domains), 0, 100); // Limit to 100 domains
}
public function getBlockedUrls(string $url): array
{
$filters = $this->getFilters();
$urls = [];
foreach ($filters as $filter) {
if (strpos($filter, '||') !== 0 && strpos($filter, '#') !== 0 && strpos($filter, '!') !== 0) {
$cleanFilter = trim($filter);
if (strlen($cleanFilter) > 3 && strpos($cleanFilter, '*') !== false) {
$urls[] = str_replace('*', '', $cleanFilter);
}
}
}
return array_slice(array_unique($urls), 0, 50); // Limit to 50 URL patterns
}
private function getFilters(): array
{
return Cache::remember(self::CACHE_KEY, self::CACHE_TTL, function () {
try {
$response = Http::timeout(30)->get(self::EASYLIST_URL);
if ($response->successful()) {
$content = $response->body();
$lines = explode("\n", $content);
$filters = [];
foreach ($lines as $line) {
$line = trim($line);
if (!empty($line) && strpos($line, '!') !== 0) {
$filters[] = $line;
}
}
Log::info('EasyList filters updated', ['count' => count($filters)]);
return $filters;
}
Log::warning('Failed to fetch EasyList filters');
return $this->getFallbackFilters();
} catch (\Exception $e) {
Log::error('Error fetching EasyList filters: ' . $e->getMessage());
return $this->getFallbackFilters();
}
});
}
private function getFallbackFilters(): array
{
return [
'||googletagmanager.com^',
'||google-analytics.com^',
'||facebook.com/tr^',
'||doubleclick.net^',
'||googlesyndication.com^',
'||amazon-adsystem.com^',
'||adsystem.amazon.com^',
'||googleadservices.com^'
];
}
private function isValidDomain(string $domain): bool
{
return filter_var($domain, FILTER_VALIDATE_DOMAIN) !== false;
}
}

View File

@@ -7,9 +7,14 @@
return Application::configure(basePath: dirname(__DIR__))
->withRouting(
web: __DIR__.'/../routes/web.php',
api: __DIR__.'/../routes/api.php',
commands: __DIR__.'/../routes/console.php',
health: '/up',
)
->withProviders([
App\Providers\AppServiceProvider::class,
Crawlshot\Laravel\CrawlshotServiceProvider::class,
])
->withMiddleware(function (Middleware $middleware): void {
//
})

View File

@@ -2,4 +2,5 @@
return [
App\Providers\AppServiceProvider::class,
App\Providers\HorizonServiceProvider::class,
];

View File

@@ -1,16 +1,32 @@
{
"$schema": "https://getcomposer.org/schema.json",
"name": "laravel/laravel",
"type": "project",
"description": "The skeleton application for the Laravel framework.",
"keywords": ["laravel", "framework"],
"name": "crawlshot/laravel",
"type": "library",
"description": "Laravel HTTP client package for Crawlshot API - web crawling and screenshot service",
"keywords": [
"laravel",
"crawler",
"screenshot",
"api-client",
"web-scraping",
"http-client"
],
"license": "MIT",
"require": {
"php": "^8.2",
"laravel/framework": "^12.0",
"laravel/tinker": "^2.10.1"
"php": "^8.3",
"illuminate/support": "^10.0|^11.0|^12.0",
"illuminate/http": "^10.0|^11.0|^12.0",
"illuminate/database": "^10.0|^11.0|^12.0",
"illuminate/queue": "^10.0|^11.0|^12.0",
"protonlabs/php-adblock-parser": "^1.1",
"spatie/browsershot": "^5.0",
"ext-imagick": "*"
},
"require-dev": {
"laravel/framework": "^12.0",
"laravel/horizon": "^5.33",
"laravel/sanctum": "^4.2",
"laravel/tinker": "^2.10.1",
"fakerphp/faker": "^1.23",
"laravel/pail": "^1.2.2",
"laravel/pint": "^1.13",
@@ -18,10 +34,12 @@
"mockery/mockery": "^1.6",
"nunomaduro/collision": "^8.6",
"pestphp/pest": "^3.8",
"pestphp/pest-plugin-laravel": "^3.2"
"pestphp/pest-plugin-laravel": "^3.2",
"orchestra/testbench": "^10.0"
},
"autoload": {
"psr-4": {
"Crawlshot\\Laravel\\": "src/",
"App\\": "app/",
"Database\\Factories\\": "database/factories/",
"Database\\Seeders\\": "database/seeders/"
@@ -34,8 +52,7 @@
},
"scripts": {
"post-autoload-dump": [
"@php artisan config:clear",
"@php artisan clear-compiled",
"Illuminate\\Foundation\\ComposerScripts::postAutoloadDump",
"@php artisan package:discover --ansi"
],
"post-update-cmd": [
@@ -48,19 +65,16 @@
"@php artisan key:generate --ansi",
"@php -r \"file_exists('database/database.sqlite') || touch('database/database.sqlite');\"",
"@php artisan migrate --graceful --ansi"
],
"dev": [
"Composer\\Config::disableProcessTimeout",
"npx concurrently -c \"#93c5fd,#c4b5fd,#fb7185,#fdba74\" \"php artisan serve\" \"php artisan queue:listen --tries=1\" \"php artisan pail --timeout=0\" \"npm run dev\" --names=server,queue,logs,vite"
],
"test": [
"@php artisan config:clear --ansi",
"@php artisan test"
]
},
"extra": {
"laravel": {
"dont-discover": []
"providers": [
"Crawlshot\\Laravel\\CrawlshotServiceProvider"
],
"aliases": {
"Crawlshot": "Crawlshot\\Laravel\\Facades\\Crawlshot"
}
}
},
"config": {

1323
composer.lock generated

File diff suppressed because it is too large.

213
config/horizon.php Normal file
View File

@@ -0,0 +1,213 @@
<?php
use Illuminate\Support\Str;
return [
/*
|--------------------------------------------------------------------------
| Horizon Domain
|--------------------------------------------------------------------------
|
| This is the subdomain where Horizon will be accessible from. If this
| setting is null, Horizon will reside under the same domain as the
| application. Otherwise, this value will serve as the subdomain.
|
*/
'domain' => env('HORIZON_DOMAIN'),
/*
|--------------------------------------------------------------------------
| Horizon Path
|--------------------------------------------------------------------------
|
| This is the URI path where Horizon will be accessible from. Feel free
| to change this path to anything you like. Note that the URI will not
| affect the paths of its internal API that aren't exposed to users.
|
*/
'path' => env('HORIZON_PATH', 'horizon'),
/*
|--------------------------------------------------------------------------
| Horizon Redis Connection
|--------------------------------------------------------------------------
|
| This is the name of the Redis connection where Horizon will store the
| meta information required for it to function. It includes the list
| of supervisors, failed jobs, job metrics, and other information.
|
*/
'use' => 'default',
/*
|--------------------------------------------------------------------------
| Horizon Redis Prefix
|--------------------------------------------------------------------------
|
| This prefix will be used when storing all Horizon data in Redis. You
| may modify the prefix when you are running multiple installations
| of Horizon on the same server so that they don't have problems.
|
*/
'prefix' => env(
'HORIZON_PREFIX',
Str::slug(env('APP_NAME', 'laravel'), '_').'_horizon:'
),
/*
|--------------------------------------------------------------------------
| Horizon Route Middleware
|--------------------------------------------------------------------------
|
| These middleware will get attached onto each Horizon route, giving you
| the chance to add your own middleware to this list or change any of
| the existing middleware. Or, you can simply stick with this list.
|
*/
'middleware' => ['web'],
/*
|--------------------------------------------------------------------------
| Queue Wait Time Thresholds
|--------------------------------------------------------------------------
|
| This option allows you to configure when the LongWaitDetected event
| will be fired. Every connection / queue combination may have its
| own, unique threshold (in seconds) before this event is fired.
|
*/
'waits' => [
'redis:default' => 60,
],
/*
|--------------------------------------------------------------------------
| Job Trimming Times
|--------------------------------------------------------------------------
|
| Here you can configure for how long (in minutes) you desire Horizon to
| persist the recent and failed jobs. Typically, recent jobs are kept
| for one hour while all failed jobs are stored for an entire week.
|
*/
'trim' => [
'recent' => 60,
'pending' => 60,
'completed' => 60,
'recent_failed' => 10080,
'failed' => 10080,
'monitored' => 10080,
],
/*
|--------------------------------------------------------------------------
| Silenced Jobs
|--------------------------------------------------------------------------
|
| Silencing a job will instruct Horizon to not place the job in the list
| of completed jobs within the Horizon dashboard. This setting may be
| used to fully remove any noisy jobs from the completed jobs list.
|
*/
'silenced' => [
// App\Jobs\ExampleJob::class,
],
/*
|--------------------------------------------------------------------------
| Metrics
|--------------------------------------------------------------------------
|
| Here you can configure how many snapshots should be kept to display in
| the metrics graph. This will get used in combination with Horizon's
| `horizon:snapshot` schedule to define how long to retain metrics.
|
*/
'metrics' => [
'trim_snapshots' => [
'job' => 24,
'queue' => 24,
],
],
/*
|--------------------------------------------------------------------------
| Fast Termination
|--------------------------------------------------------------------------
|
| When this option is enabled, Horizon's "terminate" command will not
| wait on all of the workers to terminate unless the --wait option
| is provided. Fast termination can shorten deployment delay by
| allowing a new instance of Horizon to start while the last
| instance will continue to terminate each of its workers.
|
*/
'fast_termination' => false,
/*
|--------------------------------------------------------------------------
| Memory Limit (MB)
|--------------------------------------------------------------------------
|
| This value describes the maximum amount of memory the Horizon master
| supervisor may consume before it is terminated and restarted. For
| configuring these limits on your workers, see the next section.
|
*/
'memory_limit' => 64,
/*
|--------------------------------------------------------------------------
| Queue Worker Configuration
|--------------------------------------------------------------------------
|
| Here you may define the queue worker settings used by your application
| in all environments. These supervisors and settings handle all your
| queued jobs and will be provisioned by Horizon during deployment.
|
*/
'defaults' => [
'supervisor-1' => [
'connection' => 'redis',
'queue' => ['default'],
'balance' => 'auto',
'autoScalingStrategy' => 'time',
'maxProcesses' => 1,
'maxTime' => 0,
'maxJobs' => 0,
'memory' => 128,
'tries' => 1,
'timeout' => 60,
'nice' => 0,
],
],
'environments' => [
'production' => [
'supervisor-1' => [
'maxProcesses' => 10,
'balanceMaxShift' => 1,
'balanceCooldown' => 3,
],
],
'local' => [
'supervisor-1' => [
'maxProcesses' => 3,
],
],
],
];
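
The `defaults` supervisor above runs a single Redis worker on the `default` queue with a 60-second timeout, and the `environments` overrides only change process counts. As a hedged illustration that is not part of this commit, a second supervisor could drain a dedicated queue for long-running Browsershot jobs; the `screenshots` queue name and its limits below are assumptions:

```php
// Hypothetical fragment for config/horizon.php (not in this commit): jobs would
// have to be dispatched with ->onQueue('screenshots') for this supervisor to
// pick them up.
'defaults' => [
    'supervisor-1' => [
        'connection' => 'redis',
        'queue' => ['default'],
        'balance' => 'auto',
    ],
    'supervisor-screenshots' => [
        'connection' => 'redis',
        'queue' => ['screenshots'],
        'balance' => 'auto',
        'maxProcesses' => 2,
        'memory' => 256,
        'timeout' => 120, // Browsershot captures can exceed the 60s default above
        'tries' => 1,
    ],
],
```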

config/sanctum.php (new file, 84 lines)
@@ -0,0 +1,84 @@
<?php
use Laravel\Sanctum\Sanctum;
return [
/*
|--------------------------------------------------------------------------
| Stateful Domains
|--------------------------------------------------------------------------
|
| Requests from the following domains / hosts will receive stateful API
| authentication cookies. Typically, these should include your local
| and production domains which access your API via a frontend SPA.
|
*/
'stateful' => explode(',', env('SANCTUM_STATEFUL_DOMAINS', sprintf(
'%s%s',
'localhost,localhost:3000,127.0.0.1,127.0.0.1:8000,::1',
Sanctum::currentApplicationUrlWithPort(),
// Sanctum::currentRequestHost(),
))),
/*
|--------------------------------------------------------------------------
| Sanctum Guards
|--------------------------------------------------------------------------
|
| This array contains the authentication guards that will be checked when
| Sanctum is trying to authenticate a request. If none of these guards
| are able to authenticate the request, Sanctum will use the bearer
| token that's present on an incoming request for authentication.
|
*/
'guard' => ['web'],
/*
|--------------------------------------------------------------------------
| Expiration Minutes
|--------------------------------------------------------------------------
|
| This value controls the number of minutes until an issued token will be
| considered expired. This will override any values set in the token's
| "expires_at" attribute, but first-party sessions are not affected.
|
*/
'expiration' => null,
/*
|--------------------------------------------------------------------------
| Token Prefix
|--------------------------------------------------------------------------
|
| Sanctum can prefix new tokens in order to take advantage of numerous
| security scanning initiatives maintained by open source platforms
| that notify developers if they commit tokens into repositories.
|
| See: https://docs.github.com/en/code-security/secret-scanning/about-secret-scanning
|
*/
'token_prefix' => env('SANCTUM_TOKEN_PREFIX', ''),
/*
|--------------------------------------------------------------------------
| Sanctum Middleware
|--------------------------------------------------------------------------
|
| When authenticating your first-party SPA with Sanctum you may need to
| customize some of the middleware Sanctum uses while processing the
| request. You may change the middleware listed below as required.
|
*/
'middleware' => [
'authenticate_session' => Laravel\Sanctum\Http\Middleware\AuthenticateSession::class,
'encrypt_cookies' => Illuminate\Cookie\Middleware\EncryptCookies::class,
'validate_csrf_token' => Illuminate\Foundation\Http\Middleware\ValidateCsrfToken::class,
],
];
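
Tokens for this service are ordinary Sanctum personal access tokens. A minimal sketch of issuing one (for example from `php artisan tinker`), matching the `UserSeeder` added later in this commit; with `expiration` left at `null` the token never expires unless a per-token `expires_at` is set, and `SANCTUM_TOKEN_PREFIX`, when configured, is prepended to the plain-text value:

```php
<?php

use App\Models\User;

// Issue a bearer token for the Crawlshot API; the plain-text value is only
// available at creation time, so print or store it immediately.
$user = User::where('email', 'api@crawlshot.test')->firstOrFail();

$plainTextToken = $user->createToken('crawlshot-api')->plainTextToken;

// Send it as: Authorization: Bearer <token>
echo $plainTextToken, PHP_EOL;
```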

@@ -0,0 +1,598 @@
{
"info": {
"name": "Crawlshot API",
"description": "Complete API collection for Crawlshot - A self-hosted web crawling and screenshot service built with Laravel and Spatie Browsershot",
"version": "1.0.0",
"schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json"
},
"auth": {
"type": "bearer",
"bearer": [
{
"key": "token",
"value": "{{api_token}}",
"type": "string"
}
]
},
"variable": [
{
"key": "base_url",
"value": "https://crawlshot.test",
"type": "string"
},
{
"key": "api_token",
"value": "1|rrWUM5ZkmLfGipkm1oIusYX45KbukIekUwMjgB3Nd1121a5c",
"type": "string"
}
],
"item": [
{
"name": "Health Check",
"request": {
"method": "GET",
"header": [
{
"key": "Accept",
"value": "application/json",
"type": "text"
}
],
"url": {
"raw": "{{base_url}}/api/health",
"host": [
"{{base_url}}"
],
"path": [
"api",
"health"
]
},
"description": "Health check endpoint to verify the service is running. No authentication required."
},
"response": [
{
"name": "Success",
"originalRequest": {
"method": "GET",
"header": [
{
"key": "Accept",
"value": "application/json"
}
],
"url": {
"raw": "{{base_url}}/api/health",
"host": [
"{{base_url}}"
],
"path": [
"api",
"health"
]
}
},
"status": "OK",
"code": 200,
"_postman_previewlanguage": "json",
"header": [],
"cookie": [],
"body": "{\n \"status\": \"healthy\",\n \"timestamp\": \"2025-08-10T09:54:52.195383Z\",\n \"service\": \"crawlshot\"\n}"
}
]
},
{
"name": "Web Crawling",
"item": [
{
"name": "Initiate Crawl Job",
"request": {
"method": "POST",
"header": [
{
"key": "Accept",
"value": "application/json",
"type": "text"
},
{
"key": "Content-Type",
"value": "application/json",
"type": "text"
}
],
"body": {
"mode": "raw",
"raw": "{\n \"url\": \"https://example.com\",\n \"timeout\": 30,\n \"delay\": 2000,\n \"block_ads\": true,\n \"block_cookie_banners\": true,\n \"block_trackers\": true,\n \"wait_until_network_idle\": true\n}",
"options": {
"raw": {
"language": "json"
}
}
},
"url": {
"raw": "{{base_url}}/api/crawl",
"host": [
"{{base_url}}"
],
"path": [
"api",
"crawl"
]
},
"description": "Initiate a web crawling job to extract HTML content from a URL.\n\n**Parameters:**\n- `url` (required): Target URL to crawl\n- `timeout` (optional): Request timeout in seconds (5-300)\n- `delay` (optional): Wait time before capture in milliseconds (0-30000)\n- `block_ads` (optional): Block ads using EasyList filters (default: true)\n- `block_cookie_banners` (optional): Block cookie banners (default: true)\n- `block_trackers` (optional): Block tracking scripts (default: true)\n- `wait_until_network_idle` (optional): Wait for network activity to cease (default: false)"
},
"response": [
{
"name": "Success",
"originalRequest": {
"method": "POST",
"header": [
{
"key": "Accept",
"value": "application/json"
},
{
"key": "Content-Type",
"value": "application/json"
}
],
"body": {
"mode": "raw",
"raw": "{\n \"url\": \"https://example.com\",\n \"timeout\": 30,\n \"block_ads\": true\n}",
"options": {
"raw": {
"language": "json"
}
}
},
"url": {
"raw": "{{base_url}}/api/crawl",
"host": [
"{{base_url}}"
],
"path": [
"api",
"crawl"
]
}
},
"status": "Created",
"code": 201,
"_postman_previewlanguage": "json",
"header": [],
"cookie": [],
"body": "{\n \"uuid\": \"b5dc483b-f62d-4e40-8b9e-4715324a8cbb\",\n \"status\": \"queued\",\n \"message\": \"Crawl job initiated successfully\"\n}"
}
]
},
{
"name": "Get Crawl Status & Results",
"request": {
"method": "GET",
"header": [
{
"key": "Accept",
"value": "application/json",
"type": "text"
}
],
"url": {
"raw": "{{base_url}}/api/crawl/:uuid",
"host": [
"{{base_url}}"
],
"path": [
"api",
"crawl",
":uuid"
],
"variable": [
{
"key": "uuid",
"value": "b5dc483b-f62d-4e40-8b9e-4715324a8cbb",
"description": "Job UUID returned from crawl initiation"
}
]
},
"description": "Check the status and retrieve results of a crawl job. When completed, returns the full HTML content."
},
"response": [
{
"name": "Queued",
"originalRequest": {
"method": "GET",
"header": [
{
"key": "Accept",
"value": "application/json"
}
],
"url": {
"raw": "{{base_url}}/api/crawl/b5dc483b-f62d-4e40-8b9e-4715324a8cbb",
"host": [
"{{base_url}}"
],
"path": [
"api",
"crawl",
"b5dc483b-f62d-4e40-8b9e-4715324a8cbb"
]
}
},
"status": "OK",
"code": 200,
"_postman_previewlanguage": "json",
"header": [],
"cookie": [],
"body": "{\n \"uuid\": \"b5dc483b-f62d-4e40-8b9e-4715324a8cbb\",\n \"status\": \"queued\",\n \"url\": \"https://example.com\",\n \"created_at\": \"2025-08-10T10:00:42.000000Z\"\n}"
},
{
"name": "Processing",
"originalRequest": {
"method": "GET",
"header": [
{
"key": "Accept",
"value": "application/json"
}
],
"url": {
"raw": "{{base_url}}/api/crawl/b5dc483b-f62d-4e40-8b9e-4715324a8cbb",
"host": [
"{{base_url}}"
],
"path": [
"api",
"crawl",
"b5dc483b-f62d-4e40-8b9e-4715324a8cbb"
]
}
},
"status": "OK",
"code": 200,
"_postman_previewlanguage": "json",
"header": [],
"cookie": [],
"body": "{\n \"uuid\": \"b5dc483b-f62d-4e40-8b9e-4715324a8cbb\",\n \"status\": \"processing\",\n \"url\": \"https://example.com\",\n \"created_at\": \"2025-08-10T10:00:42.000000Z\",\n \"started_at\": \"2025-08-10T10:00:45.000000Z\"\n}"
},
{
"name": "Completed",
"originalRequest": {
"method": "GET",
"header": [
{
"key": "Accept",
"value": "application/json"
}
],
"url": {
"raw": "{{base_url}}/api/crawl/b5dc483b-f62d-4e40-8b9e-4715324a8cbb",
"host": [
"{{base_url}}"
],
"path": [
"api",
"crawl",
"b5dc483b-f62d-4e40-8b9e-4715324a8cbb"
]
}
},
"status": "OK",
"code": 200,
"_postman_previewlanguage": "json",
"header": [],
"cookie": [],
"body": "{\n \"uuid\": \"b5dc483b-f62d-4e40-8b9e-4715324a8cbb\",\n \"status\": \"completed\",\n \"url\": \"https://example.com\",\n \"created_at\": \"2025-08-10T10:00:42.000000Z\",\n \"started_at\": \"2025-08-10T10:00:45.000000Z\",\n \"completed_at\": \"2025-08-10T10:01:12.000000Z\",\n \"result\": \"<!doctype html>\\n<html>\\n<head>\\n <title>Example Domain</title>\\n</head>\\n<body>\\n <h1>Example Domain</h1>\\n <p>This domain is for use in illustrative examples...</p>\\n</body>\\n</html>\"\n}"
},
{
"name": "Failed",
"originalRequest": {
"method": "GET",
"header": [
{
"key": "Accept",
"value": "application/json"
}
],
"url": {
"raw": "{{base_url}}/api/crawl/b5dc483b-f62d-4e40-8b9e-4715324a8cbb",
"host": [
"{{base_url}}"
],
"path": [
"api",
"crawl",
"b5dc483b-f62d-4e40-8b9e-4715324a8cbb"
]
}
},
"status": "OK",
"code": 200,
"_postman_previewlanguage": "json",
"header": [],
"cookie": [],
"body": "{\n \"uuid\": \"b5dc483b-f62d-4e40-8b9e-4715324a8cbb\",\n \"status\": \"failed\",\n \"url\": \"https://example.com\",\n \"created_at\": \"2025-08-10T10:00:42.000000Z\",\n \"started_at\": \"2025-08-10T10:00:45.000000Z\",\n \"completed_at\": \"2025-08-10T10:00:50.000000Z\",\n \"error\": \"Timeout: Navigation failed after 30 seconds\"\n}"
}
]
},
{
"name": "List Crawl Jobs",
"request": {
"method": "GET",
"header": [
{
"key": "Accept",
"value": "application/json",
"type": "text"
}
],
"url": {
"raw": "{{base_url}}/api/crawl",
"host": [
"{{base_url}}"
],
"path": [
"api",
"crawl"
]
},
"description": "List all crawl jobs with pagination. Optional endpoint for debugging and monitoring."
},
"response": []
}
]
},
{
"name": "Screenshots",
"item": [
{
"name": "Initiate Screenshot Job",
"request": {
"method": "POST",
"header": [
{
"key": "Accept",
"value": "application/json",
"type": "text"
},
{
"key": "Content-Type",
"value": "application/json",
"type": "text"
}
],
"body": {
"mode": "raw",
"raw": "{\n \"url\": \"https://example.com\",\n \"viewport_width\": 1920,\n \"viewport_height\": 1080,\n \"format\": \"webp\",\n \"quality\": 90,\n \"timeout\": 30,\n \"delay\": 2000,\n \"block_ads\": true,\n \"block_cookie_banners\": true,\n \"block_trackers\": true\n}",
"options": {
"raw": {
"language": "json"
}
}
},
"url": {
"raw": "{{base_url}}/api/shot",
"host": [
"{{base_url}}"
],
"path": [
"api",
"shot"
]
},
"description": "Initiate a screenshot job to capture an image of a webpage.\n\n**Parameters:**\n- `url` (required): Target URL to screenshot\n- `viewport_width` (optional): Viewport width in pixels (320-3840, default: 1920)\n- `viewport_height` (optional): Viewport height in pixels (240-2160, default: 1080)\n- `format` (optional): Image format - jpg, png, webp (default: jpg)\n- `quality` (optional): Image quality 1-100 (default: 90)\n- `timeout` (optional): Request timeout in seconds (5-300)\n- `delay` (optional): Wait time before capture in milliseconds (0-30000)\n- `block_ads` (optional): Block ads using EasyList filters (default: true)\n- `block_cookie_banners` (optional): Block cookie banners (default: true)\n- `block_trackers` (optional): Block tracking scripts (default: true)"
},
"response": [
{
"name": "Success",
"originalRequest": {
"method": "POST",
"header": [
{
"key": "Accept",
"value": "application/json"
},
{
"key": "Content-Type",
"value": "application/json"
}
],
"body": {
"mode": "raw",
"raw": "{\n \"url\": \"https://example.com\",\n \"viewport_width\": 1280,\n \"viewport_height\": 720,\n \"format\": \"webp\",\n \"block_ads\": true\n}",
"options": {
"raw": {
"language": "json"
}
}
},
"url": {
"raw": "{{base_url}}/api/shot",
"host": [
"{{base_url}}"
],
"path": [
"api",
"shot"
]
}
},
"status": "Created",
"code": 201,
"_postman_previewlanguage": "json",
"header": [],
"cookie": [],
"body": "{\n \"uuid\": \"fe37d511-99cb-4295-853b-6d484900a851\",\n \"status\": \"queued\",\n \"message\": \"Screenshot job initiated successfully\"\n}"
}
]
},
{
"name": "Get Screenshot Status & Results",
"request": {
"method": "GET",
"header": [
{
"key": "Accept",
"value": "application/json",
"type": "text"
}
],
"url": {
"raw": "{{base_url}}/api/shot/:uuid",
"host": [
"{{base_url}}"
],
"path": [
"api",
"shot",
":uuid"
],
"variable": [
{
"key": "uuid",
"value": "fe37d511-99cb-4295-853b-6d484900a851",
"description": "Job UUID returned from screenshot initiation"
}
]
},
"description": "Check the status and retrieve results of a screenshot job. When completed, returns base64 image data and download URL."
},
"response": [
{
"name": "Queued",
"originalRequest": {
"method": "GET",
"header": [
{
"key": "Accept",
"value": "application/json"
}
],
"url": {
"raw": "{{base_url}}/api/shot/fe37d511-99cb-4295-853b-6d484900a851",
"host": [
"{{base_url}}"
],
"path": [
"api",
"shot",
"fe37d511-99cb-4295-853b-6d484900a851"
]
}
},
"status": "OK",
"code": 200,
"_postman_previewlanguage": "json",
"header": [],
"cookie": [],
"body": "{\n \"uuid\": \"fe37d511-99cb-4295-853b-6d484900a851\",\n \"status\": \"queued\",\n \"url\": \"https://example.com\",\n \"created_at\": \"2025-08-10T10:05:42.000000Z\"\n}"
},
{
"name": "Completed",
"originalRequest": {
"method": "GET",
"header": [
{
"key": "Accept",
"value": "application/json"
}
],
"url": {
"raw": "{{base_url}}/api/shot/fe37d511-99cb-4295-853b-6d484900a851",
"host": [
"{{base_url}}"
],
"path": [
"api",
"shot",
"fe37d511-99cb-4295-853b-6d484900a851"
]
}
},
"status": "OK",
"code": 200,
"_postman_previewlanguage": "json",
"header": [],
"cookie": [],
"body": "{\n \"uuid\": \"fe37d511-99cb-4295-853b-6d484900a851\",\n \"status\": \"completed\",\n \"url\": \"https://example.com\",\n \"created_at\": \"2025-08-10T10:05:42.000000Z\",\n \"started_at\": \"2025-08-10T10:05:45.000000Z\",\n \"completed_at\": \"2025-08-10T10:06:12.000000Z\",\n \"result\": {\n \"image_data\": \"iVBORw0KGgoAAAANSUhEUgAAAHgAAAAyCAYAAACXpx/Y...\",\n \"download_url\": \"https://crawlshot.test/api/shot/fe37d511-99cb-4295-853b-6d484900a851/download\",\n \"mime_type\": \"image/webp\",\n \"format\": \"webp\",\n \"width\": 1920,\n \"height\": 1080,\n \"size\": 45678\n }\n}"
}
]
},
{
"name": "Download Screenshot File",
"request": {
"method": "GET",
"header": [],
"url": {
"raw": "{{base_url}}/api/shot/:uuid/download",
"host": [
"{{base_url}}"
],
"path": [
"api",
"shot",
":uuid",
"download"
],
"variable": [
{
"key": "uuid",
"value": "fe37d511-99cb-4295-853b-6d484900a851",
"description": "Job UUID"
}
]
},
"description": "Download the screenshot file directly. This endpoint returns the actual image file with appropriate headers for downloading."
},
"response": []
},
{
"name": "List Screenshot Jobs",
"request": {
"method": "GET",
"header": [
{
"key": "Accept",
"value": "application/json",
"type": "text"
}
],
"url": {
"raw": "{{base_url}}/api/shot",
"host": [
"{{base_url}}"
],
"path": [
"api",
"shot"
]
},
"description": "List all screenshot jobs with pagination. Optional endpoint for debugging and monitoring."
},
"response": []
}
]
}
],
"event": [
{
"listen": "prerequest",
"script": {
"type": "text/javascript",
"exec": [
""
]
}
},
{
"listen": "test",
"script": {
"type": "text/javascript",
"exec": [
""
]
}
}
]
}
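
The example responses above trace the full job lifecycle (`queued` → `processing` → `completed` or `failed`). A rough client-side polling sketch against the same endpoints using Laravel's HTTP client; the two-second interval and the `CRAWLSHOT_TOKEN` variable are assumptions:

```php
<?php

use Illuminate\Support\Facades\Http;

$base  = 'https://crawlshot.test';
$token = env('CRAWLSHOT_TOKEN'); // assumed to hold a valid Sanctum token

// Queue a crawl job, then poll its status until it settles.
$uuid = Http::withToken($token)
    ->post("{$base}/api/crawl", ['url' => 'https://example.com'])
    ->json('uuid');

do {
    sleep(2); // arbitrary polling interval
    $job = Http::withToken($token)->get("{$base}/api/crawl/{$uuid}")->json();
} while (in_array($job['status'], ['queued', 'processing'], true));

// On success 'result' holds the captured HTML; on failure 'error' explains why.
$html = $job['status'] === 'completed' ? $job['result'] : null;
```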

@@ -0,0 +1,33 @@
<?php
use Illuminate\Database\Migrations\Migration;
use Illuminate\Database\Schema\Blueprint;
use Illuminate\Support\Facades\Schema;
return new class extends Migration
{
/**
* Run the migrations.
*/
public function up(): void
{
Schema::create('personal_access_tokens', function (Blueprint $table) {
$table->id();
$table->morphs('tokenable');
$table->text('name');
$table->string('token', 64)->unique();
$table->text('abilities')->nullable();
$table->timestamp('last_used_at')->nullable();
$table->timestamp('expires_at')->nullable()->index();
$table->timestamps();
});
}
/**
* Reverse the migrations.
*/
public function down(): void
{
Schema::dropIfExists('personal_access_tokens');
}
};

@@ -0,0 +1,36 @@
<?php
use Illuminate\Database\Migrations\Migration;
use Illuminate\Database\Schema\Blueprint;
use Illuminate\Support\Facades\Schema;
return new class extends Migration
{
/**
* Run the migrations.
*/
public function up(): void
{
Schema::create('crawl_shot_jobs', function (Blueprint $table) {
$table->id();
$table->uuid('uuid')->unique();
$table->string('type'); // 'crawl' or 'shot'
$table->string('url');
$table->string('status')->default('queued'); // queued, processing, completed, failed
$table->json('parameters')->nullable(); // viewport_width, viewport_height, format, timeout, etc.
$table->string('file_path')->nullable(); // path to saved result
$table->text('error_message')->nullable();
$table->timestamp('started_at')->nullable();
$table->timestamp('completed_at')->nullable();
$table->timestamps();
});
}
/**
* Reverse the migrations.
*/
public function down(): void
{
Schema::dropIfExists('crawl_shot_jobs');
}
};
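
The migration gives both job types one shared table: a JSON `parameters` column for the capture options and nullable timestamps for the lifecycle. The matching Eloquent model is not among the files shown here, so the following is only a plausible sketch; the class name, namespace, and casts are assumptions:

```php
<?php

namespace App\Models;

use Illuminate\Database\Eloquent\Model;

// Hypothetical model for the crawl_shot_jobs table defined above.
class CrawlShotJob extends Model
{
    protected $fillable = [
        'uuid', 'type', 'url', 'status', 'parameters',
        'file_path', 'error_message', 'started_at', 'completed_at',
    ];

    protected $casts = [
        'parameters'   => 'array',    // stored as JSON
        'started_at'   => 'datetime',
        'completed_at' => 'datetime',
    ];
}
```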

@@ -13,11 +13,8 @@ class DatabaseSeeder extends Seeder
      */
     public function run(): void
     {
-        // User::factory(10)->create();
-
-        User::factory()->create([
-            'name' => 'Test User',
-            'email' => 'test@example.com',
+        $this->call([
+            UserSeeder::class,
         ]);
     }
 }

@@ -0,0 +1,29 @@
<?php
namespace Database\Seeders;
use App\Models\User;
use Illuminate\Database\Seeder;
class UserSeeder extends Seeder
{
public function run(): void
{
if (User::exists()) {
$this->command->error("Users already exist! This seeder can only be run once.");
return;
}
$user = User::create([
'name' => 'Crawlshot API User',
'email' => 'api@crawlshot.test',
'password' => bcrypt('password')
]);
$token = $user->createToken('crawlshot-api')->plainTextToken;
$this->command->info("User created: {$user->email}");
$this->command->info("API Token: {$token}");
$this->command->line("Use this token in Authorization header: Bearer {$token}");
}
}
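
The plain-text token is shown only once (the `token` column stores a hash of it), so capture the seeder output. Programmatic equivalent of `php artisan db:seed --class=UserSeeder`:

```php
<?php

use Illuminate\Support\Facades\Artisan;

// Run the one-time seeder and capture its console output, which contains
// the freshly issued API token.
Artisan::call('db:seed', ['--class' => 'UserSeeder']);

echo Artisan::output();
```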

package-lock.json (generated new file, 3298 lines): diff suppressed because it is too large.
@@ -13,5 +13,8 @@
"laravel-vite-plugin": "^2.0.0",
"tailwindcss": "^4.0.0",
"vite": "^7.0.4"
},
"dependencies": {
"puppeteer": "^24.16.0"
}
}

routes/api.php (new file, 49 lines)
@@ -0,0 +1,49 @@
<?php
use App\Http\Controllers\Api\CrawlController;
use App\Http\Controllers\Api\ShotController;
use Illuminate\Http\Request;
use Illuminate\Support\Facades\Route;
/*
|--------------------------------------------------------------------------
| API Routes
|--------------------------------------------------------------------------
|
| Here is where you can register API routes for your application. These
| routes are loaded by the RouteServiceProvider and all of them will
| be assigned to the "api" middleware group. Make something great!
|
*/
// File serving endpoints (no auth required for direct file access)
Route::get('crawl/{uuid}.html', [CrawlController::class, 'serve'])->name('api.crawl.serve');
Route::get('shot/{uuid}.webp', [ShotController::class, 'serve'])->name('api.shot.serve');
// All other API routes require Sanctum authentication
Route::middleware(['auth:sanctum'])->group(function () {
// Crawl endpoints
Route::prefix('crawl')->group(function () {
Route::post('/', [CrawlController::class, 'crawl'])->name('api.crawl.create');
Route::get('/{uuid}', [CrawlController::class, 'status'])->name('api.crawl.status');
Route::get('/', [CrawlController::class, 'index'])->name('api.crawl.index'); // Optional: list all crawl jobs
});
// Screenshot endpoints
Route::prefix('shot')->group(function () {
Route::post('/', [ShotController::class, 'shot'])->name('api.shot.create');
Route::get('/{uuid}', [ShotController::class, 'status'])->name('api.shot.status');
Route::get('/', [ShotController::class, 'index'])->name('api.shot.index'); // Optional: list all screenshot jobs
});
});
// Health check endpoint (no auth required)
Route::get('/health', function () {
return response()->json([
'status' => 'healthy',
'timestamp' => now()->toISOString(),
'service' => 'crawlshot'
]);
})->name('api.health');
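
Note that the two unauthenticated serve routes use extension-style URLs (`/api/crawl/{uuid}.html`, `/api/shot/{uuid}.webp`) rather than the `/download` path referenced in the Postman collection above. A quick sketch of pulling a finished screenshot through the public serve route; the UUID is the example value from the collection:

```php
<?php

use Illuminate\Support\Facades\Http;

// The serve routes sit outside the auth:sanctum group, so no bearer token is needed.
$uuid = 'fe37d511-99cb-4295-853b-6d484900a851'; // example UUID from the collection

$response = Http::get("https://crawlshot.test/api/shot/{$uuid}.webp");

if ($response->successful()) {
    file_put_contents(storage_path("app/{$uuid}.webp"), $response->body());
}
```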

src/CrawlshotClient.php (new file, 111 lines)
@@ -0,0 +1,111 @@
<?php
namespace Crawlshot\Laravel;
use Illuminate\Support\Facades\Http;
class CrawlshotClient
{
protected string $baseUrl;
protected ?string $token;
public function __construct(string $baseUrl, ?string $token = null)
{
$this->baseUrl = rtrim($baseUrl, '/');
$this->token = $token;
}
/**
* POST /api/crawl - Create crawl job
*/
public function createCrawl(string $url, array $options = []): array
{
return $this->makeRequest('POST', '/api/crawl', array_merge(['url' => $url], $options));
}
/**
* GET /api/crawl/{uuid} - Get crawl status
*/
public function getCrawlStatus(string $uuid): array
{
return $this->makeRequest('GET', "/api/crawl/{$uuid}");
}
/**
* GET /api/crawl - List all crawl jobs
*/
public function listCrawls(): array
{
return $this->makeRequest('GET', '/api/crawl');
}
/**
* POST /api/shot - Create screenshot job
*/
public function createShot(string $url, array $options = []): array
{
return $this->makeRequest('POST', '/api/shot', array_merge(['url' => $url], $options));
}
/**
* GET /api/shot/{uuid} - Get screenshot status
*/
public function getShotStatus(string $uuid): array
{
return $this->makeRequest('GET', "/api/shot/{$uuid}");
}
/**
* GET /api/shot/{uuid}/download - Download screenshot file
*/
public function downloadShot(string $uuid): string
{
$response = Http::when($this->token, function ($http) {
return $http->withToken($this->token);
})->get($this->baseUrl . "/api/shot/{$uuid}/download");
if ($response->failed()) {
throw new \Exception("Failed to download screenshot: " . $response->body());
}
return $response->body();
}
/**
* GET /api/shot - List all screenshot jobs
*/
public function listShots(): array
{
return $this->makeRequest('GET', '/api/shot');
}
/**
* GET /api/health - Health check
*/
public function health(): array
{
return $this->makeRequest('GET', '/api/health', [], false);
}
/**
* Make HTTP request to API
*/
protected function makeRequest(string $method, string $endpoint, array $data = [], bool $requiresAuth = true): array
{
$http = Http::when($requiresAuth && $this->token, function ($http) {
return $http->withToken($this->token);
});
$response = match (strtoupper($method)) {
'GET' => $http->get($this->baseUrl . $endpoint),
'POST' => $http->post($this->baseUrl . $endpoint, $data),
default => throw new \InvalidArgumentException("Unsupported HTTP method: {$method}")
};
if ($response->failed()) {
throw new \Exception("API request failed: " . $response->body());
}
return $response->json();
}
}
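
A minimal usage sketch of `CrawlshotClient` as defined above, constructed directly; the base URL and token lookup are illustrative:

```php
<?php

use Crawlshot\Laravel\CrawlshotClient;

$client = new CrawlshotClient('https://crawlshot.test', env('CRAWLSHOT_TOKEN'));

// Unauthenticated health probe.
$health = $client->health();

// Queue a screenshot job and check on it later.
$job    = $client->createShot('https://example.com', ['format' => 'webp']);
$status = $client->getShotStatus($job['uuid']);

// Once completed, the raw image bytes can be pulled down.
if (($status['status'] ?? null) === 'completed') {
    $bytes = $client->downloadShot($job['uuid']);
}
```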

@@ -0,0 +1,39 @@
<?php
namespace Crawlshot\Laravel;
use Illuminate\Support\ServiceProvider;
class CrawlshotServiceProvider extends ServiceProvider
{
/**
* Register any application services.
*/
public function register(): void
{
// Merge package configuration
$this->mergeConfigFrom(__DIR__ . '/config/crawlshot.php', 'crawlshot');
// Register the client
$this->app->singleton('crawlshot', function ($app) {
return new CrawlshotClient(
$app['config']['crawlshot']['base_url'],
$app['config']['crawlshot']['token']
);
});
// Register facade alias
$this->app->alias('crawlshot', CrawlshotClient::class);
}
/**
* Bootstrap any application services.
*/
public function boot(): void
{
// Publish configuration
$this->publishes([
__DIR__ . '/config/crawlshot.php' => config_path('crawlshot.php'),
], 'crawlshot-config');
}
}
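
The provider merges the package config, binds the `crawlshot` singleton, and aliases it to the class name; the config file can be copied into the host application with `php artisan vendor:publish --tag=crawlshot-config`. Resolving the client from the container then looks like this:

```php
<?php

use Crawlshot\Laravel\CrawlshotClient;

// Both calls return the same singleton registered in register().
$client     = app('crawlshot');
$sameClient = app(CrawlshotClient::class); // via the alias

$jobs = $client->listCrawls();
```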

src/Facades/Crawlshot.php (new file, 28 lines)
@@ -0,0 +1,28 @@
<?php
namespace Crawlshot\Laravel\Facades;
use Illuminate\Support\Facades\Facade;
/**
* @method static array createCrawl(string $url, array $options = [])
* @method static array getCrawlStatus(string $uuid)
* @method static array listCrawls()
* @method static array createShot(string $url, array $options = [])
* @method static array getShotStatus(string $uuid)
* @method static string downloadShot(string $uuid)
* @method static array listShots()
* @method static array health()
*
* @see \Crawlshot\Laravel\CrawlshotClient
*/
class Crawlshot extends Facade
{
/**
* Get the registered name of the component.
*/
protected static function getFacadeAccessor(): string
{
return 'crawlshot';
}
}
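
With the facade registered through the `extra.laravel.aliases` entry in composer.json, the same client can be called statically; the URL and option below are illustrative:

```php
<?php

use Crawlshot\Laravel\Facades\Crawlshot;

// Facade calls proxy to the CrawlshotClient methods listed in the @method block.
$crawl  = Crawlshot::createCrawl('https://example.com', ['block_ads' => true]);
$status = Crawlshot::getCrawlStatus($crawl['uuid']);
```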

src/config/crawlshot.php (new file, 32 lines)
@@ -0,0 +1,32 @@
<?php
return [
/*
|--------------------------------------------------------------------------
| Crawlshot API Configuration
|--------------------------------------------------------------------------
|
| Configuration for the Crawlshot API client package.
|
*/
/*
|--------------------------------------------------------------------------
| Base URL
|--------------------------------------------------------------------------
|
| The base URL of your Crawlshot API service.
|
*/
'base_url' => env('CRAWLSHOT_BASE_URL', 'https://crawlshot.test'),
/*
|--------------------------------------------------------------------------
| Authentication Token
|--------------------------------------------------------------------------
|
| Your Sanctum authentication token for the Crawlshot API.
|
*/
'token' => env('CRAWLSHOT_TOKEN'),
];
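
Both values are read from the environment (`CRAWLSHOT_BASE_URL`, `CRAWLSHOT_TOKEN`) and handed to `CrawlshotClient` by the service provider; once the package is installed they are available like any other config:

```php
<?php

// Defaults to https://crawlshot.test until CRAWLSHOT_BASE_URL overrides it.
$baseUrl = config('crawlshot.base_url');

// Null until CRAWLSHOT_TOKEN is set in the consuming application's .env.
$token = config('crawlshot.token');
```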