Files
echoscoop/app/Jobs/Tasks/ScrapeUrlBodyTask.php
Charles Teh c53918d03b Add (scraper)
Update (ai): integrate scraper
2023-09-25 19:39:13 +08:00

73 lines
1.5 KiB
PHP

<?php
namespace App\Jobs\Tasks;
use App\Helpers\FirstParty\OSSUploader\OSSUploader;
use \Illuminate\Support\Facades\Http;
use Carbon\Carbon;
use Storage;
use Exception;
use andreskrey\Readability\Readability;
use andreskrey\Readability\Configuration;
use andreskrey\Readability\ParseException;
/**
 * Fetches the HTML behind a URL (through an HTTP proxy), keeps a copy of the
 * raw HTML via OSSUploader, and returns the main article text with all tags
 * stripped.
 */
class ScrapeUrlBodyTask
{
    /** Disk identifier handed to OSSUploader for reading/writing scraped pages. */
    private const CACHE_DISK = 'r2';

    /** Directory handed to OSSUploader under which scraped pages are stored. */
    private const CACHE_DIR = '/scraped/';

    // NOTE(review): proxy credentials are committed in source. They should be
    // rotated and loaded from configuration/environment instead; the values
    // are kept here unchanged only so this class keeps working as-is.
    private const PROXY_HOST = 'gate.smartproxy.com:10000';
    private const PROXY_USER = 'sp5bbkzj7e';
    private const PROXY_PASSWORD = 'yTtk2cc5kg23kIkSSr';

    /**
     * Return the readable text content of the page at $url.
     *
     * A previously stored copy of the page is used when one can be read;
     * otherwise the page is downloaded through the proxy and stored for
     * next time.
     *
     * NOTE(review): the stored filename is str_slug($url), which presumably
     * drops characters such as "/", ":" and "." — distinct URLs may map to the
     * same file. Confirm whether a hash-based key would be safer.
     *
     * @param string $url Absolute URL of the page to scrape.
     *
     * @return string|null Tag-stripped article text; the raw HTML when the
     *                     article could not be parsed; null when no HTML
     *                     could be obtained at all.
     *
     * Exceptions raised by the HTTP client while fetching (for example a
     * connection failure) and by OSSUploader::uploadFile() are not caught
     * here and propagate to the caller.
     */
    public static function handle(string $url): ?string
    {
        $filename = str_slug($url) . '.html';

        $html = self::readStoredHtml($filename) ?? self::fetchAndStoreHtml($url, $filename);

        // Nothing stored and the download did not succeed: there is nothing
        // to parse, so do not hand null/empty input to Readability.
        if ($html === null) {
            return null;
        }

        return self::extractText($html);
    }

    /**
     * Read a previously stored page, treating any failure as "not stored".
     *
     * @param string $filename File name (slug plus ".html") inside CACHE_DIR.
     *
     * @return string|null The stored HTML, or null when the read throws or
     *                     yields anything other than a non-empty string.
     */
    private static function readStoredHtml(string $filename): ?string
    {
        try {
            $stored = OSSUploader::readFile(self::CACHE_DISK, self::CACHE_DIR, $filename);
        } catch (Exception $e) {
            // Best effort: a failed read simply means we download the page again.
            return null;
        }

        return (is_string($stored) && $stored !== '') ? $stored : null;
    }

    /**
     * Download the page through the proxy and store the raw HTML.
     *
     * @param string $url      Absolute URL to download.
     * @param string $filename File name (slug plus ".html") to store the HTML under.
     *
     * @return string|null The response body, or null when the response was
     *                     not successful or the body was empty.
     */
    private static function fetchAndStoreHtml(string $url, string $filename): ?string
    {
        $proxyUrl = sprintf(
            'http://%s:%s@%s',
            self::PROXY_USER,
            self::PROXY_PASSWORD,
            self::PROXY_HOST
        );

        $response = Http::withOptions(['proxy' => $proxyUrl])->get($url);

        if (!$response->successful()) {
            return null;
        }

        $html = $response->body();

        // An empty body is useless to the parser and would be ignored when
        // read back, so it is neither stored nor returned.
        if ($html === '') {
            return null;
        }

        OSSUploader::uploadFile(self::CACHE_DISK, self::CACHE_DIR, $filename, $html);

        return $html;
    }

    /**
     * Reduce a full HTML document to the plain text of its main content.
     *
     * @param string $html Non-empty raw HTML document.
     *
     * @return string Tag-stripped content extracted by Readability, or the
     *                unmodified $html when Readability cannot parse it.
     */
    private static function extractText(string $html): string
    {
        $readability = new Readability(new Configuration());

        try {
            $readability->parse($html);
        } catch (ParseException $e) {
            // Deliberate fallback kept from the original behaviour: when the
            // article cannot be extracted, callers receive the raw HTML.
            return $html;
        }

        return strip_tags((string) $readability->getContent());
    }
}