Files
productalert/app/Jobs/Tasks/UrlCrawlerTask.php
2023-10-01 20:39:18 +08:00

240 lines
7.5 KiB
PHP

<?php
namespace App\Jobs\Tasks;
use App\Helpers\FirstParty\OSSUploader\OSSUploader;
use Exception;
use Illuminate\Support\Facades\Http;
use Intervention\Image\Facades\Image;
use Minifier\TinyMinify;
use Spatie\Browsershot\Browsershot;
use Spatie\Browsershot\Exceptions\UnsuccessfulResponse;
use Symfony\Component\DomCrawler\Crawler;
use thiagoalessio\TesseractOCR\TesseractOCR;
/**
 * Fetches the HTML of a URL through the unblocker proxy, caches it in object
 * storage and extracts any JSON-LD blocks embedded in the page.
 */
class UrlCrawlerTask
{
    /**
     * Return the HTML for $url, reading it from storage when a stored copy
     * exists and fetching it over HTTP (then storing it) otherwise.
     *
     * The result is an object with a single `response` property holding:
     * `url`, `postfix`, `filename` (storage path, or null when no HTML was
     * obtained), `raw_html`, `jsonld` (decoded JSON-LD blocks), `status_code`,
     * `costs` and `total_cost`.
     *
     * `status_code` values produced here:
     *  -  1: the page was fetched over HTTP and uploaded to storage;
     *  -  0: the HTML came from storage (or the storage read threw and the
     *        fetched body was considered empty, so nothing was uploaded);
     *  - -1: storage returned null and the fetched body was considered empty;
     *  - -3: the HTTP response was not successful but `throw()` did not raise.
     *
     * @param  string  $url  Page to crawl.
     * @param  mixed  $directory  Storage directory; concatenated directly with the file name.
     * @param  mixed  $postfix  Suffix appended to the slugged file name (cast to string).
     * @param  bool  $strip_html  Currently unused.
     * @param  bool  $parse_images  Currently unused.
     * @return object
     *
     * @throws Exception Anything raised by the HTTP client (including the
     *                   exception raised by `$response->throw()`), the cost
     *                   helper or the uploader propagates to the caller.
     *                   Failures while reading from storage are swallowed and
     *                   treated as "not stored".
     */
    public static function handle(string $url, $directory, $postfix = null, $strip_html = false, $parse_images = false)
    {
        $slug = str_slug($url);

        // The Google-cache indirection (self::getGoogleCachedUrl) is disabled;
        // the page is requested directly.
        $cached_url = $url;

        $postfix = strval($postfix);
        $driver = 'r2';
        $filename = $slug.'-'.$postfix.'.html';
        $user_agent = config('platform.proxy.user_agent');
        $disk_url = $directory.$filename;
        $status_code = 0;
        $costs = [];
        $unblocker_proxy_server = get_smartproxy_unblocker_server();

        // NOTE(review): resolved but never used below. Kept because the helper
        // is opaque from here — confirm it has no side effects, then remove.
        $rotating_proxy_server = get_smartproxy_rotating_server();

        // 1. Try the stored copy first. A null result is flagged with -1; an
        //    exception from the storage layer is treated the same as a miss
        //    but leaves the status code untouched.
        try {
            $raw_html = OSSUploader::readFile($driver, $directory, $filename);

            if (is_null($raw_html)) {
                $status_code = -1;
            }
        } catch (Exception $e) {
            $raw_html = null;
        }

        // 2. Nothing stored: fetch through the unblocker proxy.
        //    (A Browsershot-based fetch used to be sketched here as
        //    commented-out code; it was dead and has been dropped.)
        if (is_null($raw_html)) {
            $response = Http::withHeaders([
                'User-Agent' => $user_agent,
            ])
                ->withOptions([
                    'proxy' => $unblocker_proxy_server,
                    // NOTE(review): Guzzle's `timeout` is in seconds, so this
                    // is ~16 minutes — confirm that is intended.
                    'timeout' => 1000,
                    // NOTE(review): disables TLS certificate verification;
                    // presumably required by the unblocker proxy — confirm.
                    'verify' => false,
                ])
                ->get($cached_url);

            if ($response->successful()) {
                $raw_html = $response->body();
                $costs['unblocker'] = calculate_smartproxy_cost(round(strlen($raw_html) / 1024, 2), 'unblocker');
            } else {
                $status_code = -3;
                // Raises for client/server errors; other non-2xx responses
                // fall through with $raw_html still null.
                $response->throw();
            }

            // is_empty() is a project helper (not a PHP built-in); presumably
            // true for null and '' — verify against its definition.
            if (! is_empty($raw_html)) {
                OSSUploader::uploadFile($driver, $directory, $filename, $raw_html);
                $status_code = 1;
            }
        }

        // 3. Build the result. $costs is only ever populated together with a
        //    non-null $raw_html, so array_sum() yields 0 on the failure path.
        $has_html = ! is_null($raw_html);

        return (object) [
            'response' => (object) [
                'url' => $url,
                'postfix' => $postfix,
                'filename' => $has_html ? $disk_url : null,
                'raw_html' => $raw_html,
                'jsonld' => $has_html ? self::getJsonLd($raw_html) : [],
                'status_code' => $status_code,
                'costs' => $costs,
                'total_cost' => array_sum($costs),
            ],
        ];
    }

    /**
     * Extract and decode every `<script type="application/ld+json">` block.
     *
     * Blocks whose content is not valid JSON are skipped instead of being
     * reported as null entries.
     *
     * @param  string  $raw_html  Full HTML document.
     * @return array Decoded JSON-LD values (objects/arrays/scalars as produced by json_decode).
     */
    private static function getJsonLd(string $raw_html)
    {
        $crawler = new Crawler($raw_html);

        try {
            $blocks = $crawler->filter('script[type="application/ld+json"]')->each(function (Crawler $node) {
                return $node->text();
            });
        } catch (Exception $e) {
            return [];
        }

        $contents = [];

        foreach ($blocks as $block) {
            $decoded = json_decode($block);

            // json_decode() signals malformed input through json_last_error(),
            // not through an exception, so check it explicitly.
            if (json_last_error() !== JSON_ERROR_NONE) {
                continue;
            }

            $contents[] = $decoded;
        }

        return $contents;
    }

    /**
     * Minify the HTML, then strip `class`, `id` and `style` attributes and all
     * `<style>` elements from it.
     *
     * The return value is the inner HTML of the first root node of the parsed
     * document (as given by Crawler::html()). When nothing could be parsed the
     * minified input is returned unchanged.
     *
     * @param  string  $raw_html
     * @return string
     */
    private static function minifyAndCleanHtml(string $raw_html)
    {
        $minified = self::minifyHTML($raw_html);
        $crawler = new Crawler($minified);

        // An empty crawler has no nodes to clean and Crawler::html() would
        // throw on it.
        if (count($crawler) === 0) {
            return $minified;
        }

        foreach ($crawler as $rootElement) {
            /** @var \DOMElement $rootElement */
            foreach ($rootElement->getElementsByTagName('*') as $element) {
                /** @var \DOMElement $element */
                $element->removeAttribute('class');
                $element->removeAttribute('id');
                $element->removeAttribute('style');
            }

            // The node list is live, so walk it backwards while removing.
            $styleTags = $rootElement->getElementsByTagName('style');
            for ($i = $styleTags->length - 1; $i >= 0; $i--) {
                $styleNode = $styleTags->item($i);
                $styleNode->parentNode->removeChild($styleNode);
            }
        }

        return $crawler->html();
    }

    /**
     * Remove whitespace between adjacent tags and strip HTML comments.
     *
     * This is a purely textual transformation: whitespace between tags is
     * removed everywhere (including inside elements such as `<pre>`), and
     * anything that looks like `<!-- ... -->` is removed.
     *
     * @param  string  $input
     * @return string The minified markup; if a regex step fails, the result of
     *                the previous step is kept rather than returning null.
     */
    private static function minifyHTML($input)
    {
        // Remove extra white space between HTML tags.
        $collapsed = preg_replace('/>\s+</', '><', $input);
        if ($collapsed === null) {
            $collapsed = $input;
        }

        // Remove comments. The `s` modifier lets `.` span newlines; this
        // avoids the `(.|\s)*?` alternation, which can exhaust PCRE's
        // backtracking/JIT stack limits on long comments.
        $withoutComments = preg_replace('/<!--.*?-->/s', '', $collapsed);

        return $withoutComments === null ? $collapsed : $withoutComments;
    }

    /**
     * Build the Google web-cache URL for a page.
     *
     * @param  string  $url  Page URL; its query string is removed first.
     * @param  bool  $stripHtml  When true, appends `&strip=1` to the cache URL.
     * @return string
     *
     * @throws \InvalidArgumentException When $url has no scheme or host.
     */
    private static function getGoogleCachedUrl(string $url, $stripHtml = false)
    {
        $cached_url = 'https://webcache.googleusercontent.com/search?q=cache:'.self::stripUrlQueryParameters($url);

        if ($stripHtml) {
            $cached_url .= '&strip=1';
        }

        return $cached_url;
    }

    /**
     * Rebuild a URL without its query string.
     *
     * Scheme, host, port, path and fragment are preserved; user/password
     * components are not carried over.
     *
     * @param  string  $url  Absolute URL.
     * @return string
     *
     * @throws \InvalidArgumentException When $url cannot be parsed or lacks a scheme or host.
     */
    private static function stripUrlQueryParameters(string $url)
    {
        $parts = parse_url($url);

        // parse_url() returns false on seriously malformed input and simply
        // omits components that are absent (e.g. for relative URLs).
        if (! is_array($parts) || ! isset($parts['scheme'], $parts['host'])) {
            throw new \InvalidArgumentException('Cannot strip query parameters from a URL without scheme and host: '.$url);
        }

        $newUrl = $parts['scheme'].'://'.$parts['host'];

        if (isset($parts['port'])) {
            $newUrl .= ':'.$parts['port'];
        }

        if (isset($parts['path'])) {
            $newUrl .= $parts['path'];
        }

        if (isset($parts['fragment'])) {
            $newUrl .= '#'.$parts['fragment'];
        }

        return $newUrl;
    }
}