237 lines
7.4 KiB
PHP
237 lines
7.4 KiB
PHP
<?php
|
|
|
|
namespace App\Jobs\Tasks;
|
|
|
|
use App\Helpers\FirstParty\OSSUploader\OSSUploader;
|
|
use Exception;
|
|
use Illuminate\Support\Facades\Http;
|
|
use Spatie\Browsershot\Browsershot;
|
|
use Spatie\Browsershot\Exceptions\UnsuccessfulResponse;
|
|
use Symfony\Component\DomCrawler\Crawler;
|
|
|
|
class UrlCrawlerTask
{
    /**
     * Return the HTML of a URL, preferring a previously stored copy over a network fetch.
     *
     * The stored copy is looked up under "<slug>-<postfix>.html" in $directory. When none is
     * available the page is requested through the unblocker proxy and, if a non-empty body
     * comes back, written to storage for the next call.
     *
     * status_code values set by this method:
     *   0  initial value; kept when the stored copy was used, or when reading storage threw
     *      and the fetched body was empty
     *  -1  storage returned null and the fetched body was empty
     *  -3  the HTTP response was not successful but did not raise an exception
     *   1  the page was fetched and stored
     *
     * @param  string  $url          Page to retrieve.
     * @param  mixed  $directory     Storage directory; concatenated with the file name as-is.
     * @param  mixed  $postfix       Suffix for the stored file name; cast to string.
     * @param  bool  $strip_html     Accepted for compatibility; not used by this method.
     * @param  bool  $parse_images   Accepted for compatibility; not used by this method.
     * @return object  Object with a single `response` property holding url, postfix, filename,
     *                 raw_html, jsonld, status_code, costs and total_cost.
     *
     * @throws Exception When the HTTP request fails or the response is a client/server error.
     */
    public static function handle(string $url, $directory, $postfix = null, $strip_html = false, $parse_images = false)
    {
        $postfix = strval($postfix);
        $driver = 'r2';
        $filename = str_slug($url).'-'.$postfix.'.html';
        $disk_url = $directory.$filename;
        $user_agent = config('platform.proxy.user_agent');
        $unblocker_proxy_server = get_smartproxy_unblocker_server();

        $status_code = 0;
        $costs = [];

        // Step 1: try the stored copy. Any storage exception is treated as "not stored".
        try {
            $raw_html = OSSUploader::readFile($driver, $directory, $filename);

            if (is_null($raw_html)) {
                $status_code = -1;
            }
        } catch (Exception $e) {
            $raw_html = null;
        }

        // Step 2: nothing stored, so fetch the live page through the unblocker proxy.
        // (A Browsershot-based renderer and a Google-cache URL were used here previously;
        // the request now goes straight to $url.)
        if (is_null($raw_html)) {
            $response = Http::withHeaders([
                'User-Agent' => $user_agent,
            ])
                ->withOptions([
                    'proxy' => $unblocker_proxy_server,
                    'timeout' => 1000,
                    // NOTE(review): TLS verification is disabled for this request.
                    'verify' => false,
                ])
                ->get($url);

            if ($response->successful()) {
                $raw_html = $response->body();
                $costs['unblocker'] = calculate_smartproxy_cost(round(strlen($raw_html) / 1024, 2), 'unblocker');
            } else {
                $status_code = -3;

                // Raises for client/server errors; other non-2xx responses fall through
                // and are reported via status_code -3 in the returned object.
                $response->throw();
            }

            // is_empty() is a helper defined outside this file; presumably true for
            // null / empty string -- TODO confirm.
            if (! is_empty($raw_html)) {
                OSSUploader::uploadFile($driver, $directory, $filename, $raw_html);
                $status_code = 1;
            }
        }

        $has_html = ! is_null($raw_html);

        return (object) [
            'response' => (object) [
                'url' => $url,
                'postfix' => $postfix,
                'filename' => $has_html ? $disk_url : null,
                'raw_html' => $raw_html,
                'jsonld' => $has_html ? self::getJsonLd($raw_html) : [],
                'status_code' => $status_code,
                'costs' => $costs,
                'total_cost' => array_sum($costs),
            ],
        ];
    }

    /**
     * Extract and decode every <script type="application/ld+json"> block in the HTML.
     *
     * Blocks whose content is not valid JSON are skipped instead of being added as null.
     *
     * @param  string  $raw_html  HTML document to scan.
     * @return array   Decoded JSON-LD values in document order (objects for JSON objects).
     */
    private static function getJsonLd(string $raw_html): array
    {
        $crawler = new Crawler($raw_html);

        try {
            $blocks = $crawler->filter('script[type="application/ld+json"]')->each(function (Crawler $node) {
                return $node->text();
            });
        } catch (Exception $e) {
            return [];
        }

        $contents = [];

        foreach ($blocks as $block) {
            try {
                // JSON_THROW_ON_ERROR makes malformed JSON raise JsonException (an Exception
                // subclass); without it json_decode() just returns null and never throws.
                $contents[] = json_decode($block, false, 512, JSON_THROW_ON_ERROR);
            } catch (Exception $e) {
                continue;
            }
        }

        return $contents;
    }

    /**
     * Minify the HTML and strip presentational markup: class/id/style attributes on every
     * element and all <style> elements.
     *
     * @param  string  $raw_html  HTML document to clean.
     * @return string  The cleaned markup; the minified input when nothing could be parsed.
     */
    private static function minifyAndCleanHtml(string $raw_html): string
    {
        $raw_html = self::minifyHTML($raw_html);

        $crawler = new Crawler($raw_html);

        // Nothing parsed (e.g. empty input): there is no node to clean or serialise.
        if ($crawler->count() === 0) {
            return $raw_html;
        }

        foreach ($crawler as $domElement) {
            if (! $domElement instanceof \DOMElement) {
                continue;
            }

            foreach ($domElement->getElementsByTagName('*') as $node) {
                /** @var \DOMElement $node */
                $node->removeAttribute('class');
                $node->removeAttribute('id');
                $node->removeAttribute('style');
            }

            // Remove <style> elements for this root. Walk backwards so removing an item
            // does not shift the indices still to be visited.
            $styleTags = $domElement->getElementsByTagName('style');
            for ($i = $styleTags->length; --$i >= 0;) {
                $styleNode = $styleTags->item($i);
                $styleNode->parentNode->removeChild($styleNode);
            }
        }

        return $crawler->html();
    }

    /**
     * Collapse whitespace between tags and drop HTML comments.
     *
     * @param  string  $input  Markup to minify.
     * @return string  Minified markup; if a regex step fails, that step is skipped.
     */
    private static function minifyHTML($input): string
    {
        // Remove extra white space between HTML tags.
        $input = preg_replace('/>\s+</', '><', $input) ?? $input;

        // Remove comments. The "s" modifier lets "." span newlines, replacing the
        // backtracking-heavy "(.|\s)*?" alternation.
        $input = preg_replace('/<!--.*?-->/s', '', $input) ?? $input;

        return $input;
    }

    /**
     * Build the Google web-cache URL for a page, with its query string removed first.
     *
     * @param  string  $url        Page URL.
     * @param  bool  $stripHtml    When true, append "&strip=1" to the cache URL.
     * @return string  The webcache.googleusercontent.com lookup URL.
     */
    private static function getGoogleCachedUrl(string $url, $stripHtml = false): string
    {
        $url = self::stripUrlQueryParameters($url);
        $cached_url = "https://webcache.googleusercontent.com/search?q=cache:{$url}";

        if ($stripHtml) {
            $cached_url .= '&strip=1';
        }

        return $cached_url;
    }

    /**
     * Rebuild a URL from scheme, host, optional port, path and fragment, dropping the query.
     *
     * @param  string  $url  Absolute URL.
     * @return string  The URL without its query string; the input unchanged when it cannot be
     *                 parsed or has no scheme/host to rebuild from.
     */
    private static function stripUrlQueryParameters(string $url): string
    {
        $parts = parse_url($url);

        // parse_url() returns false for seriously malformed URLs and omits keys that are
        // absent, so bail out rather than index missing components.
        if ($parts === false || ! isset($parts['scheme'], $parts['host'])) {
            return $url;
        }

        $newUrl = $parts['scheme'].'://'.$parts['host'];

        if (isset($parts['port'])) {
            $newUrl .= ':'.$parts['port'];
        }

        if (isset($parts['path'])) {
            $newUrl .= $parts['path'];
        }

        if (isset($parts['fragment'])) {
            $newUrl .= '#'.$parts['fragment'];
        }

        return $newUrl;
    }
}
|