482 lines
17 KiB
PHP
482 lines
17 KiB
PHP
<?php
|
|
|
|
namespace App\Jobs\Tasks;
|
|
|
|
use App\Helpers\FirstParty\OSSUploader\OSSUploader;
|
|
use Exception;
|
|
use Illuminate\Support\Facades\Http;
|
|
use Intervention\Image\Facades\Image;
|
|
use Minifier\TinyMinify;
|
|
use Spatie\Browsershot\Browsershot;
|
|
use Spatie\Browsershot\Exceptions\UnsuccessfulResponse;
|
|
use Symfony\Component\DomCrawler\Crawler;
|
|
use thiagoalessio\TesseractOCR\TesseractOCR;
|
|
|
|
class UrlCrawlerTask
|
|
{
|
|
/**
 * Load a page's HTML (from storage when a copy exists, otherwise by downloading it),
 * clean it, and extract its JSON-LD blocks and image data.
 *
 * The stored copy is looked up as "<slug(url)>-<postfix>.html" via the `r2` driver.
 * On a miss the URL is requested through the proxy returned by
 * get_smartproxy_unblocker_server() and any non-empty body is stored back.
 *
 * Status codes written to the result:
 *    0  initial value (left untouched when the stored copy is used)
 *   -1  no stored copy was found
 *   -3  the download produced no usable body
 *    1  the page was downloaded and stored
 *
 * @param string $url          Page to crawl.
 * @param mixed  $directory    Storage directory; it is concatenated directly with the
 *                             filename, so presumably it must end with a separator — TODO confirm.
 * @param mixed  $postfix      Cast to string and appended to the slug in the filename.
 * @param bool   $strip_html   Currently unused; kept so existing callers keep working.
 * @param bool   $parse_images When true, <img> elements are collected, downloaded and filtered.
 *
 * @return object Object with two members:
 *                - intervention: { main_intervention_image, intervention_images }
 *                - response: { url, postfix, filename, raw_html, jsonld, main_image,
 *                              images, status_code, costs, total_cost }
 *
 * @throws Exception Whatever the HTTP client raises (connection errors, 4xx/5xx responses).
 */
public static function handle(string $url, $directory, $postfix = null, $strip_html = false, $parse_images = false)
{
    $driver = 'r2';
    $postfix = strval($postfix);
    $filename = str_slug($url).'-'.$postfix.'.html';
    $disk_url = $directory.$filename;

    // The Google-cache lookup is disabled; the page itself is requested.
    $cached_url = $url; // self::getGoogleCachedUrl($url, false);

    $user_agent = config('platform.proxy.user_agent');
    $unblocker_proxy_server = get_smartproxy_unblocker_server();
    $rotating_proxy_server = get_smartproxy_rotating_server();

    $status_code = 0;
    $costs = [];
    $main_intervention_image = null;
    $intervention_images = [];

    // 1) Try the stored copy first.
    try {
        $raw_html = OSSUploader::readFile($driver, $directory, $filename);

        if (is_null($raw_html)) {
            $status_code = -1;
        }
    } catch (Exception $e) {
        // A storage failure is handled like a miss: fall through to the download.
        $raw_html = null;
    }

    // 2) Nothing stored: download the page. HTTP client exceptions propagate to the caller.
    if (is_null($raw_html)) {
        $response = Http::withHeaders([
            'User-Agent' => $user_agent,
        ])
            ->withOptions([
                'proxy' => $unblocker_proxy_server,
                // NOTE(review): Guzzle timeouts are in seconds, so this allows ~16 minutes — confirm intended.
                'timeout' => 1000,
                // TLS certificate verification is switched off for this request.
                'verify' => false,
            ])
            ->get($cached_url);

        if ($response->successful()) {
            $raw_html = $response->body();
        } else {
            $status_code = -3;

            // throw() raises for client/server errors; any other non-2xx status
            // falls through to the empty result below.
            $response->throw();
        }

        if (is_empty($raw_html)) {
            // An empty body is useless to the parsers below; report it as a failed download.
            $raw_html = null;
            $status_code = -3;
        } else {
            OSSUploader::uploadFile($driver, $directory, $filename, $raw_html);
            $status_code = 1;
        }
    }

    // 3) No HTML available: return an empty result with the same shape as the success result.
    if (is_null($raw_html)) {
        return (object) [
            'intervention' => (object) [
                'main_intervention_image' => null,
                'intervention_images' => [],
            ],
            'response' => (object) [
                'url' => $url,
                'postfix' => $postfix,
                'filename' => null,
                'raw_html' => null,
                'jsonld' => [],
                'main_image' => null,
                'images' => [],
                'status_code' => $status_code,
                'costs' => $costs,
                'total_cost' => 0,
            ],
        ];
    }

    // 4) Clean the markup and extract structured data.
    $raw_html = self::minifyAndCleanHtml($raw_html);
    $jsonld = self::getJsonLd($raw_html);

    $images = [];

    if ($parse_images) {
        // $costs and $intervention_images are filled in by reference.
        $images = self::filterImages(
            self::getImages($raw_html),
            $rotating_proxy_server,
            $user_agent,
            $costs,
            $intervention_images
        );
    }

    // $costs and $main_intervention_image are filled in by reference.
    $main_image = self::getProductImage($jsonld, $rotating_proxy_server, $user_agent, $costs, $main_intervention_image);

    return (object) [
        'intervention' => (object) [
            'main_intervention_image' => $main_intervention_image,
            'intervention_images' => $intervention_images,
        ],
        'response' => (object) [
            'url' => $url,
            'postfix' => $postfix,
            'filename' => $disk_url,
            'raw_html' => $raw_html,
            'jsonld' => $jsonld,
            'main_image' => $main_image,
            'images' => $images,
            'status_code' => $status_code,
            'costs' => $costs,
            'total_cost' => array_sum($costs),
        ],
    ];
}
|
|
|
|
/**
 * Decode the contents of every <script type="application/ld+json"> element.
 *
 * @param string $raw_html HTML document to search.
 *
 * @return array Decoded JSON-LD values in document order (objects or arrays, as
 *               json_decode() produces them). Blocks that cannot be decoded are skipped.
 */
private static function getJsonLd(string $raw_html)
{
    $crawler = new Crawler($raw_html);

    try {
        $scripts = $crawler->filter('script[type="application/ld+json"]')->each(function (Crawler $node) {
            return $node->text();
        });
    } catch (Exception $e) {
        return [];
    }

    $contents = [];

    foreach ($scripts as $script) {
        $decoded = json_decode($script);

        // json_decode() reports malformed input by returning null instead of throwing,
        // so a try/catch would never fire; drop those entries explicitly.
        if ($decoded === null) {
            continue;
        }

        $contents[] = $decoded;
    }

    return $contents;
}
|
|
|
|
/**
 * Collect the `src` and `alt` attributes of every <img> element.
 *
 * Images whose `src` contains one of the blacklisted fragments are left out, as are
 * images without a `src`. Relative `src` values are returned unchanged.
 *
 * @param string $raw_html HTML document to search.
 *
 * @return array List of ['src' => string, 'alt' => string|null].
 */
private static function getImages(string $raw_html)
{
    $crawler = new Crawler($raw_html);

    // Substrings of image URLs that must be ignored; currently empty, so nothing is excluded.
    $blacklist_domain = [];

    $images = [];

    $crawler->filter('img')->each(function ($node) use (&$images, $blacklist_domain) {
        $src = $node->attr('src');

        if (is_null($src) || $src === '') {
            return;
        }

        // Reject the image as soon as one blacklist entry matches. The acceptance must
        // happen outside this loop, otherwise an empty blacklist accepts nothing.
        foreach ($blacklist_domain as $blacklist) {
            if (str_contains($src, $blacklist)) {
                return;
            }
        }

        $images[] = [
            'src' => $src,
            'alt' => $node->attr('alt'), // null when the attribute is absent
        ];
    });

    return $images;
}
|
|
|
|
/**
 * Download candidate images and keep only large, distinct, colour-rich ones.
 *
 * An image is kept when:
 *  - its download succeeds and it can be decoded,
 *  - it is at least 800x800 px and at least 100 KB,
 *  - no earlier kept image has the same width, height and MIME type with a size within 30 KB,
 *  - its unique-colour count is above 10% of the median count of all kept images.
 *
 * Images that fail at any step are skipped silently.
 *
 * @param array  $images              List of ['src' => ..., 'alt' => ...] entries.
 * @param string $proxy               Proxy passed to the HTTP client.
 * @param string $user_agent          User-Agent header value.
 * @param array  $costs               By reference: receives a 'count-<n>' entry per accepted
 *                                    download, n being the 1-based position in $images.
 * @param array  $intervention_images By reference: receives one { image, original_name } object per
 *                                    returned image, in the same order as the return value.
 *
 * @return array The surviving entries, extended with width, height, mime, sizeKb and
 *               color_counts, sorted by sizeKb in descending order.
 */
private static function filterImages(array $images, string $proxy, string $user_agent, &$costs, &$intervention_images)
{
    // Metadata and image object travel together so that filtering and sorting keep them aligned.
    $candidates = [];
    $uniqueAttributes = []; // width/height/mime/size of every image accepted so far
    $position = 0;

    foreach ($images as $image) {
        $position++;

        $src = $image['src'];

        try {
            $response = Http::withOptions(['proxy' => $proxy, 'verify' => false])
                ->withHeaders(['User-Agent' => $user_agent])
                ->get($src);

            if (! $response->successful()) {
                continue;
            }

            $imageData = $response->body();
            $interventionImage = Image::make($imageData);

            $width = $interventionImage->width();
            $height = $interventionImage->height();
            $mime = $interventionImage->mime();
            $sizeKb = round(strlen($imageData) / 1024, 2);

            // Too small to be useful.
            if ($width < 800 || $height < 800 || $sizeKb < 100) {
                continue;
            }

            // Same dimensions and type with a size within +/- 30 KB counts as a duplicate.
            $isDuplicate = false;

            foreach ($uniqueAttributes as $attr) {
                if (
                    $attr['width'] == $width &&
                    $attr['height'] == $height &&
                    $attr['mime'] == $mime &&
                    abs($attr['sizeKb'] - $sizeKb) <= 30
                ) {
                    $isDuplicate = true;
                    break;
                }
            }

            if ($isDuplicate) {
                continue;
            }

            $uniqueAttributes[] = [
                'width' => $width,
                'height' => $height,
                'mime' => $mime,
                'sizeKb' => $sizeKb,
            ];

            $image['width'] = $width;
            $image['height'] = $height;
            $image['mime'] = $mime;
            $image['sizeKb'] = $sizeKb;
            $image['color_counts'] = self::isMostlyTextBasedOnUniqueColors($interventionImage);

            $costs['count-'.$position] = calculate_smartproxy_cost($sizeKb);

            $candidates[] = [
                'meta' => $image,
                'object' => (object) [
                    'image' => $interventionImage,
                    'original_name' => pathinfo($src, PATHINFO_BASENAME),
                ],
            ];
        } catch (\Exception $e) {
            // Download or decoding failed: skip this image.
            continue;
        }
    }

    if (empty($candidates)) {
        return [];
    }

    // Median of the unique-colour counts.
    $colorCounts = [];

    foreach ($candidates as $candidate) {
        $colorCounts[] = $candidate['meta']['color_counts'];
    }

    sort($colorCounts);

    $total = count($colorCounts);
    $middleIndex = intdiv($total, 2);
    $median = $total % 2 === 0
        ? ($colorCounts[$middleIndex - 1] + $colorCounts[$middleIndex]) / 2
        : $colorCounts[$middleIndex];

    // Drop low outliers: images with very few colours compared to the rest.
    $threshold = 0.10 * $median; // Adjust this percentage as needed

    $candidates = array_filter($candidates, function ($candidate) use ($threshold) {
        return $candidate['meta']['color_counts'] > $threshold;
    });

    // Largest files first.
    usort($candidates, function ($a, $b) {
        return $b['meta']['sizeKb'] <=> $a['meta']['sizeKb'];
    });

    $filteredImages = [];

    foreach ($candidates as $candidate) {
        $filteredImages[] = $candidate['meta'];
        $intervention_images[] = $candidate['object'];
    }

    return $filteredImages;
}
|
|
|
|
// private static function isImageMostlyText($imageData, $mime) {
|
|
// try {
|
|
// $text = (new TesseractOCR)->imageData($imageData, $mime)->run();
|
|
// $textLength = strlen($text);
|
|
|
|
// // This is a basic check. Adjust the threshold as needed.
|
|
// return $textLength > 50;
|
|
// } catch (\Exception $e) {
|
|
// // Handle any exceptions related to Tesseract OCR
|
|
// return false;
|
|
// }
|
|
// }
|
|
|
|
/**
 * Download the image of the first usable JSON-LD "Product" entry.
 *
 * An entry is considered when its `@type` is (or contains) "Product" and it has both
 * `url` and `image`. Entries whose image request is unsuccessful are skipped; an
 * exception during download or decoding ends the search.
 *
 * @param array  $jsonLdData              Decoded JSON-LD values.
 * @param string $proxy                   Proxy passed to the HTTP client.
 * @param string $user_agent              User-Agent header value.
 * @param array  $costs                   By reference: receives a 'product_image' entry.
 * @param mixed  $main_intervention_image By reference: receives { image, original_name }.
 *
 * @return array|null ['url' => product URL, 'cost' => cost] or null when no image was obtained.
 */
private static function getProductImage(array $jsonLdData, string $proxy, string $user_agent, &$costs, &$main_intervention_image)
{
    foreach ($jsonLdData as $data) {
        if (! is_object($data) || ! isset($data->{'@type'}, $data->url, $data->image)) {
            continue;
        }

        // "@type" may be a single string or a list of types.
        if (! in_array('Product', (array) $data->{'@type'}, true)) {
            continue;
        }

        // "image" may be a URL, a list, or an object carrying a "url"; passing anything
        // but a string to the HTTP client raises a TypeError the catch below cannot handle.
        $imageUrl = self::resolveJsonLdImageUrl($data->image);

        if ($imageUrl === null) {
            continue;
        }

        try {
            $response = Http::withOptions(['proxy' => $proxy, 'verify' => false])
                ->withHeaders(['User-Agent' => $user_agent])
                ->get($imageUrl);

            if (! $response->successful()) {
                continue;
            }

            $interventionImage = Image::make($response->body());

            // Crop to a 1920x1080 frame; upsize() keeps smaller images from being enlarged.
            $interventionImage->fit(1920, 1080, function ($constraint) {
                $constraint->upsize();
                $constraint->aspectRatio();
            });

            // NOTE(review): the cost is derived from the re-encoded, resized image rather than
            // from the downloaded payload (filterImages uses the downloaded size) — confirm intended.
            $sizeInKb = strlen($interventionImage->encode()) / 1024;
            $cost = calculate_smartproxy_cost($sizeInKb);

            $costs['product_image'] = $cost;

            $main_intervention_image = (object) [
                'image' => $interventionImage,
                'original_name' => pathinfo($imageUrl, PATHINFO_BASENAME),
            ];

            return [
                'url' => $data->url,
                'cost' => $cost,
            ];
        } catch (\Exception $e) {
            // Download or decoding failed.
            return null;
        }
    }

    return null;
}

/**
 * Reduce a JSON-LD "image" value to a single URL string.
 *
 * Strings are returned as they are, lists yield their first resolvable entry, and
 * objects are resolved through their "url" member. Anything else yields null.
 *
 * @param mixed $image
 *
 * @return string|null
 */
private static function resolveJsonLdImageUrl($image)
{
    if (is_string($image)) {
        return $image;
    }

    if (is_array($image)) {
        foreach ($image as $entry) {
            $resolved = self::resolveJsonLdImageUrl($entry);

            if ($resolved !== null) {
                return $resolved;
            }
        }

        return null;
    }

    if (is_object($image) && isset($image->url)) {
        return self::resolveJsonLdImageUrl($image->url);
    }

    return null;
}
|
|
|
|
/**
 * Count the distinct pixel values of a shrunken, blurred copy of an image.
 *
 * Despite its name this method returns a count, not a boolean; the caller compares the
 * counts of several images to decide which ones are outliers. The image passed in is
 * not modified — all work happens on a clone.
 *
 * @param mixed $interventionImage Image object offering resize(), blur() and encode().
 *
 * @return int Number of distinct values returned by imagecolorat() over all pixels.
 *
 * @throws \RuntimeException When the encoded copy cannot be read back by GD.
 */
private static function isMostlyTextBasedOnUniqueColors($interventionImage)
{
    $sample = clone $interventionImage;

    // Shrink to 200 px wide (height follows the aspect ratio) for faster processing.
    $sample->resize(200, null, function ($constraint) {
        $constraint->aspectRatio();
    });

    $sample->blur(10);

    $gd = imagecreatefromstring((string) $sample->encode());

    // imagecreatefromstring() returns false for data it cannot decode; fail with an
    // Exception subclass so the caller's catch block skips the image.
    if ($gd === false) {
        throw new \RuntimeException('Unable to decode image for colour analysis.');
    }

    $width = imagesx($gd);
    $height = imagesy($gd);

    // Pixel values are used as array keys, so duplicates collapse automatically.
    $uniqueColors = [];

    for ($x = 0; $x < $width; $x++) {
        for ($y = 0; $y < $height; $y++) {
            $uniqueColors[imagecolorat($gd, $x, $y)] = true;
        }
    }

    imagedestroy($gd);

    return count($uniqueColors);
}
|
|
|
|
/**
 * Minify an HTML document and strip presentational markup from it.
 *
 * After minification every descendant element loses its `class`, `id` and `style`
 * attributes and all <style> elements are removed.
 *
 * @param string $raw_html HTML document.
 *
 * @return string The value of Crawler::html() for the cleaned document, or the minified
 *                input when it contains no nodes at all.
 */
private static function minifyAndCleanHtml(string $raw_html)
{
    $raw_html = TinyMinify::html($raw_html);

    $crawler = new Crawler($raw_html);

    // Nothing parsed (e.g. empty input): there is no node to clean or to serialise.
    if ($crawler->count() === 0) {
        return $raw_html;
    }

    foreach ($crawler as $domElement) {
        if (! $domElement instanceof \DOMElement) {
            continue;
        }

        /** @var \DOMElement $node */
        foreach ($domElement->getElementsByTagName('*') as $node) {
            $node->removeAttribute('class');
            $node->removeAttribute('id');
            $node->removeAttribute('style');
        }

        // Remove <style> elements for this root. The node list is live, so walk it
        // backwards to keep the remaining indexes valid while removing.
        $styleTags = $domElement->getElementsByTagName('style');

        for ($i = $styleTags->length - 1; $i >= 0; $i--) {
            $styleNode = $styleTags->item($i);
            $styleNode->parentNode->removeChild($styleNode);
        }
    }

    return $crawler->html();
}
|
|
|
|
/**
 * Build the Google web-cache address for a URL.
 *
 * The query string of $url is removed first; "&strip=1" is appended when $stripHtml is truthy.
 *
 * @param string $url       Page URL.
 * @param bool   $stripHtml Whether to append the "strip" flag.
 *
 * @return string
 */
private static function getGoogleCachedUrl(string $url, $stripHtml = false)
{
    $target = self::stripUrlQueryParameters($url);
    $suffix = $stripHtml ? '&strip=1' : '';

    return 'https://webcache.googleusercontent.com/search?q=cache:'.$target.$suffix;
}
|
|
|
|
/**
 * Rebuild an absolute URL without its query string.
 *
 * Scheme, host, port, path and fragment are kept. User information (user:pass@) is
 * not carried over, matching the previous behaviour.
 *
 * @param string $url Absolute URL.
 *
 * @return string
 *
 * @throws \InvalidArgumentException When the URL cannot be parsed or lacks a scheme or host.
 */
private static function stripUrlQueryParameters(string $url)
{
    $parts = parse_url($url);

    // parse_url() returns false for seriously malformed input and omits components
    // that are absent, so both cases must be checked before indexing.
    if ($parts === false || ! isset($parts['scheme'], $parts['host'])) {
        throw new \InvalidArgumentException("Cannot strip query parameters from a non-absolute URL: {$url}");
    }

    $newUrl = $parts['scheme'].'://'.$parts['host'];

    // Without the port the rebuilt URL would point at a different endpoint.
    if (isset($parts['port'])) {
        $newUrl .= ':'.$parts['port'];
    }

    if (isset($parts['path'])) {
        $newUrl .= $parts['path'];
    }

    if (isset($parts['fragment'])) {
        $newUrl .= '#'.$parts['fragment'];
    }

    return $newUrl;
}
|
|
}
|