setUrl($cached_url)
                    ->setOption('args', ['headless: "new"'])
                    ->noSandbox()
                    ->setOption('args', ['--disable-web-security'])
                    ->userAgent($user_agent)
                    ->ignoreHttpsErrors()
                    ->preventUnsuccessfulResponse()
                    ->timeout(10)
                    //->setProxyServer($proxy_server)
                    ->userAgent($user_agent);

                if (app()->environment() == 'local') {
                    $browsershot->setNodeBinary(config('platform.general.node_binary'))->setNpmBinary(config('platform.general.npm_binary'));
                }

                //dump($browsershot);
                $raw_html = $browsershot->bodyHtml();

                // $sizeInKb = strlen($raw_html) / 1024; // Convert bytes to kilobytes
                // $browsershot_cost = round(calculate_smartproxy_cost($sizeInKb)) ;
                // $costs['html'] = $browsershot_cost;
            } catch (UnsuccessfulResponse|Exception $e) {
                $raw_html = null;
                $status_code = -3;
                throw $e;
            }

            if (! is_empty($raw_html)) {
                OSSUploader::uploadFile($driver, $directory, $filename, $raw_html);
                $status_code = 1;
            }
        }

        if (! is_null($raw_html)) {
            $raw_html = self::minifyAndCleanHtml($raw_html);
            $jsonld = self::getJsonLd($raw_html);

            if ($parse_images) {
                $images = self::getImages($raw_html);
                $images = self::filterImages($images, $proxy_server, $user_agent, $costs, $intervention_images);
            } else {
                $images = [];
            }

            $main_image = self::getProductImage($jsonld, $proxy_server, $user_agent, $costs, $main_intervention_image);

            return (object) [
                'intervention' => (object) compact('main_intervention_image', 'intervention_images'),
                'response' => (object) [
                    'url' => $url,
                    'postfix' => $postfix,
                    'filename' => $disk_url,
                    'raw_html' => $raw_html,
                    'jsonld' => $jsonld,
                    'main_image' => $main_image,
                    'images' => $images,
                    'status_code' => $status_code,
                    'costs' => $costs,
                    'total_cost' => array_sum(array_values($costs)),
                ],
            ];
        }

        return (object) [
            'response' => (object) [
                'url' => $url,
                'postfix' => $postfix,
                'filename' => null,
                'raw_html' => null,
                'jsonld' => [],
                'main_image' => null,
                'images' => [],
                'status_code' => $status_code,
                'costs' => $costs,
                'total_cost' => 0,
            ],
        ];
    }

    /**
     * Collect and decode every JSON-LD payload embedded in an HTML document.
     *
     * Looks up all script elements of type "application/ld+json", takes their
     * text content and decodes it with json_decode(). Payloads that are not
     * valid JSON are skipped instead of being stored as null.
     *
     * @param  string  $raw_html  HTML document to scan.
     * @return array  List of decoded payloads (objects, or arrays when the payload
     *                is a top-level JSON array); empty when nothing usable is found
     *                or the DOM query itself fails.
     */
    private static function getJsonLd(string $raw_html)
    {
        $crawler = new Crawler($raw_html);

        try {
            $jsonld = $crawler->filter('script[type="application/ld+json"]')->each(function (Crawler $node) {
                return $node->text();
            });
        } catch (Exception $e) {
            return [];
        }

        $contents = [];
        foreach ($jsonld as $content) {
            // json_decode() reports malformed input through json_last_error(), not by
            // throwing, so a try/catch around it never fires. Check the error code
            // explicitly and drop payloads that failed to parse.
            $decoded = json_decode($content);
            if (json_last_error() !== JSON_ERROR_NONE) {
                continue;
            }

            $contents[] = $decoded;
        }

        return $contents;
    }

    /**
     * List the images referenced by img elements in an HTML document.
     *
     * Elements without a usable `src` attribute are skipped, because the value is
     * later used as a download URL.
     *
     * @param  string  $raw_html  HTML document to scan.
     * @return array  List of ['src' => string, 'alt' => string|null] entries in document order.
     */
    private static function getImages(string $raw_html)
    {
        $crawler = new Crawler($raw_html);

        $images = [];
        $crawler->filter('img')->each(function ($node) use (&$images) {
            $src = $node->attr('src');

            // A missing or blank src cannot be fetched; passing it on would only make
            // the HTTP client fail later.
            if (! is_string($src) || trim($src) === '') {
                return;
            }

            $images[] = [
                'src' => $src,
                'alt' => $node->attr('alt'), // null when the attribute is absent
            ];
        });

        return $images;
    }

    /**
     * Download candidate images and keep only large, distinct, colourful JPEGs.
     *
     * For every candidate the image is fetched through the proxy, decoded, and
     * rejected unless it is a JPEG of at least 800x800 px and 100 KB. Images whose
     * dimensions and mime type match an already accepted image and whose size is
     * within 30 KB of it are treated as duplicates. Finally, images whose
     * unique-colour count is at most 10% of the median count are dropped and the
     * remainder is sorted by size, largest first.
     *
     * @param  array   $images               Entries with at least a 'src' key.
     * @param  string  $proxy                Proxy passed to the HTTP client.
     * @param  string  $user_agent           User-Agent header sent with each download.
     * @param  array   $costs                By reference: receives one 'count-N' entry per
     *                                       accepted image (N = 1-based position in $images).
     * @param  array   $intervention_images  By reference: receives an object with 'image' and
     *                                       'original_name' for every accepted image.
     * @return array  Accepted entries, extended with width, height, mime, sizeKb and color_counts.
     */
    private static function filterImages(array $images, string $proxy, string $user_agent, &$costs, &$intervention_images)
    {
        $filteredImages = [];
        $uniqueAttributes = []; // width/height/mime/size fingerprints of accepted images
        $position = 0;

        foreach ($images as $image) {
            $position++;
            $src = $image['src'] ?? null;

            // A non-string src would raise a TypeError inside the HTTP client, which the
            // Exception handler below does not catch.
            if (! is_string($src) || $src === '') {
                continue;
            }

            try {
                $response = Http::withOptions(['proxy' => $proxy])
                    ->withHeaders(['User-Agent' => $user_agent])
                    ->get($src);

                if (! $response->successful()) {
                    continue;
                }

                $imageData = $response->body();

                // Create an Intervention Image instance from the response data
                $interventionImage = Image::make($imageData);

                $width = $interventionImage->width();
                $height = $interventionImage->height();
                $mime = $interventionImage->mime();
                $sizeKb = round(strlen($imageData) / 1024, 2); // downloaded size in KB

                // Check constraints
                if ($width < 800 || $height < 800 || $sizeKb < 100 || $mime !== 'image/jpeg') {
                    continue;
                }

                $image['width'] = $width;
                $image['height'] = $height;
                $image['mime'] = $mime;
                $image['sizeKb'] = $sizeKb;

                // Same dimensions and mime type with a size within +/- 30 KB counts as a duplicate.
                $isDuplicate = false;
                foreach ($uniqueAttributes as $attr) {
                    if (
                        $attr['width'] == $width &&
                        $attr['height'] == $height &&
                        $attr['mime'] == $mime &&
                        abs($attr['sizeKb'] - $sizeKb) <= 30
                    ) {
                        $isDuplicate = true;
                        break;
                    }
                }

                if ($isDuplicate) {
                    continue;
                }

                $uniqueAttributes[] = [
                    'width' => $width,
                    'height' => $height,
                    'mime' => $mime,
                    'sizeKb' => $sizeKb,
                ];

                $image['color_counts'] = self::isMostlyTextBasedOnUniqueColors($interventionImage);
                $costs['count-'.$position] = calculate_smartproxy_cost($sizeKb);

                $filteredImages[] = $image;
                $intervention_images[] = (object) [
                    'image' => $interventionImage,
                    'original_name' => pathinfo($src, PATHINFO_BASENAME),
                ];
            } catch (\Exception $e) {
                // Best effort: a failed download or an undecodable image just skips this candidate.
                continue;
            }
        }

        // Nothing survived the checks: there is no median to compute, and indexing the
        // empty list below would read undefined offsets.
        if ($filteredImages === []) {
            return [];
        }

        // Median of the unique-colour counts
        $colorCounts = array_column($filteredImages, 'color_counts');
        sort($colorCounts);

        $total = count($colorCounts);
        $middleIndex = intdiv($total, 2);
        $median = $total % 2 === 0
            ? ($colorCounts[$middleIndex - 1] + $colorCounts[$middleIndex]) / 2
            : $colorCounts[$middleIndex];

        // Drop the low outliers: anything at or below 10% of the median colour count.
        // NOTE(review): entries removed here stay in $intervention_images, so the two
        // lists can differ in length and order - confirm that callers expect this.
        $threshold = 0.10 * $median; // Adjust this percentage as needed
        $filteredImages = array_filter($filteredImages, function ($image) use ($threshold) {
            return $image['color_counts'] > $threshold;
        });

        // Largest images first (usort also reindexes the list).
        usort($filteredImages, function ($a, $b) {
            return $b['sizeKb'] <=> $a['sizeKb'];
        });

        return $filteredImages;
    }

    // private static function isImageMostlyText($imageData, $mime) {
    //     try {
    //         $text = (new TesseractOCR)->imageData($imageData, $mime)->run();
    //         $textLength = strlen($text);
    //         // This is a basic check. Adjust the threshold as needed.
    //         return $textLength > 50;
    //     } catch (\Exception $e) {
    //         // Handle any exceptions related to Tesseract OCR
    //         return false;
    //     }
    // }

    /**
     * Download the image of the first usable JSON-LD "Product" entry.
     *
     * An entry is usable when its '@type' is exactly 'Product' and it has both a
     * `url` and an `image`. `image` may be a plain URL string, a list (the first
     * element is used) or an object exposing `url` / `contentUrl`. The downloaded
     * image is fitted to 1920x1080 and handed back through $main_intervention_image.
     *
     * @param  array   $jsonLdData               Decoded JSON-LD payloads.
     * @param  string  $proxy                    Proxy passed to the HTTP client.
     * @param  string  $user_agent               User-Agent header sent with the download.
     * @param  array   $costs                    By reference: receives the 'product_image' cost.
     * @param  mixed   $main_intervention_image  By reference: receives an object with 'image'
     *                                           and 'original_name' on success.
     * @return array|null  ['url' => product url, 'cost' => download cost], or null when no
     *                     product image could be fetched or an exception occurred.
     */
    private static function getProductImage(array $jsonLdData, string $proxy, string $user_agent, &$costs, &$main_intervention_image)
    {
        foreach ($jsonLdData as $data) {
            // Ensure the type is "Product" before proceeding
            if (! isset($data->{'@type'}) || $data->{'@type'} !== 'Product') {
                continue;
            }

            if (! isset($data->url, $data->image)) {
                continue;
            }

            // schema.org allows `image` to be a URL, a list, or an ImageObject. Reduce it
            // to a single URL string so the HTTP client and pathinfo() get what they expect.
            $imageUrl = $data->image;
            if (is_array($imageUrl)) {
                $imageUrl = reset($imageUrl);
            }
            if (is_object($imageUrl)) {
                $imageUrl = $imageUrl->url ?? $imageUrl->contentUrl ?? null;
            }
            if (! is_string($imageUrl) || $imageUrl === '') {
                continue;
            }

            try {
                $response = Http::withOptions(['proxy' => $proxy])
                    ->withHeaders(['User-Agent' => $user_agent])
                    ->get($imageUrl);

                // An unsuccessful response falls through to the next candidate.
                if ($response->successful()) {
                    $imageData = $response->body();

                    // Create an Intervention Image instance from the response data
                    $interventionImage = Image::make($imageData);

                    // Crop/resize to 1920x1080.
                    // NOTE(review): upsize() is documented as preventing enlargement, so
                    // smaller sources are presumably not upscaled - confirm the intent.
                    $interventionImage->fit(1920, 1080, function ($constraint) {
                        $constraint->upsize();
                        $constraint->aspectRatio();
                    });

                    // Size of the re-encoded image in KB.
                    // NOTE(review): filterImages() bills the downloaded size instead;
                    // confirm which one the proxy cost should be based on.
                    $sizeInKb = strlen((string) $interventionImage->encode()) / 1024;

                    // Calculate the cost
                    $cost = calculate_smartproxy_cost($sizeInKb);
                    $costs['product_image'] = $cost;

                    $main_intervention_image = (object) [
                        'image' => $interventionImage,
                        'original_name' => pathinfo($imageUrl, PATHINFO_BASENAME),
                    ];

                    return [
                        'url' => $data->url,
                        //'img' => $interventionImage,
                        'cost' => $cost,
                    ];
                }
            } catch (\Exception $e) {
                // A failed request or an undecodable image aborts the lookup.
                return null;
            }
        }

        return null;
    }

    /**
     * Count the distinct colours of a shrunken, blurred copy of an image.
     *
     * Despite its name this method does not return a boolean: it returns the raw
     * colour count and leaves the interpretation to the caller. The passed image
     * is cloned first, so it is not modified.
     *
     * @param  mixed  $interventionImage  Intervention Image instance to analyse.
     * @return int  Number of distinct pixel values, or 0 when the copy cannot be decoded by GD.
     */
    private static function isMostlyTextBasedOnUniqueColors($interventionImage)
    {
        $img = clone $interventionImage;

        // Resize to a smaller dimension for faster processing (maintaining aspect ratio)
        $img->resize(200, null, function ($constraint) {
            $constraint->aspectRatio();
        });

        // Apply some blur
        $img->blur(10);

        $im = imagecreatefromstring((string) $img->encode());
        if ($im === false) {
            // GD could not read the encoded data; report "no colours" rather than
            // passing false on to imagesx()/imagesy().
            return 0;
        }

        $width = imagesx($im);
        $height = imagesy($im);

        // Use the pixel value as array key so every colour is stored once.
        $uniqueColors = [];
        for ($x = 0; $x < $width; $x++) {
            for ($y = 0; $y < $height; $y++) {
                $uniqueColors[imagecolorat($im, $x, $y)] = true;
            }
        }

        imagedestroy($im);

        // No threshold is applied here; the caller decides what counts as "few" colours.
        return count($uniqueColors);
    }

    private static function minifyAndCleanHtml(string $raw_html)
    {
        $raw_html = TinyMinify::html($raw_html);
        $crawler = new Crawler($raw_html);

        // Directly loop through the DOM and remove 'class' and 'id' attributes
        foreach ($crawler as $domElement) {
            /** @var \DOMNodeList $nodes */
            $nodes = $domElement->getElementsByTagName('*');
            foreach ($nodes as $node) {
                /** @var \DOMElement $node */
                $node->removeAttribute('class');
                $node->removeAttribute('id');
                $node->removeAttribute('style');
            }
        }

        // Remove