diff --git a/app/Helpers/FirstParty/OpenAI/OpenAI.php b/app/Helpers/FirstParty/OpenAI/OpenAI.php index 672f241..ecae975 100644 --- a/app/Helpers/FirstParty/OpenAI/OpenAI.php +++ b/app/Helpers/FirstParty/OpenAI/OpenAI.php @@ -20,7 +20,7 @@ public static function writeProductArticle($excerpt, $photos) - Use the following sections whenever applicable:\n -- ### Introduction\n -- ### Overview\n - -- ### Specifications (use valid Markdown table format with header and seperator when possible) and explanation\n + -- ### Specifications (use valid Markdown table format with header and seperator) + explanation\n -- ### Price\n -- ### Should I Buy?\n - do not make up facts, use facts provided by excerpt only\n diff --git a/app/Jobs/Tasks/GenerateShopeeAIArticleTask.php b/app/Jobs/Tasks/GenerateShopeeAIArticleTask.php index e3587eb..4fe3d71 100644 --- a/app/Jobs/Tasks/GenerateShopeeAIArticleTask.php +++ b/app/Jobs/Tasks/GenerateShopeeAIArticleTask.php @@ -34,6 +34,8 @@ public static function handle(ShopeeSellerScrape $shopee_seller_scrape) $shopee_task->shopee_seller_scrape = $shopee_seller_scrape; } + + // dd($shopee_task); // dd($shopee_task->product_task->response); @@ -42,6 +44,8 @@ public static function handle(ShopeeSellerScrape $shopee_seller_scrape) $excerpt = self::stripHtml($raw_html); + $excerpt .= self::getProductPricingExcerpt($shopee_task->product_task->response->jsonld); + $photos = ShopeeSellerScrapedImage::where('shopee_seller_scrape_id', $shopee_seller_scrape->id)->where('featured', false)->orderByRaw('RAND()')->take(3)->get()->pluck('image')->toArray(); $ai_writeup = AiWriteup::where('source', 'shopee')->where('source_url', $shopee_task->product_task->response->url)->first(); @@ -120,12 +124,38 @@ public static function handle(ShopeeSellerScrape $shopee_seller_scrape) return $post; } + private static function getProductPricingExcerpt(array $jsonLdData) + { + foreach ($jsonLdData as $data) { + // Ensure the type is "Product" before proceeding + if (isset($data->{'@type'}) && $data->{'@type'} === 'Product') { + + // Extract necessary data + $lowPrice = $data->offers->lowPrice ?? null; + $highPrice = $data->offers->highPrice ?? null; + $price = $data->offers->price ?? null; + $currency = $data->offers->priceCurrency ?? null; + $sellerName = $data->offers->seller->name ?? "online store"; // default to "online store" if name is not set + + // Determine and format pricing sentence + if ($lowPrice && $highPrice) { + return "Price Range from {$currency} {$lowPrice} to {$highPrice} in {$sellerName}"; + } elseif ($price) { + return "Priced at {$currency} {$price} in {$sellerName}"; + } else { + return "Price not stated, refer to {$sellerName}"; + } + + } + } + } + private static function getTotalServiceCost($shopee_task) { $cost = 0.00; - $cost += 0.06; // chatgpt-3.5-turbo $0.03 for 1k, writing for 2k tokens + $cost += 0.09; // chatgpt-3.5-turbo $0.03 for 1k, writing for 2k tokens // Shopee Seller Scraping if (isset($shopee_task?->seller_shop_task?->response?->total_cost)) { diff --git a/app/Jobs/Tasks/SaveShopeeSellerImagesTask.php b/app/Jobs/Tasks/SaveShopeeSellerImagesTask.php index 6ecd462..f55540b 100644 --- a/app/Jobs/Tasks/SaveShopeeSellerImagesTask.php +++ b/app/Jobs/Tasks/SaveShopeeSellerImagesTask.php @@ -8,68 +8,46 @@ use Illuminate\Support\Str; use Intervention\Image\Facades\Image; use Symfony\Component\DomCrawler\Crawler; +use Masterminds\HTML5; + class SaveShopeeSellerImagesTask { public static function handle($shopee_task) { - $main_intervention_image = null; - $intervention_images = []; - $costs = []; - $main_image_url = null; $unblocker_proxy_server = get_smartproxy_unblocker_server(); $rotating_proxy_server = get_smartproxy_rotating_server(); - + $costs = []; $user_agent = config('platform.proxy.user_agent'); ///////// PART 1 + $main_image_url = self::getProductImageUrl($shopee_task->product_task->response->jsonld); - // If there is a main intervention image, then set in, else get the url only. - if (isset($shopee_task?->product_task?->intervention?->main_intervention_image)) { - $main_intervention_image = $shopee_task->product_task->intervention->main_intervention_image; - } else { - $main_image_url = self::getProductImageUrl($shopee_task->product_task->response->jsonld); - } - - // If there is other image interventions set, then set in, else get the image urls only. - if (isset($shopee_task?->product_task?->intervention?->intervention_images)) { - $intervention_images = $shopee_task->product_task->intervention->intervention_images; - } else { - $images = self::getImages($shopee_task->product_task->response->raw_html); - $images = self::filterImages($images, $rotating_proxy_server, $user_agent, $costs, $intervention_images); - } - - ///////// PART 2 - - // Check existence and upload if image intervention is set - if (! is_null($main_intervention_image)) { - $scraped_image = ShopeeSellerScrapedImage::where('original_name', $main_intervention_image->original_name)->where('shopee_seller_scrape_id', $shopee_task->shopee_seller_scrape->id)->first(); - - if (is_null($scraped_image)) { - $scraped_image = self::uploadAndSaveScrapedImage($shopee_task->shopee_seller_scrape, $main_intervention_image, true); - } - } // if there is no main image intervention but the main image url is provided - elseif (! is_empty($main_image_url)) { + if (! is_empty($main_image_url)) { $scraped_image = ShopeeSellerScrapedImage::where('original_name', pathinfo($main_image_url, PATHINFO_BASENAME))->where('shopee_seller_scrape_id', $shopee_task->shopee_seller_scrape->id)->first(); if (is_null($scraped_image)) { - $main_image = self::getProductImage($shopee_task->product_task->response->jsonld, $rotating_proxy_server, $user_agent, $costs, $main_intervention_image); + $main_image = self::getProductImage($shopee_task->product_task->response->jsonld, $rotating_proxy_server, $user_agent, $costs); - $scraped_image = self::uploadAndSaveScrapedImage($shopee_task->shopee_seller_scrape, $main_intervention_image, true); + $scraped_image = self::uploadAndSaveScrapedImage($shopee_task->shopee_seller_scrape, $main_image, true); } } - /////// PART 3 + /////// PART 2 - if (! is_null($intervention_images) && is_array($intervention_images) && count($intervention_images) > 0) { - foreach ($intervention_images as $intervention_image) { - $scraped_image = ShopeeSellerScrapedImage::where('original_name', $intervention_image->original_name)->where('shopee_seller_scrape_id', $shopee_task->shopee_seller_scrape->id)->first(); + $images = self::getFilteredImages($shopee_task->product_task->response->raw_html, $rotating_proxy_server, $user_agent, $costs); + + //dd($images); + + if (! is_null($images) && is_array($images) && count($images) > 0) { + foreach ($images as $image_obj) { + $scraped_image = ShopeeSellerScrapedImage::where('original_name', $image_obj->original_name)->where('shopee_seller_scrape_id', $shopee_task->shopee_seller_scrape->id)->first(); if (is_null($scraped_image)) { - $scraped_image = self::uploadAndSaveScrapedImage($shopee_task->shopee_seller_scrape, $intervention_image, false); + $scraped_image = self::uploadAndSaveScrapedImage($shopee_task->shopee_seller_scrape, $image_obj, false); } } } @@ -78,7 +56,7 @@ public static function handle($shopee_task) } - private static function uploadAndSaveScrapedImage($shopee_seller_scrape, $intervention_image, $featured = false) + private static function uploadAndSaveScrapedImage($shopee_seller_scrape, $image_obj, $featured = false) { // Generate a unique filename for the uploaded file and LQIP version $uuid = Str::uuid()->toString(); @@ -86,7 +64,7 @@ private static function uploadAndSaveScrapedImage($shopee_seller_scrape, $interv $lqipFileName = time().'_'.$uuid.'_lqip.jpg'; // Convert the file to JPEG format using Intervention Image library - $image = $intervention_image->image; + $image = $image_obj->intervention; // Get the original image width and height $originalWidth = $image->width(); @@ -102,7 +80,7 @@ private static function uploadAndSaveScrapedImage($shopee_seller_scrape, $interv // Save the original image to a temporary file and open it again $tempImagePath = tempnam(sys_get_temp_dir(), 'temp_image'); - file_put_contents($tempImagePath, $intervention_image->image->encode()); + file_put_contents($tempImagePath, $image_obj->intervention->encode()); $clonedImage = Image::make($tempImagePath); // Create the LQIP version of the image using a small size while maintaining the aspect ratio @@ -120,7 +98,7 @@ private static function uploadAndSaveScrapedImage($shopee_seller_scrape, $interv $scraped_image = new ShopeeSellerScrapedImage; $scraped_image->shopee_seller_scrape_id = $shopee_seller_scrape->id; - $scraped_image->original_name = $intervention_image->original_name; + $scraped_image->original_name = $image_obj->original_name; $scraped_image->image = $url; $scraped_image->featured = $featured; @@ -131,32 +109,36 @@ private static function uploadAndSaveScrapedImage($shopee_seller_scrape, $interv return null; } - private static function getImages(string $raw_html) + private static function getImageUrls(string $raw_html) { - $crawler = new Crawler($raw_html); $images = []; - $crawler->filter('img')->each(function ($node) use (&$images) { - $src = $node->attr('src'); - $alt = $node->attr('alt') ?? null; // Setting a default value if alt is not present + // Pattern for extracting src and alt attributes from img tags + $pattern = '/]*>/is'; - $blacklist_domain = []; + if (preg_match_all($pattern, $raw_html, $matches, PREG_SET_ORDER)) { + foreach ($matches as $match) { + $src = $match[1]; - foreach ($blacklist_domain as $blacklist) { - if (! str_contains($src, $blacklist)) { - $images[] = [ - 'src' => $src, - 'alt' => $alt, - ]; - } + // Check if image file name ends with '_tn' and remove it + $src = preg_replace('/_tn(\.[a-z]+)?$/i', '$1', $src); + + $images[] = [ + 'src' => $src, + 'alt' => isset($match[2]) ? $match[2] : null, + ]; } - }); + } return $images; } - private static function filterImages(array $images, string $proxy, string $user_agent, &$costs, &$intervention_images) + private static function getFilteredImages(string $raw_html, string $proxy, string $user_agent, &$costs) { + $images = self::getImageUrls($raw_html); + + //dd($images); + $filteredImages = []; $uniqueAttributes = []; // This array will track unique width, height, mime, and size combinations @@ -188,7 +170,7 @@ private static function filterImages(array $images, string $proxy, string $user_ $sizeKb = round(strlen($imageData) / 1024, 2); // Check constraints - if ($width < 300 || $height < 300) { + if ($width < 800 || $height < 800 || $sizeKb < 100 || $mime !== 'image/jpeg') { continue; } @@ -225,16 +207,15 @@ private static function filterImages(array $images, string $proxy, string $user_ 'mime' => $mime, 'sizeKb' => $sizeKb, ]; - $image['color_counts'] = self::isMostlyTextBasedOnUniqueColors($interventionImage); + $image['color_counts'] = self::getImageColorCounts($interventionImage); + + $image['intervention'] = $interventionImage; + $image['original_name'] = pathinfo($src, PATHINFO_BASENAME); + //$image['img'] = $interventionImage; $costs['count-'.$count] = calculate_smartproxy_cost($sizeKb, 'rotating_global'); $filteredImages[] = $image; - - $intervention_images[] = (object) [ - 'image' => $interventionImage, - 'original_name' => pathinfo($src, PATHINFO_BASENAME), - ]; } } catch (\Exception $e) { // Handle exceptions related to the HTTP request @@ -269,7 +250,14 @@ private static function filterImages(array $images, string $proxy, string $user_ return $b['sizeKb'] <=> $a['sizeKb']; // Using the spaceship operator to sort in descending order }); - return $filteredImages; + $final_images = []; + + foreach ($filteredImages as $image_obj) + { + $final_images[] = (object) $image_obj; + } + + return $final_images; } private static function getProductImageUrl(array $jsonLdData) @@ -284,7 +272,7 @@ private static function getProductImageUrl(array $jsonLdData) } } - private static function getProductImage(array $jsonLdData, string $proxy, string $user_agent, &$costs, &$main_intervention_image) + private static function getProductImage(array $jsonLdData, string $proxy, string $user_agent, &$costs) { foreach ($jsonLdData as $data) { // Ensure the type is "Product" before proceeding @@ -313,14 +301,10 @@ private static function getProductImage(array $jsonLdData, string $proxy, string $costs['product_image'] = $cost; - $main_intervention_image = (object) [ - 'image' => $interventionImage, - 'original_name' => pathinfo($data->image, PATHINFO_BASENAME), - ]; - - return [ + return (object) [ 'url' => $data->url, - //'img' => $interventionImage, + 'intervention' => $interventionImage, + 'original_name' => pathinfo($data->image, PATHINFO_BASENAME), 'cost' => $cost, ]; } @@ -335,7 +319,7 @@ private static function getProductImage(array $jsonLdData, string $proxy, string return null; } - private static function isMostlyTextBasedOnUniqueColors($interventionImage) + private static function getImageColorCounts($interventionImage) { // Use Intervention to manipulate the image $img = clone $interventionImage; diff --git a/app/Jobs/Tasks/UrlCrawlerTask.php b/app/Jobs/Tasks/UrlCrawlerTask.php index 4ba641e..97c3cbb 100644 --- a/app/Jobs/Tasks/UrlCrawlerTask.php +++ b/app/Jobs/Tasks/UrlCrawlerTask.php @@ -32,9 +32,6 @@ public static function handle(string $url, $directory, $postfix = null, $strip_h $costs = []; - $main_intervention_image = null; - $intervention_images = []; - $unblocker_proxy_server = get_smartproxy_unblocker_server(); $rotating_proxy_server = get_smartproxy_rotating_server(); @@ -63,14 +60,13 @@ public static function handle(string $url, $directory, $postfix = null, $strip_h ]) ->get($cached_url); - if ($response->successful()) { - //$costs['html_proxy'] = calculate_smartproxy_cost() + if ($response->successful()) + { $raw_html = $response->body(); - // ... your logic here ... + $costs['unblocker'] = calculate_smartproxy_cost(round(strlen($raw_html) / 1024, 2), 'unblocker'); } else { $raw_html = null; $status_code = -3; - //throw new Exception('Http response failed'); $response->throw(); } @@ -114,29 +110,17 @@ public static function handle(string $url, $directory, $postfix = null, $strip_h if (! is_null($raw_html)) { - $raw_html = self::minifyAndCleanHtml($raw_html); + //$raw_html = self::minifyAndCleanHtml($raw_html); $jsonld = self::getJsonLd($raw_html); - if ($parse_images) { - $images = self::getImages($raw_html); - $images = self::filterImages($images, $rotating_proxy_server, $user_agent, $costs, $intervention_images); - } else { - $images = []; - } - - $main_image = self::getProductImage($jsonld, $rotating_proxy_server, $user_agent, $costs, $main_intervention_image); - return (object) [ - 'intervention' => (object) compact('main_intervention_image', 'intervention_images'), 'response' => (object) [ 'url' => $url, 'postfix' => $postfix, 'filename' => $disk_url, 'raw_html' => $raw_html, 'jsonld' => $jsonld, - 'main_image' => $main_image, - 'images' => $images, 'status_code' => $status_code, 'costs' => $costs, 'total_cost' => array_sum(array_values($costs)), @@ -151,8 +135,6 @@ public static function handle(string $url, $directory, $postfix = null, $strip_h 'filename' => null, 'raw_html' => null, 'jsonld' => [], - 'main_image' => null, - 'images' => [], 'status_code' => $status_code, 'costs' => $costs, 'total_cost' => 0, @@ -184,252 +166,9 @@ private static function getJsonLd(string $raw_html) return $contents; } - private static function getImages(string $raw_html) - { - $crawler = new Crawler($raw_html); - $images = []; - - $crawler->filter('img')->each(function ($node) use (&$images) { - $src = $node->attr('src'); - $alt = $node->attr('alt') ?? null; // Setting a default value if alt is not present - - $blacklist_domain = []; - - foreach ($blacklist_domain as $blacklist) { - if (! str_contains($src, $blacklist)) { - $images[] = [ - 'src' => $src, - 'alt' => $alt, - ]; - } - } - }); - - //dd($images); - - return $images; - } - - private static function filterImages(array $images, string $proxy, string $user_agent, &$costs, &$intervention_images) - { - $filteredImages = []; - $uniqueAttributes = []; // This array will track unique width, height, mime, and size combinations - - $count = 0; - - foreach ($images as $image) { - $count++; - - $src = $image['src']; - - try { - - $response = Http::withOptions(['proxy' => $proxy, 'verify' => false])->withHeaders(['User-Agent' => $user_agent])->get($src); - - // Check if the request was successful - if (! $response->successful()) { - continue; - } - - $imageData = $response->body(); - - // Create an Intervention Image instance from the response data - $interventionImage = Image::make($imageData); - - $width = $interventionImage->width(); - $height = $interventionImage->height(); - $mime = $interventionImage->mime(); - - // Image size in KB - $sizeKb = round(strlen($imageData) / 1024, 2); - - // Check constraints - if ($width < 300 || $height < 300) { - continue; - } - - $interventionImage->resize(800, null, function ($constraint) { - $constraint->aspectRatio(); - }); - $width = $interventionImage->width(); - $height = $interventionImage->height(); - $mime = $interventionImage->mime(); - - $image['width'] = $width; - $image['height'] = $height; - $image['mime'] = $mime; - $image['sizeKb'] = $sizeKb; - - // Check for duplicates by searching through uniqueAttributes - $isDuplicate = false; - foreach ($uniqueAttributes as $attr) { - if ( - $attr['width'] == $width && - $attr['height'] == $height && - $attr['mime'] == $mime && - abs($attr['sizeKb'] - $sizeKb) <= 30 // Check for size within a +/- 10kB tolerance - ) { - $isDuplicate = true; - break; - } - } - - if (! $isDuplicate) { - $uniqueAttributes[] = [ - 'width' => $width, - 'height' => $height, - 'mime' => $mime, - 'sizeKb' => $sizeKb, - ]; - $image['color_counts'] = self::isMostlyTextBasedOnUniqueColors($interventionImage); - //$image['img'] = $interventionImage; - $costs['count-'.$count] = calculate_smartproxy_cost($sizeKb, 'rotating_global'); - - $filteredImages[] = $image; - - $intervention_images[] = (object) [ - 'image' => $interventionImage, - 'original_name' => pathinfo($src, PATHINFO_BASENAME), - ]; - } - } catch (\Exception $e) { - // Handle exceptions related to the HTTP request - continue; - } - } - - // Collect all the color counts - $colorCounts = []; - foreach ($filteredImages as $image) { - $colorCounts[] = $image['color_counts']; - } - - if (! empty($colorCounts)) { - // Compute the median of the color counts - sort($colorCounts); - $count = count($colorCounts); - $middleIndex = floor($count / 2); - $median = $count % 2 === 0 ? ($colorCounts[$middleIndex - 1] + $colorCounts[$middleIndex]) / 2 : $colorCounts[$middleIndex]; - - // Use the median to filter out the low outliers - $threshold = 0.10 * $median; // Adjust this percentage as needed - $filteredImages = array_filter($filteredImages, function ($image) use ($threshold) { - return $image['color_counts'] > $threshold; - }); - } else { - // No images found - $filteredImages = []; // Clear the array or take any other appropriate action - } - - usort($filteredImages, function ($a, $b) { - return $b['sizeKb'] <=> $a['sizeKb']; // Using the spaceship operator to sort in descending order - }); - - return $filteredImages; - } - - // private static function isImageMostlyText($imageData, $mime) { - // try { - // $text = (new TesseractOCR)->imageData($imageData, $mime)->run(); - // $textLength = strlen($text); - - // // This is a basic check. Adjust the threshold as needed. - // return $textLength > 50; - // } catch (\Exception $e) { - // // Handle any exceptions related to Tesseract OCR - // return false; - // } - // } - - private static function getProductImage(array $jsonLdData, string $proxy, string $user_agent, &$costs, &$main_intervention_image) - { - foreach ($jsonLdData as $data) { - // Ensure the type is "Product" before proceeding - if (isset($data->{'@type'}) && $data->{'@type'} === 'Product') { - if (isset($data->url) && isset($data->image)) { - try { - $response = Http::withOptions(['proxy' => $proxy, 'verify' => false])->withHeaders(['User-Agent' => $user_agent])->get($data->image); - - // Check if the request was successful - if ($response->successful()) { - $imageData = $response->body(); - - // Create an Intervention Image instance from the response data - $interventionImage = Image::make($imageData); - - // Resize/upscale the image to 1920x1080 maintaining the aspect ratio and cropping if needed - $interventionImage->fit(1920, 1080, function ($constraint) { - $constraint->upsize(); - $constraint->aspectRatio(); - }); - - $sizeInKb = strlen($interventionImage->encode()) / 1024; // Convert bytes to kilobytes - - // Calculate the cost - $cost = calculate_smartproxy_cost($sizeInKb, 'rotating_global'); - - $costs['product_image'] = $cost; - - $main_intervention_image = (object) [ - 'image' => $interventionImage, - 'original_name' => pathinfo($data->image, PATHINFO_BASENAME), - ]; - - return [ - 'url' => $data->url, - //'img' => $interventionImage, - 'cost' => $cost, - ]; - } - } catch (\Exception $e) { - // Handle exceptions related to the HTTP request - return null; - } - } - } - } - - return null; - } - - private static function isMostlyTextBasedOnUniqueColors($interventionImage) - { - // Use Intervention to manipulate the image - $img = clone $interventionImage; - - // Resize to a smaller dimension for faster processing (maintaining aspect ratio) - $img->resize(200, null, function ($constraint) { - $constraint->aspectRatio(); - }); - - // Apply some blur - $img->blur(10); - - $im = imagecreatefromstring($img->encode()); - - $width = imagesx($im); - $height = imagesy($im); - - $uniqueColors = []; - - for ($x = 0; $x < $width; $x++) { - for ($y = 0; $y < $height; $y++) { - $rgb = imagecolorat($im, $x, $y); - $uniqueColors[$rgb] = true; - } - } - - imagedestroy($im); - - // Adjust the threshold based on your dataset. - // Here, I'm assuming that images with less than 100 unique colors are mostly text - // because we've reduced the image size and applied blurring. - return count($uniqueColors); - } - private static function minifyAndCleanHtml(string $raw_html) { - $raw_html = TinyMinify::html($raw_html); + $raw_html = self::minifyHTML($raw_html); $crawler = new Crawler($raw_html); @@ -456,6 +195,16 @@ private static function minifyAndCleanHtml(string $raw_html) return $crawler->html(); } + private static function minifyHTML($input) { + // Remove extra white space between HTML tags + $input = preg_replace('/>\s+<', $input); + + // Remove comments + $input = preg_replace('//', '', $input); + + return $input; + } + private static function getGoogleCachedUrl(string $url, $stripHtml = false) { $url = self::stripUrlQueryParameters($url);