From 5b4a02778eb2e0b1846caa2ce59b45ff1605345c Mon Sep 17 00:00:00 2001 From: Charles T Date: Sun, 1 Oct 2023 04:17:49 +0800 Subject: [PATCH] Add (ai gen) --- app/Helpers/FirstParty/OpenAI/OpenAI.php | 21 +++- app/Helpers/Global/proxy_helper.php | 34 ++++++ .../Tasks/GenerateShopeeAIArticleTask.php | 64 ++++++---- app/Jobs/Tasks/SaveShopeeSellerImagesTask.php | 32 ++--- .../ShopeeSellerTopProductScraperTask.php | 2 +- app/Jobs/Tasks/UrlCrawlerTask.php | 112 +++++++++++------- config/platform/proxy.php | 10 +- 7 files changed, 191 insertions(+), 84 deletions(-) diff --git a/app/Helpers/FirstParty/OpenAI/OpenAI.php b/app/Helpers/FirstParty/OpenAI/OpenAI.php index 1e681a6..672f241 100644 --- a/app/Helpers/FirstParty/OpenAI/OpenAI.php +++ b/app/Helpers/FirstParty/OpenAI/OpenAI.php @@ -10,9 +10,11 @@ class OpenAI { public static function writeProductArticle($excerpt, $photos) { + $excerpt = substr($excerpt, 0, 900); + $system_prompt = ' You are tasked with writing a comprehensive product introduction article using the provided excerpt. The emphasis should be on the performance, features, and notable aspects of the product. The review should avoid the use of personal pronouns and must not delve into marketplace-related information. Return the output in the following json format:\n\n - {"title": "(Article Title)","excerpt": "(One sentence summary, 150-160 characters of an article, do not use start sentence with verb.)","cliffhanger": "(One sentence 70-80 characters of article, cliff-hanging sentence to attract readers)","body": "(Markdown format, 500-700 word count)"}\n\n + {"title": "(Article Title)","excerpt": "(One sentence summary, 150-160 characters of an article, do not use start sentence with verb.)","cliffhanger": "(One sentence 70-80 characters of article, cliff-hanging sentence to attract readers)","body": "(Markdown format, 700-900 word count)"}\n\n Mandatory Requirements:\n - Write in US grade 8-9 English\n - Use the following sections whenever applicable:\n @@ -24,16 +26,23 @@ public static function writeProductArticle($excerpt, $photos) - do not make up facts, use facts provided by excerpt only\n - No article titles inside markdown\n - All article sections use ### - - Add at least 3 markdown images with article title as caption in every section except for Introduction + '; - $user_prompt = "Excerpt: {$excerpt}\nPhotos:\n"; + $user_prompt = "EXCERPT\n------------\n{$excerpt}\n"; - foreach ($photos as $photo) { - $user_prompt .= "{$photo}\n"; + if (count($photos) > 0) { + $system_prompt .= '- Add at least 3 markdown images with article title as caption in every section except for Introduction'; + $user_prompt .= "\n\nPHOTOS\n------------\n"; + foreach ($photos as $photo) { + $user_prompt .= "{$photo}\n"; + } } - $output = (self::chatCompletion($system_prompt, $user_prompt, 'gpt-3.5-turbo', 2000)); + $output = (self::chatCompletion($system_prompt, $user_prompt, 'gpt-3.5-turbo', 2500)); + + // dump($user_prompt); + // dd($output); if (! is_null($output)) { try { diff --git a/app/Helpers/Global/proxy_helper.php b/app/Helpers/Global/proxy_helper.php index cfba7f2..23eeaff 100644 --- a/app/Helpers/Global/proxy_helper.php +++ b/app/Helpers/Global/proxy_helper.php @@ -1,5 +1,39 @@ setWordThreshold(20); - $readability = new Readability($r_configuration); + $html_content = ''; - // try { - // $readability->parse($raw_html); + try { - // $html_content = $readability->getContent(); + $r_configuration = new ReadabilityConfiguration(); + $r_configuration->setWordThreshold(20); - // // Remove tabs - // $html_content = str_replace("\t", '', $html_content); + $readability = new Readability($r_configuration); - // // Replace newlines with spaces - // $html_content = str_replace(["\n", "\r\n"], ' ', $html_content); + $readability->parse($raw_html); - // // Replace multiple spaces with a single space - // $html_content = preg_replace('/\s+/', ' ', $html_content); + $temp_html_content = $readability->getContent(); - // // Output the cleaned text - // $html_content = trim($html_content); // Using trim to remove any leading or trailing spaces + // Remove tabs + $temp_html_content = str_replace("\t", '', $temp_html_content); - // $html_content = strip_tags($html_content); + // Replace newlines with spaces + $temp_html_content = str_replace(["\n", "\r\n"], ' ', $temp_html_content); - // } catch (ReadabilityParseException|Exception $e) { + // Replace multiple spaces with a single space + $temp_html_content = preg_replace('/\s+/', ' ', $temp_html_content); - $html5 = new HTML5(['preserveWhiteSpace' => true]); + // Output the cleaned text + $temp_html_content = trim($temp_html_content); // Using trim to remove any leading or trailing spaces - // Parse the HTML into a DOM tree. - $dom = $html5->loadHTML($raw_html); + $temp_html_content = strip_tags($temp_html_content); - // Serialize the DOM tree back to a string, formatted. - $html_content = strip_tags($html5->saveHTML($dom)); + $crawler = new Crawler($raw_html); - // } + // Extract meta title + $title = $crawler->filter('title')->text(); // This assumes tags are used for titles. + + // Extract meta description + $metaDescriptionNode = $crawler->filter('meta[name="description"]'); + $description = $metaDescriptionNode->count() > 0 ? $metaDescriptionNode->attr('content') : null; + + $html_content .= $title.' '; + $html_content .= $description.' '; + $html_content .= $temp_html_content; + + } catch (ReadabilityParseException|Exception $e) { + + $html5 = new HTML5(['preserveWhiteSpace' => true]); + + // Parse the HTML into a DOM tree. + $dom = $html5->loadHTML($raw_html); + + // Serialize the DOM tree back to a string, formatted. + $html_content = strip_tags($html5->saveHTML($dom)); + + } return $html_content; } diff --git a/app/Jobs/Tasks/SaveShopeeSellerImagesTask.php b/app/Jobs/Tasks/SaveShopeeSellerImagesTask.php index 6af5080..9b997de 100644 --- a/app/Jobs/Tasks/SaveShopeeSellerImagesTask.php +++ b/app/Jobs/Tasks/SaveShopeeSellerImagesTask.php @@ -19,7 +19,9 @@ public static function handle($shopee_task) $main_image_url = null; - $proxy_server = get_smartproxy_server(); + $unblocker_proxy_server = get_smartproxy_unblocker_server(); + $rotating_proxy_server = get_smartproxy_rotating_server(); + $user_agent = config('platform.proxy.user_agent'); ///////// PART 1 @@ -36,7 +38,7 @@ public static function handle($shopee_task) $intervention_images = $shopee_task->product_task->intervention->intervention_images; } else { $images = self::getImages($shopee_task->product_task->response->raw_html); - $images = self::filterImages($images, $proxy_server, $user_agent, $costs, $intervention_images); + $images = self::filterImages($images, $rotating_proxy_server, $user_agent, $costs, $intervention_images); } ///////// PART 2 @@ -54,7 +56,7 @@ public static function handle($shopee_task) $scraped_image = ShopeeSellerScrapedImage::where('original_name', pathinfo($main_image_url, PATHINFO_BASENAME))->where('shopee_seller_scrape_id', $shopee_task->shopee_seller_scrape->id)->first(); if (is_null($scraped_image)) { - $main_image = self::getProductImage($shopee_task->product_task->response->jsonld, $proxy_server, $user_agent, $costs, $main_intervention_image); + $main_image = self::getProductImage($shopee_task->product_task->response->jsonld, $rotating_proxy_server, $user_agent, $costs, $main_intervention_image); $scraped_image = self::uploadAndSaveScrapedImage($shopee_task->shopee_seller_scrape, $main_intervention_image, true); } @@ -137,16 +139,18 @@ private static function getImages(string $raw_html) $crawler->filter('img')->each(function ($node) use (&$images) { $src = $node->attr('src'); $alt = $node->attr('alt') ?? null; // Setting a default value if alt is not present - $images[] = [ - 'src' => $src, - 'alt' => $alt, - ]; - }); - // if (count($images) > 4) - // { - // return $images; - // } + $blacklist_domain = []; + + foreach ($blacklist_domain as $blacklist) { + if (! str_contains($src, $blacklist)) { + $images[] = [ + 'src' => $src, + 'alt' => $alt, + ]; + } + } + }); return $images; } @@ -164,7 +168,7 @@ private static function filterImages(array $images, string $proxy, string $user_ $src = $image['src']; try { - $response = Http::withOptions(['proxy' => $proxy])->withHeaders(['User-Agent' => $user_agent])->get($src); + $response = Http::withOptions(['proxy' => $proxy, 'verify' => false])->withHeaders(['User-Agent' => $user_agent])->get($src); // Check if the request was successful if (! $response->successful()) { @@ -274,7 +278,7 @@ private static function getProductImage(array $jsonLdData, string $proxy, string if (isset($data->{'@type'}) && $data->{'@type'} === 'Product') { if (isset($data->url) && isset($data->image)) { try { - $response = Http::withOptions(['proxy' => $proxy])->withHeaders(['User-Agent' => $user_agent])->get($data->image); + $response = Http::withOptions(['proxy' => $proxy, 'verify' => false])->withHeaders(['User-Agent' => $user_agent])->get($data->image); // Check if the request was successful if ($response->successful()) { diff --git a/app/Jobs/Tasks/ShopeeSellerTopProductScraperTask.php b/app/Jobs/Tasks/ShopeeSellerTopProductScraperTask.php index eb5f6c9..5f2a071 100644 --- a/app/Jobs/Tasks/ShopeeSellerTopProductScraperTask.php +++ b/app/Jobs/Tasks/ShopeeSellerTopProductScraperTask.php @@ -41,7 +41,7 @@ public static function handle(string $seller, string $country_iso, Category $cat //dd($seller_shop_task); if (isset($seller_shop_task->response->jsonld)) { - $top_rank_products = self::getSortedData($seller_shop_task->response->jsonld, 100); + $top_rank_products = self::getSortedData($seller_shop_task->response->jsonld, 400); if (count($top_rank_products) > 0) { diff --git a/app/Jobs/Tasks/UrlCrawlerTask.php b/app/Jobs/Tasks/UrlCrawlerTask.php index 6201924..d9c6972 100644 --- a/app/Jobs/Tasks/UrlCrawlerTask.php +++ b/app/Jobs/Tasks/UrlCrawlerTask.php @@ -18,7 +18,7 @@ public static function handle(string $url, $directory, $postfix = null, $strip_h { $slug = str_slug($url); - $cached_url = self::getGoogleCachedUrl($url, false); + $cached_url = $url; // self::getGoogleCachedUrl($url, false); $postfix = strval($postfix); @@ -35,7 +35,8 @@ public static function handle(string $url, $directory, $postfix = null, $strip_h $main_intervention_image = null; $intervention_images = []; - $proxy_server = get_smartproxy_server(); + $unblocker_proxy_server = get_smartproxy_unblocker_server(); + $rotating_proxy_server = get_smartproxy_rotating_server(); try { $raw_html = OSSUploader::readFile($driver, $directory, $filename); @@ -51,26 +52,47 @@ public static function handle(string $url, $directory, $postfix = null, $strip_h if (is_null($raw_html)) { try { - $browsershot = new Browsershot(); - $browsershot->setUrl($cached_url) - ->setOption('args', ['headless: "new"']) - ->noSandbox() - ->setOption('args', ['--disable-web-security']) - ->userAgent($user_agent) - ->ignoreHttpsErrors() - ->preventUnsuccessfulResponse() - ->timeout(10) - //->setProxyServer($proxy_server) - ->userAgent($user_agent); + $response = Http::withHeaders([ + 'User-Agent' => $user_agent, + ]) + ->withOptions([ + 'proxy' => $unblocker_proxy_server, + 'timeout' => 1000, + 'verify' => false, + ]) + ->get($cached_url); - if (app()->environment() == 'local') { - $browsershot->setNodeBinary(config('platform.general.node_binary'))->setNpmBinary(config('platform.general.npm_binary')); + if ($response->successful()) { + $raw_html = $response->body(); + // ... your logic here ... + } else { + $raw_html = null; + $status_code = -3; + //throw new Exception('Http response failed'); + $response->throw(); } - //dump($browsershot); + // $browsershot = new Browsershot(); - $raw_html = $browsershot->bodyHtml(); + // $browsershot->setUrl($cached_url) + // ->setOption('args', ['headless: "new"']) + // ->noSandbox() + // ->setOption('args', ['--disable-web-security']) + // ->userAgent($user_agent) + // ->ignoreHttpsErrors() + // ->preventUnsuccessfulResponse() + // ->timeout(10) + // ->setProxyServer($proxy_server) + // ->userAgent($user_agent); + + // if (app()->environment() == 'local') { + // $browsershot->setNodeBinary(config('platform.general.node_binary'))->setNpmBinary(config('platform.general.npm_binary')); + // } + + // //dump($browsershot); + + // $raw_html = $browsershot->bodyHtml(); // $sizeInKb = strlen($raw_html) / 1024; // Convert bytes to kilobytes // $browsershot_cost = round(calculate_smartproxy_cost($sizeInKb)) ; @@ -97,12 +119,12 @@ public static function handle(string $url, $directory, $postfix = null, $strip_h if ($parse_images) { $images = self::getImages($raw_html); - $images = self::filterImages($images, $proxy_server, $user_agent, $costs, $intervention_images); + $images = self::filterImages($images, $rotating_proxy_server, $user_agent, $costs, $intervention_images); } else { $images = []; } - $main_image = self::getProductImage($jsonld, $proxy_server, $user_agent, $costs, $main_intervention_image); + $main_image = self::getProductImage($jsonld, $rotating_proxy_server, $user_agent, $costs, $main_intervention_image); return (object) [ 'intervention' => (object) compact('main_intervention_image', 'intervention_images'), @@ -169,16 +191,20 @@ private static function getImages(string $raw_html) $crawler->filter('img')->each(function ($node) use (&$images) { $src = $node->attr('src'); $alt = $node->attr('alt') ?? null; // Setting a default value if alt is not present - $images[] = [ - 'src' => $src, - 'alt' => $alt, - ]; + + $blacklist_domain = []; + + foreach ($blacklist_domain as $blacklist) { + if (! str_contains($src, $blacklist)) { + $images[] = [ + 'src' => $src, + 'alt' => $alt, + ]; + } + } }); - // if (count($images) > 4) - // { - // return $images; - // } + //dd($images); return $images; } @@ -196,7 +222,8 @@ private static function filterImages(array $images, string $proxy, string $user_ $src = $image['src']; try { - $response = Http::withOptions(['proxy' => $proxy])->withHeaders(['User-Agent' => $user_agent])->get($src); + + $response = Http::withOptions(['proxy' => $proxy, 'verify' => false])->withHeaders(['User-Agent' => $user_agent])->get($src); // Check if the request was successful if (! $response->successful()) { @@ -216,7 +243,7 @@ private static function filterImages(array $images, string $proxy, string $user_ $sizeKb = round(strlen($imageData) / 1024, 2); // Check constraints - if ($width < 800 || $height < 800 || $sizeKb < 100 || $mime !== 'image/jpeg') { + if ($width < 800 || $height < 800 || $sizeKb < 100) { continue; } $image['width'] = $width; @@ -268,17 +295,22 @@ private static function filterImages(array $images, string $proxy, string $user_ $colorCounts[] = $image['color_counts']; } - // Compute the median of the color counts - sort($colorCounts); - $count = count($colorCounts); - $middleIndex = floor($count / 2); - $median = $count % 2 === 0 ? ($colorCounts[$middleIndex - 1] + $colorCounts[$middleIndex]) / 2 : $colorCounts[$middleIndex]; + if (! empty($colorCounts)) { + // Compute the median of the color counts + sort($colorCounts); + $count = count($colorCounts); + $middleIndex = floor($count / 2); + $median = $count % 2 === 0 ? ($colorCounts[$middleIndex - 1] + $colorCounts[$middleIndex]) / 2 : $colorCounts[$middleIndex]; - // Use the median to filter out the low outliers - $threshold = 0.10 * $median; // Adjust this percentage as needed - $filteredImages = array_filter($filteredImages, function ($image) use ($threshold) { - return $image['color_counts'] > $threshold; - }); + // Use the median to filter out the low outliers + $threshold = 0.10 * $median; // Adjust this percentage as needed + $filteredImages = array_filter($filteredImages, function ($image) use ($threshold) { + return $image['color_counts'] > $threshold; + }); + } else { + // No images found + $filteredImages = []; // Clear the array or take any other appropriate action + } usort($filteredImages, function ($a, $b) { return $b['sizeKb'] <=> $a['sizeKb']; // Using the spaceship operator to sort in descending order @@ -307,7 +339,7 @@ private static function getProductImage(array $jsonLdData, string $proxy, string if (isset($data->{'@type'}) && $data->{'@type'} === 'Product') { if (isset($data->url) && isset($data->image)) { try { - $response = Http::withOptions(['proxy' => $proxy])->withHeaders(['User-Agent' => $user_agent])->get($data->image); + $response = Http::withOptions(['proxy' => $proxy, 'verify' => false])->withHeaders(['User-Agent' => $user_agent])->get($data->image); // Check if the request was successful if ($response->successful()) { diff --git a/config/platform/proxy.php b/config/platform/proxy.php index 41901ac..e5bf1e2 100644 --- a/config/platform/proxy.php +++ b/config/platform/proxy.php @@ -11,7 +11,15 @@ 'server' => 'gate.smartproxy.com:7000', 'reproxy' => '157.230.194.206:7000', 'reproxy_enable' => false, - 'cost_per_gb' => 7.00, + 'cost_per_gb' => 7, + ], + 'unblocker' => [ + 'user' => 'U0000123412', + 'password' => 'P$W1bda906aee53c2022d94e22ff1a1142a1', + 'server' => 'unblock.smartproxy.com:60000', + 'reproxy' => '157.230.194.206:7000', + 'reproxy_enable' => false, + 'cost_per_gb' => 20.14, ], ], ];