From f02081e30c36acfdb362f3ce8c87e101ead0fc66 Mon Sep 17 00:00:00 2001 From: Charles T Date: Sun, 1 Oct 2023 00:35:27 +0800 Subject: [PATCH] Add (ai gen) --- .env.example | 3 + app/Exceptions/Handler.php | 39 +- .../FirstParty/OSSUploader/OSSUploader.php | 67 + app/Helpers/FirstParty/OpenAI/OpenAI.php | 83 + app/Helpers/Global/helpers.php | 1 + app/Helpers/Global/proxy_helper.php | 33 + app/Helpers/Global/string_helper.php | 21 + .../Tests/ScraperTestController.php | 31 + app/Jobs/ShopeeSellerTopProductScraperJob.php | 55 + .../Tasks/GenerateShopeeAIArticleTask.php | 181 ++ app/Jobs/Tasks/SaveShopeeSellerImagesTask.php | 355 ++++ .../ShopeeSellerTopProductScraperTask.php | 133 ++ app/Jobs/Tasks/UrlCrawlerTask.php | 448 +++++ app/Models/AiWriteup.php | 55 + app/Models/Category.php | 2 +- app/Models/CountryLocale.php | 2 +- app/Models/Post.php | 111 +- app/Models/ShopeeSellerScrape.php | 53 + app/Models/ShopeeSellerScrapedImage.php | 44 + app/Providers/RouteServiceProvider.php | 5 + close.sh | 3 + composer.json | 10 + composer.lock | 1679 ++++++++++------ config/markdown.php | 156 ++ config/platform/ai.php | 9 + config/platform/general.php | 6 + config/platform/proxy.php | 17 + config/queue.php | 8 + .../2023_09_28_183221_create_jobs_table.php | 32 + ...805_create_shopee_seller_scrapes_table.php | 35 + ...ate_shopee_seller_scraped_images_table.php | 33 + ..._09_30_060521_create_ai_writeups_table.php | 38 + ...600_updated_editor_enum_in_posts_table.php | 24 + dev.sh | 18 + package-lock.json | 1764 ++++++++++++++--- package.json | 1 + run_prod.sh => prod.sh | 0 .../build/assets/NativeImageBlock-610fd8da.js | 1 - .../assets/NativeImageBlock-610fd8da.js.gz | Bin 3226 -> 0 bytes .../build/assets/NativeImageBlock-78162560.js | 1 + .../assets/NativeImageBlock-78162560.js.gz | Bin 0 -> 3218 bytes ...tor-8fdf28c3.js => PostEditor-5f10a2ff.js} | 2 +- public/build/assets/PostEditor-5f10a2ff.js.gz | Bin 0 -> 66347 bytes public/build/assets/PostEditor-8fdf28c3.js.gz | Bin 66348 
-> 0 bytes public/build/assets/VueEditorJs-8bfa8291.js | 83 + .../build/assets/VueEditorJs-8bfa8291.js.gz | Bin 0 -> 57828 bytes public/build/assets/VueEditorJs-fc69dbb5.js | 83 - .../build/assets/VueEditorJs-fc69dbb5.js.gz | Bin 57074 -> 0 bytes public/build/assets/admin-app-62da08c5.js | 19 + public/build/assets/admin-app-62da08c5.js.gz | Bin 0 -> 108687 bytes public/build/assets/admin-app-6630652e.css | 1 + public/build/assets/admin-app-6630652e.css.gz | Bin 0 -> 66448 bytes public/build/assets/admin-app-bade20ce.css | 1 - public/build/assets/admin-app-bade20ce.css.gz | Bin 65990 -> 0 bytes public/build/assets/admin-app-c0ef582d.js | 17 - public/build/assets/admin-app-c0ef582d.js.gz | Bin 108462 -> 0 bytes .../assets/bootstrap-icons-4d4572ef.woff | Bin 0 -> 176200 bytes .../assets/bootstrap-icons-999550fa.woff | Bin 164360 -> 0 bytes .../assets/bootstrap-icons-bacd70af.woff2 | Bin 0 -> 130608 bytes .../assets/bootstrap-icons-cfe45b98.woff2 | Bin 121340 -> 0 bytes ...{bundle-dbffa4bb.js => bundle-13ffaba5.js} | 2 +- public/build/assets/bundle-13ffaba5.js.gz | Bin 0 -> 4813 bytes public/build/assets/bundle-7b5ccf90.js.gz | Bin 5412 -> 0 bytes ...{bundle-7b5ccf90.js => bundle-9b767e03.js} | 2 +- public/build/assets/bundle-9b767e03.js.gz | Bin 0 -> 5412 bytes public/build/assets/bundle-dbffa4bb.js.gz | Bin 4813 -> 0 bytes public/build/assets/front-app-0cdc6a38.js.gz | Bin 16999 -> 0 bytes public/build/assets/front-app-1a35e3f2.css | 9 - public/build/assets/front-app-1a35e3f2.css.gz | Bin 43543 -> 0 bytes ...-app-0cdc6a38.js => front-app-b716c47a.js} | 6 +- public/build/assets/front-app-b716c47a.js.gz | Bin 0 -> 17064 bytes public/build/assets/front-app-f0b54e22.css | 9 + public/build/assets/front-app-f0b54e22.css.gz | Bin 0 -> 44195 bytes public/build/manifest.json | 40 +- public/build/manifest.json.gz | Bin 569 -> 572 bytes resources/views/admin/posts/manage.blade.php | 9 +- resources/views/front/post.blade.php | 6 +- routes/tests.php | 7 + run_dev.sh | 4 - 79 
files changed, 4816 insertions(+), 1041 deletions(-) create mode 100644 app/Helpers/FirstParty/OSSUploader/OSSUploader.php create mode 100644 app/Helpers/FirstParty/OpenAI/OpenAI.php create mode 100644 app/Helpers/Global/proxy_helper.php create mode 100644 app/Http/Controllers/Tests/ScraperTestController.php create mode 100644 app/Jobs/ShopeeSellerTopProductScraperJob.php create mode 100644 app/Jobs/Tasks/GenerateShopeeAIArticleTask.php create mode 100644 app/Jobs/Tasks/SaveShopeeSellerImagesTask.php create mode 100644 app/Jobs/Tasks/ShopeeSellerTopProductScraperTask.php create mode 100644 app/Jobs/Tasks/UrlCrawlerTask.php create mode 100644 app/Models/AiWriteup.php create mode 100644 app/Models/ShopeeSellerScrape.php create mode 100644 app/Models/ShopeeSellerScrapedImage.php create mode 100644 close.sh create mode 100644 config/markdown.php create mode 100644 config/platform/ai.php create mode 100644 config/platform/proxy.php create mode 100644 database/migrations/2023_09_28_183221_create_jobs_table.php create mode 100644 database/migrations/2023_09_29_123805_create_shopee_seller_scrapes_table.php create mode 100644 database/migrations/2023_09_29_144924_create_shopee_seller_scraped_images_table.php create mode 100644 database/migrations/2023_09_30_060521_create_ai_writeups_table.php create mode 100644 database/migrations/2023_09_30_061600_updated_editor_enum_in_posts_table.php create mode 100644 dev.sh rename run_prod.sh => prod.sh (100%) delete mode 100644 public/build/assets/NativeImageBlock-610fd8da.js delete mode 100644 public/build/assets/NativeImageBlock-610fd8da.js.gz create mode 100644 public/build/assets/NativeImageBlock-78162560.js create mode 100644 public/build/assets/NativeImageBlock-78162560.js.gz rename public/build/assets/{PostEditor-8fdf28c3.js => PostEditor-5f10a2ff.js} (99%) create mode 100644 public/build/assets/PostEditor-5f10a2ff.js.gz delete mode 100644 public/build/assets/PostEditor-8fdf28c3.js.gz create mode 100644 
public/build/assets/VueEditorJs-8bfa8291.js create mode 100644 public/build/assets/VueEditorJs-8bfa8291.js.gz delete mode 100644 public/build/assets/VueEditorJs-fc69dbb5.js delete mode 100644 public/build/assets/VueEditorJs-fc69dbb5.js.gz create mode 100644 public/build/assets/admin-app-62da08c5.js create mode 100644 public/build/assets/admin-app-62da08c5.js.gz create mode 100644 public/build/assets/admin-app-6630652e.css create mode 100644 public/build/assets/admin-app-6630652e.css.gz delete mode 100644 public/build/assets/admin-app-bade20ce.css delete mode 100644 public/build/assets/admin-app-bade20ce.css.gz delete mode 100644 public/build/assets/admin-app-c0ef582d.js delete mode 100644 public/build/assets/admin-app-c0ef582d.js.gz create mode 100644 public/build/assets/bootstrap-icons-4d4572ef.woff delete mode 100644 public/build/assets/bootstrap-icons-999550fa.woff create mode 100644 public/build/assets/bootstrap-icons-bacd70af.woff2 delete mode 100644 public/build/assets/bootstrap-icons-cfe45b98.woff2 rename public/build/assets/{bundle-dbffa4bb.js => bundle-13ffaba5.js} (99%) create mode 100644 public/build/assets/bundle-13ffaba5.js.gz delete mode 100644 public/build/assets/bundle-7b5ccf90.js.gz rename public/build/assets/{bundle-7b5ccf90.js => bundle-9b767e03.js} (99%) create mode 100644 public/build/assets/bundle-9b767e03.js.gz delete mode 100644 public/build/assets/bundle-dbffa4bb.js.gz delete mode 100644 public/build/assets/front-app-0cdc6a38.js.gz delete mode 100644 public/build/assets/front-app-1a35e3f2.css delete mode 100644 public/build/assets/front-app-1a35e3f2.css.gz rename public/build/assets/{front-app-0cdc6a38.js => front-app-b716c47a.js} (55%) create mode 100644 public/build/assets/front-app-b716c47a.js.gz create mode 100644 public/build/assets/front-app-f0b54e22.css create mode 100644 public/build/assets/front-app-f0b54e22.css.gz create mode 100644 routes/tests.php delete mode 100644 run_dev.sh diff --git a/.env.example b/.env.example index 
e61d98f..3f8ad4e 100644 --- a/.env.example +++ b/.env.example @@ -72,3 +72,6 @@ DEV_DEFAULT_LOCATION=MY DEV_DEFAULT_IP=202.188.193.93 INDEXNOW_KEY=xxxxxxxx-xxxx-xxxxx-xxxx-xxxxxxxxxx + +NODE_BINARY=/Users/xxx/.nvm/versions/node/v19.3.0/bin/node +NPM_BINARY=/Users/xxx/.nvm/versions/node/v19.3.0/bin/npm diff --git a/app/Exceptions/Handler.php b/app/Exceptions/Handler.php index 56af264..a6ed5fa 100644 --- a/app/Exceptions/Handler.php +++ b/app/Exceptions/Handler.php @@ -2,7 +2,9 @@ namespace App\Exceptions; +use Illuminate\Auth\AuthenticationException; use Illuminate\Foundation\Exceptions\Handler as ExceptionHandler; +use Symfony\Component\HttpKernel\Exception\NotFoundHttpException; use Throwable; class Handler extends ExceptionHandler @@ -20,11 +22,44 @@ class Handler extends ExceptionHandler /** * Register the exception handling callbacks for the application. + * + * @return void */ - public function register(): void + public function register() { - $this->reportable(function (Throwable $e) { + $this->reportable(function (Throwable $exception) { // + + }); + + $this->renderable(function (NotFoundHttpException $e, $request) { + if ($request->is('api/*')) { + return response()->json([ + 'status' => -1, + ], 404); + } }); } + + /** + * Render an exception into an HTTP response. 
+ * + * @param \Illuminate\Http\Request $request + * @param \Exception $exception + * @return \Illuminate\Http\Response + */ + public function render($request, Throwable $exception) + { + + if ($exception instanceof NotFoundHttpException) { + + } elseif ($exception instanceof AuthenticationException) { + + } else { + inspector()->reportException($exception); + } + + //default laravel response + return parent::render($request, $exception); + } } diff --git a/app/Helpers/FirstParty/OSSUploader/OSSUploader.php b/app/Helpers/FirstParty/OSSUploader/OSSUploader.php new file mode 100644 index 0000000..d8a2183 --- /dev/null +++ b/app/Helpers/FirstParty/OSSUploader/OSSUploader.php @@ -0,0 +1,67 @@ +get($filepath); + + $decodedJson = json_decode($jsonContent, false, 512); + + return $decodedJson; + + } catch (\Exception $e) { + return null; + } + + return null; + } + + public static function readFile($storage_driver, $relative_directory, $filename) + { + $filepath = rtrim($relative_directory, '/').'/'.$filename; + + try { + return Storage::disk($storage_driver)->get($filepath); + + } catch (\Exception $e) { + return null; + } + + return null; + } + + public static function uploadJson($storage_driver, $relative_directory, $filename, $jsonData) + { + $jsonString = json_encode($jsonData, JSON_PRETTY_PRINT); + + try { + return self::uploadFile($storage_driver, $relative_directory, $filename, $jsonString); + } catch (Exception $e) { + return false; + } + + return false; + + } + + public static function uploadFile($storage_driver, $relative_directory, $filename, $file) + { + $filepath = rtrim($relative_directory, '/').'/'.$filename; + + // if(!Storage::disk($storage_driver)->exists($relative_directory)) + // { + // Storage::disk($storage_driver)->makeDirectory($relative_directory); + // } + + return Storage::disk($storage_driver)->put($filepath, $file); + } +} diff --git a/app/Helpers/FirstParty/OpenAI/OpenAI.php b/app/Helpers/FirstParty/OpenAI/OpenAI.php new file mode 100644 
index 0000000..1e681a6 --- /dev/null +++ b/app/Helpers/FirstParty/OpenAI/OpenAI.php @@ -0,0 +1,83 @@ +reportException($e); + + return null; + } + + } + + return null; + + } + + public static function chatCompletion($system_prompt, $user_prompt, $model, $max_token = 2500) + { + try { + $response = Http::timeout(800)->withToken(config('platform.ai.openai.api_key')) + ->post('https://api.openai.com/v1/chat/completions', [ + 'model' => $model, + 'max_tokens' => $max_token, + 'messages' => [ + ['role' => 'system', 'content' => $system_prompt], + ['role' => 'user', 'content' => $user_prompt], + ], + ]); + + //dd($response->body()); + + $json_response = json_decode($response->body(), false, 512, JSON_THROW_ON_ERROR); + + $reply = $json_response?->choices[0]?->message?->content; + + return $reply; + } catch (Exception $e) { + Log::error($response->body()); + inspector()->reportException($e); + throw ($e); + } + + return null; + + } +} diff --git a/app/Helpers/Global/helpers.php b/app/Helpers/Global/helpers.php index 2fb05db..6666268 100644 --- a/app/Helpers/Global/helpers.php +++ b/app/Helpers/Global/helpers.php @@ -2,3 +2,4 @@ require 'string_helper.php'; require 'geo_helper.php'; +require 'proxy_helper.php'; diff --git a/app/Helpers/Global/proxy_helper.php b/app/Helpers/Global/proxy_helper.php new file mode 100644 index 0000000..cfba7f2 --- /dev/null +++ b/app/Helpers/Global/proxy_helper.php @@ -0,0 +1,33 @@ +slug($delimiter); + } +} + if (! function_exists('is_empty')) { /** * A better function to check if a value is empty or null. Strings, arrays, and Objects are supported. 
diff --git a/app/Http/Controllers/Tests/ScraperTestController.php b/app/Http/Controllers/Tests/ScraperTestController.php new file mode 100644 index 0000000..d6dbc05 --- /dev/null +++ b/app/Http/Controllers/Tests/ScraperTestController.php @@ -0,0 +1,31 @@ +input('seller'), 'MY', $category) + ->onQueue('default') + ->onConnection('default'); + } + + public function gen(Request $request) + { + $shopee_seller_scrape = ShopeeSellerScrape::find(6); + + $task = GenerateShopeeAIArticleTask::handle($shopee_seller_scrape); + + dd($task); + } +} diff --git a/app/Jobs/ShopeeSellerTopProductScraperJob.php b/app/Jobs/ShopeeSellerTopProductScraperJob.php new file mode 100644 index 0000000..e8ee0ba --- /dev/null +++ b/app/Jobs/ShopeeSellerTopProductScraperJob.php @@ -0,0 +1,55 @@ +seller = $seller; + + $this->country_iso = $country_iso; + + $this->category = $category; + } + + /** + * Execute the job. + */ + public function handle(): void + { + $shopee_task = ShopeeSellerTopProductScraperTask::handle($this->seller, $this->country_iso, $this->category); + + //dd($shopee_task->product_task); + + if (! is_null($shopee_task)) { + SaveShopeeSellerImagesTask::handle($shopee_task); + + GenerateShopeeAIArticleTask::handle($shopee_task->shopee_seller_scrape); + } + + } +} diff --git a/app/Jobs/Tasks/GenerateShopeeAIArticleTask.php b/app/Jobs/Tasks/GenerateShopeeAIArticleTask.php new file mode 100644 index 0000000..bfb5a09 --- /dev/null +++ b/app/Jobs/Tasks/GenerateShopeeAIArticleTask.php @@ -0,0 +1,181 @@ +filename); + + $post = null; + + $shopee_seller_scrape->load('category'); + + if (! 
is_empty($serialised)) { + $shopee_task = unserialize($serialised); + $shopee_task->shopee_seller_scrape = $shopee_seller_scrape; + } + + // dd($shopee_task); + + // dd($shopee_task->product_task->response); + + $raw_html = $shopee_task->product_task->response->raw_html; + + $excerpt = self::stripHtml($raw_html); + + $photos = ShopeeSellerScrapedImage::where('shopee_seller_scrape_id', $shopee_seller_scrape->id)->where('featured', false)->orderByRaw('RAND()')->take(3)->get()->pluck('image')->toArray(); + + $ai_writeup = AiWriteup::where('source', 'shopee')->where('source_url', $shopee_task->product_task->response->url)->first(); + + if (is_null($ai_writeup)) { + $ai_output = OpenAI::writeProductArticle($excerpt, $photos); + + if (is_null($ai_output)) { + $e = new Exception('Failed to write: Missing ai_output'); + + Log::error(serialize($ai_writeup?->toArray())); + inspector()->reportException($e); + throw ($e); + } else { + // save + $ai_writeup = new AiWriteup; + $ai_writeup->source = 'shopee'; + $ai_writeup->source_url = $shopee_task->product_task->response->url; + $ai_writeup->category_id = $shopee_seller_scrape->category->id; + $ai_writeup->title = $ai_output->title; + $ai_writeup->excerpt = $ai_output->excerpt; + $ai_writeup->featured_image = ''; + $ai_writeup->body = $ai_output->body; + $ai_writeup->cost = self::getTotalServiceCost($shopee_task); + $ai_writeup->editor_format = 'markdown'; + + if ($ai_writeup->save()) { + $featured_photo = ShopeeSellerScrapedImage::where('shopee_seller_scrape_id', $shopee_seller_scrape->id)->where('featured', true)->first(); + + // new post + $post_data = [ + 'publish_date' => now(), + 'title' => $ai_writeup->title, + 'slug' => str_slug($ai_writeup->title), + 'excerpt' => $ai_writeup->excerpt, + 'cliffhanger' => $ai_writeup->cliffhanger, + 'author_id' => 1, + 'featured' => false, + 'featured_image' => $featured_photo->image, + 'editor' => 'markdown', + 'body' => $ai_writeup->body, + 'post_format' => 'standard', + 'status' => 
'publish', + ]; + + $post = Post::create($post_data); + + if (! is_null($post)) { + PostCategory::create([ + 'post_id' => $post->id, + 'category_id' => $shopee_seller_scrape->category->id, + ]); + + if (app()->environment() == 'production') { + if ($post->status == 'publish') { + + $post_url = route('home.country.post', ['country' => $post->post_category?->category?->country_locale_slug, 'post_slug' => $post->slug]); + + LaravelGoogleIndexing::create()->update($post_url); + IndexNow::submit($post_url); + } + } + + } + } + } + } else { + $e = new Exception('Failed to write: ai_writeup found'); + Log::error(serialize($ai_writeup?->toArray())); + inspector()->reportException($e); + throw ($e); + } + + return $post; + } + + private static function getTotalServiceCost($shopee_task) + { + + $cost = 0.00; + + $cost += 0.06; // chatgpt-3.5-turbo $0.03 for 1k, writing for 2k tokens + + // Shopee Seller Scraping + if (isset($shopee_task?->seller_shop_task?->response?->total_cost)) { + $cost += $shopee_task?->seller_shop_task?->response?->total_cost; + } + + // Shopee Product Scraping + if (isset($shopee_task?->product_task?->response?->total_cost)) { + $cost += $shopee_task?->product_task?->response?->total_cost; + } + + return $cost; + + } + + private static function stripHtml(string $raw_html) + { + $r_configuration = new ReadabilityConfiguration(); + $r_configuration->setWordThreshold(20); + + $readability = new Readability($r_configuration); + + // try { + // $readability->parse($raw_html); + + // $html_content = $readability->getContent(); + + // // Remove tabs + // $html_content = str_replace("\t", '', $html_content); + + // // Replace newlines with spaces + // $html_content = str_replace(["\n", "\r\n"], ' ', $html_content); + + // // Replace multiple spaces with a single space + // $html_content = preg_replace('/\s+/', ' ', $html_content); + + // // Output the cleaned text + // $html_content = trim($html_content); // Using trim to remove any leading or trailing spaces 
+ + // $html_content = strip_tags($html_content); + + // } catch (ReadabilityParseException|Exception $e) { + + $html5 = new HTML5(['preserveWhiteSpace' => true]); + + // Parse the HTML into a DOM tree. + $dom = $html5->loadHTML($raw_html); + + // Serialize the DOM tree back to a string, formatted. + $html_content = strip_tags($html5->saveHTML($dom)); + + // } + + return $html_content; + } +} diff --git a/app/Jobs/Tasks/SaveShopeeSellerImagesTask.php b/app/Jobs/Tasks/SaveShopeeSellerImagesTask.php new file mode 100644 index 0000000..6af5080 --- /dev/null +++ b/app/Jobs/Tasks/SaveShopeeSellerImagesTask.php @@ -0,0 +1,355 @@ +product_task?->intervention?->main_intervention_image)) { + $main_intervention_image = $shopee_task->product_task->intervention->main_intervention_image; + } else { + $main_image_url = self::getProductImageUrl($shopee_task->product_task->response->jsonld); + } + + // If there is other image interventions set, then set in, else get the image urls only. + if (isset($shopee_task?->product_task?->intervention?->intervention_images)) { + $intervention_images = $shopee_task->product_task->intervention->intervention_images; + } else { + $images = self::getImages($shopee_task->product_task->response->raw_html); + $images = self::filterImages($images, $proxy_server, $user_agent, $costs, $intervention_images); + } + + ///////// PART 2 + + // Check existence and upload if image intervention is set + if (! is_null($main_intervention_image)) { + $scraped_image = ShopeeSellerScrapedImage::where('original_name', $main_intervention_image->original_name)->where('shopee_seller_scrape_id', $shopee_task->shopee_seller_scrape->id)->first(); + + if (is_null($scraped_image)) { + $scraped_image = self::uploadAndSaveScrapedImage($shopee_task->shopee_seller_scrape, $main_intervention_image, true); + } + } + // if there is no main image intervention but the main image url is provided + elseif (! 
is_empty($main_image_url)) { + $scraped_image = ShopeeSellerScrapedImage::where('original_name', pathinfo($main_image_url, PATHINFO_BASENAME))->where('shopee_seller_scrape_id', $shopee_task->shopee_seller_scrape->id)->first(); + + if (is_null($scraped_image)) { + $main_image = self::getProductImage($shopee_task->product_task->response->jsonld, $proxy_server, $user_agent, $costs, $main_intervention_image); + + $scraped_image = self::uploadAndSaveScrapedImage($shopee_task->shopee_seller_scrape, $main_intervention_image, true); + } + } + + /////// PART 3 + + if (! is_null($intervention_images) && is_array($intervention_images) && count($intervention_images) > 0) { + foreach ($intervention_images as $intervention_image) { + $scraped_image = ShopeeSellerScrapedImage::where('original_name', $intervention_image->original_name)->where('shopee_seller_scrape_id', $shopee_task->shopee_seller_scrape->id)->first(); + + if (is_null($scraped_image)) { + $scraped_image = self::uploadAndSaveScrapedImage($shopee_task->shopee_seller_scrape, $intervention_image, false); + } + } + } + + //return ShopeeSellerScrapedImage::where('shopee_seller_scrape_id', $shopee_task->shopee_seller_scrape->id)->get(); + + } + + private static function uploadAndSaveScrapedImage($shopee_seller_scrape, $intervention_image, $featured = false) + { + // Generate a unique filename for the uploaded file and LQIP version + $uuid = Str::uuid()->toString(); + $fileName = time().'_'.$uuid.'.jpg'; + $lqipFileName = time().'_'.$uuid.'_lqip.jpg'; + + // Convert the file to JPEG format using Intervention Image library + $image = $intervention_image->image; + + // Get the original image width and height + $originalWidth = $image->width(); + $originalHeight = $image->height(); + + // Compress the image to reduce file size to 50% + $image->encode('jpg', 50); + + // Save the processed image to the 'r2' storage driver under the 'uploads' directory + $filePath = 'uploads/'.$fileName; + $lqipFilePath = 
'uploads/'.$lqipFileName; + Storage::disk('r2')->put($filePath, $image->stream()->detach()); + + // Save the original image to a temporary file and open it again + $tempImagePath = tempnam(sys_get_temp_dir(), 'temp_image'); + file_put_contents($tempImagePath, $intervention_image->image->encode()); + $clonedImage = Image::make($tempImagePath); + + // Create the LQIP version of the image using a small size while maintaining the aspect ratio + $lqipImage = $clonedImage->fit(10, 10, function ($constraint) { + $constraint->aspectRatio(); + }); + $lqipImage->encode('jpg', 5); + Storage::disk('r2')->put($lqipFilePath, $lqipImage->stream()->detach()); + + // Cleanup the temporary image file + unlink($tempImagePath); + + // Get the final URL of the uploaded image (non-LQIP version) + $url = Storage::disk('r2')->url($filePath); + + $scraped_image = new ShopeeSellerScrapedImage; + $scraped_image->shopee_seller_scrape_id = $shopee_seller_scrape->id; + $scraped_image->original_name = $intervention_image->original_name; + $scraped_image->image = $url; + $scraped_image->featured = $featured; + + if ($scraped_image->save()) { + return $scraped_image; + } + + return null; + } + + private static function getImages(string $raw_html) + { + $crawler = new Crawler($raw_html); + $images = []; + + $crawler->filter('img')->each(function ($node) use (&$images) { + $src = $node->attr('src'); + $alt = $node->attr('alt') ?? 
null; // Setting a default value if alt is not present + $images[] = [ + 'src' => $src, + 'alt' => $alt, + ]; + }); + + // if (count($images) > 4) + // { + // return $images; + // } + + return $images; + } + + private static function filterImages(array $images, string $proxy, string $user_agent, &$costs, &$intervention_images) + { + $filteredImages = []; + $uniqueAttributes = []; // This array will track unique width, height, mime, and size combinations + + $count = 0; + + foreach ($images as $image) { + $count++; + + $src = $image['src']; + + try { + $response = Http::withOptions(['proxy' => $proxy])->withHeaders(['User-Agent' => $user_agent])->get($src); + + // Check if the request was successful + if (! $response->successful()) { + continue; + } + + $imageData = $response->body(); + + // Create an Intervention Image instance from the response data + $interventionImage = Image::make($imageData); + + $width = $interventionImage->width(); + $height = $interventionImage->height(); + $mime = $interventionImage->mime(); + + // Image size in KB + $sizeKb = round(strlen($imageData) / 1024, 2); + + // Check constraints + if ($width < 800 || $height < 800 || $sizeKb < 100 || $mime !== 'image/jpeg') { + continue; + } + $image['width'] = $width; + $image['height'] = $height; + $image['mime'] = $mime; + $image['sizeKb'] = $sizeKb; + + // Check for duplicates by searching through uniqueAttributes + $isDuplicate = false; + foreach ($uniqueAttributes as $attr) { + if ( + $attr['width'] == $width && + $attr['height'] == $height && + $attr['mime'] == $mime && + abs($attr['sizeKb'] - $sizeKb) <= 30 // Check for size within a +/- 10kB tolerance + ) { + $isDuplicate = true; + break; + } + } + + if (! 
$isDuplicate) { + $uniqueAttributes[] = [ + 'width' => $width, + 'height' => $height, + 'mime' => $mime, + 'sizeKb' => $sizeKb, + ]; + $image['color_counts'] = self::isMostlyTextBasedOnUniqueColors($interventionImage); + //$image['img'] = $interventionImage; + $costs['count-'.$count] = calculate_smartproxy_cost($sizeKb); + + $filteredImages[] = $image; + + $intervention_images[] = (object) [ + 'image' => $interventionImage, + 'original_name' => pathinfo($src, PATHINFO_BASENAME), + ]; + } + } catch (\Exception $e) { + // Handle exceptions related to the HTTP request + continue; + } + } + + // Collect all the color counts + $colorCounts = []; + foreach ($filteredImages as $image) { + $colorCounts[] = $image['color_counts']; + } + + // Compute the median of the color counts + sort($colorCounts); + $count = count($colorCounts); + $middleIndex = floor($count / 2); + $median = $count % 2 === 0 ? ($colorCounts[$middleIndex - 1] + $colorCounts[$middleIndex]) / 2 : $colorCounts[$middleIndex]; + + // Use the median to filter out the low outliers + $threshold = 0.10 * $median; // Adjust this percentage as needed + $filteredImages = array_filter($filteredImages, function ($image) use ($threshold) { + return $image['color_counts'] > $threshold; + }); + + usort($filteredImages, function ($a, $b) { + return $b['sizeKb'] <=> $a['sizeKb']; // Using the spaceship operator to sort in descending order + }); + + return $filteredImages; + } + + private static function getProductImageUrl(array $jsonLdData) + { + foreach ($jsonLdData as $data) { + // Ensure the type is "Product" before proceeding + if (isset($data->{'@type'}) && $data->{'@type'} === 'Product') { + if (isset($data->url)) { + return $data->url; + } + } + } + } + + private static function getProductImage(array $jsonLdData, string $proxy, string $user_agent, &$costs, &$main_intervention_image) + { + foreach ($jsonLdData as $data) { + // Ensure the type is "Product" before proceeding + if (isset($data->{'@type'}) && 
$data->{'@type'} === 'Product') { + if (isset($data->url) && isset($data->image)) { + try { + $response = Http::withOptions(['proxy' => $proxy])->withHeaders(['User-Agent' => $user_agent])->get($data->image); + + // Check if the request was successful + if ($response->successful()) { + $imageData = $response->body(); + + // Create an Intervention Image instance from the response data + $interventionImage = Image::make($imageData); + + // Resize/upscale the image to 1920x1080 maintaining the aspect ratio and cropping if needed + $interventionImage->fit(1920, 1080, function ($constraint) { + $constraint->upsize(); + $constraint->aspectRatio(); + }); + + $sizeInKb = strlen($interventionImage->encode()) / 1024; // Convert bytes to kilobytes + + // Calculate the cost + $cost = calculate_smartproxy_cost($sizeInKb); + + $costs['product_image'] = $cost; + + $main_intervention_image = (object) [ + 'image' => $interventionImage, + 'original_name' => pathinfo($data->image, PATHINFO_BASENAME), + ]; + + return [ + 'url' => $data->url, + //'img' => $interventionImage, + 'cost' => $cost, + ]; + } + } catch (\Exception $e) { + // Handle exceptions related to the HTTP request + return null; + } + } + } + } + + return null; + } + + private static function isMostlyTextBasedOnUniqueColors($interventionImage) + { + // Use Intervention to manipulate the image + $img = clone $interventionImage; + + // Resize to a smaller dimension for faster processing (maintaining aspect ratio) + $img->resize(200, null, function ($constraint) { + $constraint->aspectRatio(); + }); + + // Apply some blur + $img->blur(10); + + $im = imagecreatefromstring($img->encode()); + + $width = imagesx($im); + $height = imagesy($im); + + $uniqueColors = []; + + for ($x = 0; $x < $width; $x++) { + for ($y = 0; $y < $height; $y++) { + $rgb = imagecolorat($im, $x, $y); + $uniqueColors[$rgb] = true; + } + } + + imagedestroy($im); + + // Adjust the threshold based on your dataset. 
+ // Here, I'm assuming that images with less than 100 unique colors are mostly text + // because we've reduced the image size and applied blurring. + return count($uniqueColors); + } +} diff --git a/app/Jobs/Tasks/ShopeeSellerTopProductScraperTask.php b/app/Jobs/Tasks/ShopeeSellerTopProductScraperTask.php new file mode 100644 index 0000000..eb5f6c9 --- /dev/null +++ b/app/Jobs/Tasks/ShopeeSellerTopProductScraperTask.php @@ -0,0 +1,133 @@ +where('country_iso', $country_iso)->first(); + + if (! is_null($shopee_seller_scrape)) { + $serialised = OSSUploader::readFile('r2', 'shopee/seller', $shopee_seller_scrape->filename); + + if (! is_empty($serialised)) { + $obj = unserialize($serialised); + $obj->shopee_seller_scrape = $shopee_seller_scrape; + + return $obj; + } + } + + $epoch = epoch_now_timestamp(); + + $seller_shop_url = "https://shopee.com.my/{$seller}?page=0&sortBy=sales"; + + $seller_shop_task = UrlCrawlerTask::handle($seller_shop_url, 'shopee/seller', $epoch, true, false); + + //dd($seller_shop_task); + + if (isset($seller_shop_task->response->jsonld)) { + $top_rank_products = self::getSortedData($seller_shop_task->response->jsonld, 100); + + if (count($top_rank_products) > 0) { + + $product_found = null; + + foreach ($top_rank_products as $product) { + $product_task = UrlCrawlerTask::handle($product->url, 'shopee/seller', $epoch, true, true); + + if ($product_task->response->status_code >= 0) { + $product_found = $product_task->response; + break; + } + } + + $scraped = (object) [ + 'seller_shop_task' => (object) [ + 'response' => $seller_shop_task->response, + ], + 'product_task' => (object) [ + 'response' => $product_task->response, + ], + ]; + + $serialised = serialize($scraped); + + $filename = $seller.'-'.$epoch.'-'.$country_iso.'.txt'; + + OSSUploader::uploadFile('r2', 'shopee/seller', $filename, $serialised); + + $shopee_seller_scrape = new ShopeeSellerScrape; + $shopee_seller_scrape->seller = $seller; + $shopee_seller_scrape->country_iso = 
/**
 * Pick the "Product" JSON-LD entries priced above a threshold and rank them by rating.
 *
 * An entry qualifies when its `@type` is exactly "Product" and either
 * `offers.lowPrice` or `offers.price` is strictly greater than $minValue.
 * Qualifying entries are ordered by `aggregateRating.ratingCount` (descending),
 * ties broken by `aggregateRating.ratingValue` (descending), and are then
 * flattened into plain summary objects.
 *
 * @param  array  $data  Decoded JSON-LD entries; anything that is not an object is ignored.
 * @param  int|float  $minValue  Exclusive lower bound for the offer price.
 * @return array  Zero-indexed list of objects with the properties name, description, url, image,
 *                lowPrice, highPrice, price, priceCurrency, ratingCount and ratingValue.
 */
private static function getSortedData($data, $minValue): array
{
    $products = array_filter($data, static function ($item) use ($minValue) {
        // Entries come straight from json_decode(), so they may be null, scalars or
        // arrays, and objects do not necessarily carry an `@type`.
        if (! is_object($item) || ($item->{'@type'} ?? null) !== 'Product') {
            return false;
        }

        $lowPrice = floatval($item->offers?->lowPrice ?? 0);
        $price = floatval($item->offers?->price ?? 0);

        // The product check is a hard precondition for both price tests. Written as
        // `$isProduct && $a || $b`, `&&` binds tighter than `||`, which would let any
        // non-product entry with a high `price` slip through.
        return $lowPrice > $minValue || $price > $minValue;
    });

    // usort() re-indexes, so the result is a proper list even though array_filter() keeps keys.
    usort($products, static function ($a, $b) {
        $ratingCountA = intval($a->aggregateRating?->ratingCount ?? 0);
        $ratingCountB = intval($b->aggregateRating?->ratingCount ?? 0);

        $ratingValueA = floatval($a->aggregateRating?->ratingValue ?? 0);
        $ratingValueB = floatval($b->aggregateRating?->ratingValue ?? 0);

        // Most-rated first; the rating value only decides between equal counts.
        return ($ratingCountB <=> $ratingCountA) ?: ($ratingValueB <=> $ratingValueA);
    });

    return array_map(static function ($item) {
        return (object) [
            'name' => $item->name ?? null,
            'description' => $item->description ?? null,
            'url' => $item->url ?? null,
            'image' => $item->image ?? null,
            'lowPrice' => floatval($item->offers?->lowPrice ?? 0),
            'highPrice' => floatval($item->offers?->highPrice ?? 0),
            'price' => floatval($item->offers?->price ?? 0),
            'priceCurrency' => $item->offers?->priceCurrency ?? null,
            'ratingCount' => intval($item->aggregateRating?->ratingCount ?? 0),
            'ratingValue' => floatval($item->aggregateRating?->ratingValue ?? 0),
        ];
    }, $products);
}
/**
 * Extract and decode every JSON-LD script block (type "application/ld+json") found in the HTML.
 *
 * @param  string  $raw_html  HTML document to scan.
 * @return array  Decoded payloads in document order, using json_decode() defaults (JSON objects
 *                become stdClass). Blocks that are not valid JSON, or that decode to null, are skipped.
 */
private static function getJsonLd(string $raw_html): array
{
    $crawler = new Crawler($raw_html);

    try {
        $scripts = $crawler->filter('script[type="application/ld+json"]')->each(function (Crawler $node) {
            return $node->text();
        });
    } catch (Exception $e) {
        // If the selector query itself fails, treat the page as having no JSON-LD.
        return [];
    }

    $contents = [];

    foreach ($scripts as $script) {
        try {
            // Without JSON_THROW_ON_ERROR json_decode() never throws; it silently returns
            // null, which used to end up in the result as a bogus entry.
            $decoded = json_decode($script, false, 512, JSON_THROW_ON_ERROR);
        } catch (\JsonException $e) {
            continue;
        }

        if ($decoded !== null) {
            $contents[] = $decoded;
        }
    }

    return $contents;
}

/**
 * Collect the `src` and `alt` attributes of every img element in the HTML.
 *
 * @param  string  $raw_html  HTML document to scan.
 * @return array  List of ['src' => string, 'alt' => string|null] in document order.
 *                Elements without a non-empty src are left out because they cannot be downloaded.
 */
private static function getImages(string $raw_html): array
{
    $crawler = new Crawler($raw_html);
    $images = [];

    $crawler->filter('img')->each(function ($node) use (&$images) {
        $src = $node->attr('src');

        if (! is_string($src) || trim($src) === '') {
            return;
        }

        $images[] = [
            'src' => $src,
            'alt' => $node->attr('alt'), // stays null when the attribute is absent
        ];
    });

    return $images;
}

/**
 * Download the candidate images and keep the large, distinct, colour-rich JPEGs.
 *
 * Each image is fetched through the proxy and kept only if it is a JPEG of at least
 * 800x800 px and 100 kB, and no previously kept image has the same dimensions and
 * MIME type with a size within 30 kB. The survivors are then compared by their
 * unique-colour count: anything at or below 10% of the median count is dropped as a
 * low outlier. The result is sorted by download size, largest first.
 *
 * Side effects through the reference parameters:
 *  - $costs gets one 'count-N' entry per image that passed the size/duplicate checks,
 *    where N is the 1-based position of the image in $images. The value comes from
 *    calculate_smartproxy_cost() (presumably a bandwidth cost; verify against the helper).
 *  - $intervention_images gets one {image, original_name} object per such image. Note that
 *    this happens before the median filter, so it may contain images missing from the
 *    returned list.
 *
 * @param  array  $images  Entries shaped like the ones produced by getImages().
 * @param  string  $proxy  Proxy passed to the HTTP client's `proxy` option.
 * @param  string  $user_agent  Value sent as the User-Agent header.
 * @param  array  $costs  Cost map, extended in place.
 * @param  array|null  $intervention_images  Image collection, extended in place.
 * @return array  Zero-indexed list of the surviving entries, each extended with width, height,
 *                mime, sizeKb and color_counts.
 */
private static function filterImages(array $images, string $proxy, string $user_agent, &$costs, &$intervention_images): array
{
    $candidates = [];
    $seenAttributes = []; // width/height/mime/size of every candidate accepted so far
    $position = 0;

    foreach ($images as $image) {
        $position++;

        $src = $image['src'] ?? null;

        // A missing src would otherwise reach the HTTP client as a non-string URL.
        if (! is_string($src) || $src === '') {
            continue;
        }

        try {
            $response = Http::withOptions(['proxy' => $proxy])->withHeaders(['User-Agent' => $user_agent])->get($src);

            if (! $response->successful()) {
                continue;
            }

            $imageData = $response->body();
            $interventionImage = Image::make($imageData);

            $width = $interventionImage->width();
            $height = $interventionImage->height();
            $mime = $interventionImage->mime();
            $sizeKb = round(strlen($imageData) / 1024, 2);

            if ($width < 800 || $height < 800 || $sizeKb < 100 || $mime !== 'image/jpeg') {
                continue;
            }

            // Same dimensions and type with a near-identical size is treated as the same picture.
            $isDuplicate = false;
            foreach ($seenAttributes as $attr) {
                if (
                    $attr['width'] == $width &&
                    $attr['height'] == $height &&
                    $attr['mime'] == $mime &&
                    abs($attr['sizeKb'] - $sizeKb) <= 30
                ) {
                    $isDuplicate = true;
                    break;
                }
            }

            if ($isDuplicate) {
                continue;
            }

            $seenAttributes[] = [
                'width' => $width,
                'height' => $height,
                'mime' => $mime,
                'sizeKb' => $sizeKb,
            ];

            $image['width'] = $width;
            $image['height'] = $height;
            $image['mime'] = $mime;
            $image['sizeKb'] = $sizeKb;
            $image['color_counts'] = self::isMostlyTextBasedOnUniqueColors($interventionImage);

            $costs['count-'.$position] = calculate_smartproxy_cost($sizeKb);

            $candidates[] = $image;

            $intervention_images[] = (object) [
                'image' => $interventionImage,
                'original_name' => pathinfo($src, PATHINFO_BASENAME),
            ];
        } catch (\Exception $e) {
            // A failed download or an undecodable image only disqualifies this one entry.
            continue;
        }
    }

    // Nothing qualified: there is no median to compute (the lookup below would read
    // undefined offsets on an empty list).
    if ($candidates === []) {
        return [];
    }

    $colorCounts = array_column($candidates, 'color_counts');
    sort($colorCounts);

    $total = count($colorCounts);
    $middle = intdiv($total, 2);
    $median = $total % 2 === 0
        ? ($colorCounts[$middle - 1] + $colorCounts[$middle]) / 2
        : $colorCounts[$middle];

    // Images with very few distinct colours relative to the rest are dropped as low outliers.
    $threshold = 0.10 * $median;
    $kept = array_filter($candidates, static function ($image) use ($threshold) {
        return $image['color_counts'] > $threshold;
    });

    // Largest download first; usort() also re-indexes after array_filter().
    usort($kept, static function ($a, $b) {
        return $b['sizeKb'] <=> $a['sizeKb'];
    });

    return $kept;
}
/**
 * Download the image of the first usable "Product" JSON-LD entry and prepare it as the main image.
 *
 * An entry is considered when its `@type` is "Product" and it has both `url` and `image`.
 * The image is fetched through the proxy, then fitted to 1920x1080 with the `upsize` and
 * `aspectRatio` constraints applied. Entries whose download fails (non-2xx response or an
 * exception) are skipped and the next entry is tried.
 *
 * Side effects through the reference parameters:
 *  - $costs['product_image'] is set to the cost returned by calculate_smartproxy_cost().
 *  - $main_intervention_image is set to an {image, original_name} object.
 *
 * NOTE(review): the cost is computed from the size of the re-encoded, fitted image rather than
 * from the downloaded bytes, unlike the per-image costs in filterImages(). Confirm which one
 * calculate_smartproxy_cost() is meant to receive.
 *
 * @param  array  $jsonLdData  Decoded JSON-LD entries (non-object entries are ignored).
 * @param  string  $proxy  Proxy passed to the HTTP client's `proxy` option.
 * @param  string  $user_agent  Value sent as the User-Agent header.
 * @param  array  $costs  Cost map, extended in place.
 * @param  object|null  $main_intervention_image  Receives the prepared image on success.
 * @return array|null  ['url' => product URL, 'cost' => cost] on success, null when no entry yields an image.
 */
private static function getProductImage(array $jsonLdData, string $proxy, string $user_agent, &$costs, &$main_intervention_image): ?array
{
    foreach ($jsonLdData as $data) {
        if (! isset($data->{'@type'}, $data->url, $data->image) || $data->{'@type'} !== 'Product') {
            continue;
        }

        // JSON-LD permits a list of images; use the first one. Anything that still is not a
        // plain URL string (e.g. a nested object) cannot be handed to the HTTP client.
        $imageUrl = is_array($data->image) ? ($data->image[0] ?? null) : $data->image;

        if (! is_string($imageUrl) || $imageUrl === '') {
            continue;
        }

        try {
            $response = Http::withOptions(['proxy' => $proxy])->withHeaders(['User-Agent' => $user_agent])->get($imageUrl);

            if (! $response->successful()) {
                continue;
            }

            $interventionImage = Image::make($response->body());

            // Crop/resize to a 1920x1080 frame; upsize() is the constraint that keeps
            // smaller images from being enlarged.
            $interventionImage->fit(1920, 1080, function ($constraint) {
                $constraint->upsize();
                $constraint->aspectRatio();
            });

            $sizeInKb = strlen((string) $interventionImage->encode()) / 1024; // bytes to kilobytes

            $cost = calculate_smartproxy_cost($sizeInKb);

            $costs['product_image'] = $cost;

            $main_intervention_image = (object) [
                'image' => $interventionImage,
                'original_name' => pathinfo($imageUrl, PATHINFO_BASENAME),
            ];

            return [
                'url' => $data->url,
                'cost' => $cost,
            ];
        } catch (\Exception $e) {
            // Same outcome as a non-2xx response: give up on this entry, try the next one.
            continue;
        }
    }

    return null;
}

/**
 * Count the distinct colours of a shrunken, blurred copy of the image.
 *
 * Despite its name this method does not return a verdict: it returns the raw count and leaves
 * the interpretation to the caller. The passed image is not modified; all work is done on a clone
 * resized to 200 px wide (aspect ratio kept) and blurred.
 *
 * @param  mixed  $interventionImage  Intervention Image instance (anything offering clone,
 *                                    resize(), blur() and encode() as used below).
 * @return int  Number of distinct values returned by imagecolorat() over all pixels, or 0 when the
 *              encoded copy cannot be decoded by GD.
 */
private static function isMostlyTextBasedOnUniqueColors($interventionImage): int
{
    $img = clone $interventionImage;

    // A small image keeps the per-pixel scan below cheap.
    $img->resize(200, null, function ($constraint) {
        $constraint->aspectRatio();
    });

    $img->blur(10);

    $gd = imagecreatefromstring((string) $img->encode());

    if ($gd === false) {
        return 0;
    }

    $width = imagesx($gd);
    $height = imagesy($gd);

    $uniqueColors = [];

    for ($x = 0; $x < $width; $x++) {
        for ($y = 0; $y < $height; $y++) {
            // Array keys give O(1) de-duplication of the colour values.
            $uniqueColors[imagecolorat($gd, $x, $y)] = true;
        }
    }

    // No imagedestroy(): on PHP 8 (required by the nullsafe operator used in this code base)
    // GD images are objects that are released automatically.
    return count($uniqueColors);
}