Add (ai gen)
This commit is contained in:
181
app/Jobs/Tasks/GenerateShopeeAIArticleTask.php
Normal file
181
app/Jobs/Tasks/GenerateShopeeAIArticleTask.php
Normal file
@@ -0,0 +1,181 @@
|
||||
<?php
|
||||
|
||||
namespace App\Jobs\Tasks;
|
||||
|
||||
use andreskrey\Readability\Configuration as ReadabilityConfiguration;
|
||||
use andreskrey\Readability\ParseException as ReadabilityParseException;
|
||||
use andreskrey\Readability\Readability;
|
||||
use App\Helpers\FirstParty\OpenAI\OpenAI;
|
||||
use App\Helpers\FirstParty\OSSUploader\OSSUploader;
|
||||
use App\Models\AiWriteup;
|
||||
use App\Models\Post;
|
||||
use App\Models\PostCategory;
|
||||
use App\Models\ShopeeSellerScrape;
|
||||
use App\Models\ShopeeSellerScrapedImage;
|
||||
use Exception;
|
||||
use Illuminate\Support\Facades\Log;
|
||||
use LaravelFreelancerNL\LaravelIndexNow\Facades\IndexNow;
|
||||
use LaravelGoogleIndexing;
|
||||
use Masterminds\HTML5;
|
||||
|
||||
class GenerateShopeeAIArticleTask
{
    /**
     * Generate an AI-written article from a stored Shopee seller scrape and publish it as a Post.
     *
     * Reads the serialised scrape payload from the 'r2' disk, reduces the product page HTML to
     * plain text, asks OpenAI::writeProductArticle() for an article, stores the result as an
     * AiWriteup and creates the matching Post and PostCategory rows. In production the new post
     * URL is also submitted to Google Indexing and IndexNow.
     *
     * @param  ShopeeSellerScrape  $shopee_seller_scrape  Scrape record whose `filename` points at the stored payload.
     * @return Post|null  The created post, or null when the AiWriteup could not be saved.
     *
     * @throws Exception  When the payload or product HTML is missing/unreadable, when a writeup
     *                    already exists for the product URL, or when the AI returned no output.
     */
    public static function handle(ShopeeSellerScrape $shopee_seller_scrape)
    {
        $serialised = OSSUploader::readFile('r2', 'shopee/seller', $shopee_seller_scrape->filename);

        // Without a payload there is nothing to write about; fail explicitly instead of
        // dereferencing an undefined variable further down.
        if (is_empty($serialised)) {
            self::fail('Failed to write: Missing scrape payload', ['filename' => $shopee_seller_scrape->filename]);
        }

        // NOTE(review): unserialize() can instantiate arbitrary classes; this is only safe while
        // the bucket contents are written exclusively by our own scraper.
        $shopee_task = unserialize($serialised);

        if (! is_object($shopee_task)) {
            self::fail('Failed to write: Unreadable scrape payload', ['filename' => $shopee_seller_scrape->filename]);
        }

        $shopee_task->shopee_seller_scrape = $shopee_seller_scrape;
        $shopee_seller_scrape->load('category');

        $product_response = $shopee_task->product_task->response;
        $source_url = $product_response->url;

        // A failed product crawl is stored with raw_html = null, which stripHtml() cannot accept.
        if (is_empty($product_response->raw_html ?? null)) {
            self::fail('Failed to write: Missing product HTML', ['source_url' => $source_url]);
        }

        $excerpt = self::stripHtml($product_response->raw_html);

        // Up to three random non-featured images are handed to the AI alongside the text.
        $photos = ShopeeSellerScrapedImage::where('shopee_seller_scrape_id', $shopee_seller_scrape->id)
            ->where('featured', false)
            ->orderByRaw('RAND()')
            ->take(3)
            ->get()
            ->pluck('image')
            ->toArray();

        $existing_writeup = AiWriteup::where('source', 'shopee')->where('source_url', $source_url)->first();

        // One writeup per product URL: an existing row is treated as an error.
        if (! is_null($existing_writeup)) {
            self::fail('Failed to write: ai_writeup found', $existing_writeup->toArray());
        }

        $ai_output = OpenAI::writeProductArticle($excerpt, $photos);

        if (is_null($ai_output)) {
            self::fail('Failed to write: Missing ai_output', ['source_url' => $source_url]);
        }

        $ai_writeup = new AiWriteup;
        $ai_writeup->source = 'shopee';
        $ai_writeup->source_url = $source_url;
        $ai_writeup->category_id = $shopee_seller_scrape->category->id;
        $ai_writeup->title = $ai_output->title;
        $ai_writeup->excerpt = $ai_output->excerpt;
        $ai_writeup->featured_image = '';
        $ai_writeup->body = $ai_output->body;
        $ai_writeup->cost = self::getTotalServiceCost($shopee_task);
        $ai_writeup->editor_format = 'markdown';

        if (! $ai_writeup->save()) {
            return null;
        }

        $featured_photo = ShopeeSellerScrapedImage::where('shopee_seller_scrape_id', $shopee_seller_scrape->id)
            ->where('featured', true)
            ->first();

        $post = Post::create([
            'publish_date' => now(),
            'title' => $ai_writeup->title,
            'slug' => str_slug($ai_writeup->title),
            'excerpt' => $ai_writeup->excerpt,
            // NOTE(review): cliffhanger is never assigned on the writeup above — confirm whether
            // the AI output is expected to provide it.
            'cliffhanger' => $ai_writeup->cliffhanger,
            'author_id' => 1,
            'featured' => false,
            // The scrape may have no featured image; fall back to an empty string (as done for
            // the writeup above) instead of reading a property on null.
            'featured_image' => $featured_photo?->image ?? '',
            'editor' => 'markdown',
            'body' => $ai_writeup->body,
            'post_format' => 'standard',
            'status' => 'publish',
        ]);

        if (is_null($post)) {
            return $post;
        }

        PostCategory::create([
            'post_id' => $post->id,
            'category_id' => $shopee_seller_scrape->category->id,
        ]);

        // Only real, published posts are announced to search engines.
        if (app()->environment('production') && $post->status == 'publish') {
            $post_url = route('home.country.post', [
                'country' => $post->post_category?->category?->country_locale_slug,
                'post_slug' => $post->slug,
            ]);

            LaravelGoogleIndexing::create()->update($post_url);
            IndexNow::submit($post_url);
        }

        return $post;
    }

    /**
     * Log the given context, report the failure to Inspector and abort with an Exception.
     *
     * @param  string  $message  Exception message.
     * @param  mixed  $log_payload  Context written (serialised) to the error log.
     *
     * @throws Exception  Always.
     */
    private static function fail(string $message, $log_payload = null): void
    {
        $e = new Exception($message);

        Log::error(serialize($log_payload));
        inspector()->reportException($e);

        throw $e;
    }

    /**
     * Sum the cost of producing one article: a flat AI writing estimate plus the recorded
     * `total_cost` of the seller-shop crawl and the product crawl, when present.
     *
     * @param  object|null  $shopee_task  Scrape payload with optional `seller_shop_task` / `product_task` responses.
     */
    private static function getTotalServiceCost($shopee_task): float
    {
        // Flat estimate for the AI call: $0.03 per 1k tokens, ~2k tokens per article.
        $cost = 0.06;

        // Shopee seller page scraping.
        $cost += $shopee_task?->seller_shop_task?->response?->total_cost ?? 0;

        // Shopee product page scraping.
        $cost += $shopee_task?->product_task?->response?->total_cost ?? 0;

        return $cost;
    }

    /**
     * Reduce an HTML document to its text content.
     *
     * The markup is round-tripped through the HTML5 parser before strip_tags() is applied.
     *
     * @param  string  $raw_html  Raw page HTML.
     * @return string  The document with all tags removed.
     */
    private static function stripHtml(string $raw_html): string
    {
        $html5 = new HTML5(['preserveWhiteSpace' => true]);

        // Parse the HTML into a DOM tree, then serialise it back before stripping the tags.
        $dom = $html5->loadHTML($raw_html);

        return strip_tags($html5->saveHTML($dom));
    }
}
|
||||
355
app/Jobs/Tasks/SaveShopeeSellerImagesTask.php
Normal file
355
app/Jobs/Tasks/SaveShopeeSellerImagesTask.php
Normal file
@@ -0,0 +1,355 @@
|
||||
<?php
|
||||
|
||||
namespace App\Jobs\Tasks;
|
||||
|
||||
use App\Models\ShopeeSellerScrapedImage;
|
||||
use Illuminate\Support\Facades\Http;
|
||||
use Illuminate\Support\Facades\Storage;
|
||||
use Illuminate\Support\Str;
|
||||
use Intervention\Image\Facades\Image;
|
||||
use Symfony\Component\DomCrawler\Crawler;
|
||||
|
||||
class SaveShopeeSellerImagesTask
{
    /**
     * Persist the featured (main) product image and the remaining product images of a scrape.
     *
     * Images already decoded during crawling (`product_task->intervention`) are reused; otherwise
     * they are discovered from the stored JSON-LD / HTML and downloaded through the proxy. Each
     * image is uploaded once per scrape: rows are matched on `original_name`.
     *
     * @param  object  $shopee_task  Scrape payload exposing `product_task` and `shopee_seller_scrape`.
     * @return void
     */
    public static function handle($shopee_task)
    {
        $main_intervention_image = null;
        $intervention_images = [];
        $costs = [];
        $main_image_url = null;

        $proxy_server = get_smartproxy_server();
        $user_agent = config('platform.proxy.user_agent');

        $scrape = $shopee_task->shopee_seller_scrape;
        $response = $shopee_task->product_task->response;

        ///////// PART 1: collect already-decoded images, or the information needed to fetch them.

        if (isset($shopee_task?->product_task?->intervention?->main_intervention_image)) {
            $main_intervention_image = $shopee_task->product_task->intervention->main_intervention_image;
        } else {
            $main_image_url = self::getProductImageUrl($response->jsonld);
        }

        if (isset($shopee_task?->product_task?->intervention?->intervention_images)) {
            $intervention_images = $shopee_task->product_task->intervention->intervention_images;
        } else {
            // filterImages() fills $intervention_images by reference; its return value is not needed here.
            self::filterImages(self::getImages($response->raw_html), $proxy_server, $user_agent, $costs, $intervention_images);
        }

        ///////// PART 2: featured image.

        // Only download the main image when it has not been stored for this scrape yet.
        if (is_null($main_intervention_image) && ! is_empty($main_image_url)) {
            $main_original_name = pathinfo($main_image_url, PATHINFO_BASENAME);

            if (! self::isAlreadySaved($scrape, $main_original_name)) {
                self::getProductImage($response->jsonld, $proxy_server, $user_agent, $costs, $main_intervention_image);
            }
        }

        // The download above may fail and leave the image null; never hand null to the uploader.
        if (! is_null($main_intervention_image) && ! self::isAlreadySaved($scrape, $main_intervention_image->original_name)) {
            self::uploadAndSaveScrapedImage($scrape, $main_intervention_image, true);
        }

        ///////// PART 3: remaining images.

        if (is_array($intervention_images)) {
            foreach ($intervention_images as $intervention_image) {
                if (! self::isAlreadySaved($scrape, $intervention_image->original_name)) {
                    self::uploadAndSaveScrapedImage($scrape, $intervention_image, false);
                }
            }
        }
    }

    /**
     * Tell whether an image with this original file name is already stored for the scrape.
     */
    private static function isAlreadySaved($shopee_seller_scrape, $original_name): bool
    {
        return ShopeeSellerScrapedImage::where('original_name', $original_name)
            ->where('shopee_seller_scrape_id', $shopee_seller_scrape->id)
            ->exists();
    }

    /**
     * Upload an image (plus a tiny LQIP placeholder) to the 'r2' disk and record it.
     *
     * @param  object  $shopee_seller_scrape  Owning scrape; only `id` is read.
     * @param  object  $intervention_image  Object with `image` (Intervention image) and `original_name`.
     * @param  bool  $featured  Whether this is the featured image of the scrape.
     * @return ShopeeSellerScrapedImage|null  The saved row, or null when saving failed.
     */
    private static function uploadAndSaveScrapedImage($shopee_seller_scrape, $intervention_image, $featured = false)
    {
        // One shared base name so the image and its LQIP can never end up with different timestamps.
        $base_name = time().'_'.Str::uuid()->toString();
        $filePath = 'uploads/'.$base_name.'.jpg';
        $lqipFilePath = 'uploads/'.$base_name.'_lqip.jpg';

        // Encode once as JPEG at quality 50 and upload exactly those bytes. Per the Intervention
        // Image v2 docs, stream() re-encodes with its own format/quality arguments, so a
        // preceding encode('jpg', 50) would not affect the uploaded bytes.
        $encoded = (string) $intervention_image->image->encode('jpg', 50);
        Storage::disk('r2')->put($filePath, $encoded);

        // Build the LQIP from a fresh copy so the caller's image is not resized in place.
        $lqip_encoded = (string) Image::make($encoded)
            ->fit(10, 10, function ($constraint) {
                $constraint->aspectRatio();
            })
            ->encode('jpg', 5);
        Storage::disk('r2')->put($lqipFilePath, $lqip_encoded);

        $scraped_image = new ShopeeSellerScrapedImage;
        $scraped_image->shopee_seller_scrape_id = $shopee_seller_scrape->id;
        $scraped_image->original_name = $intervention_image->original_name;
        $scraped_image->image = Storage::disk('r2')->url($filePath);
        $scraped_image->featured = $featured;

        return $scraped_image->save() ? $scraped_image : null;
    }

    /**
     * Collect the `src` and `alt` attributes of every <img> element in the document.
     *
     * @return array<int, array{src: ?string, alt: ?string}>
     */
    private static function getImages(string $raw_html)
    {
        $crawler = new Crawler($raw_html);

        return $crawler->filter('img')->each(function ($node) {
            return [
                'src' => $node->attr('src'),
                'alt' => $node->attr('alt'),
            ];
        });
    }

    /**
     * Download candidate images and keep the large, distinct, photo-like JPEGs.
     *
     * An image is kept when it is at least 800x800, at least 100 KB and JPEG, and is not a
     * near-duplicate (same dimensions and mime, size within 30 KB) of an earlier one. Images whose
     * unique-colour count is at most 10% of the median are then dropped from the returned list.
     *
     * @param  array  $images  Items with at least a `src` key.
     * @param  string  $proxy  Proxy server used for the downloads.
     * @param  string  $user_agent  User-Agent header sent with each download.
     * @param  array  $costs  Receives one `count-N` proxy cost per kept image (by reference).
     * @param  array  $intervention_images  Receives `{image, original_name}` objects (by reference).
     * @return array  Kept image descriptors, largest first.
     */
    private static function filterImages(array $images, string $proxy, string $user_agent, &$costs, &$intervention_images)
    {
        $candidates = [];
        $seen = []; // width / height / mime / size of every image kept so far
        $position = 0;

        foreach ($images as $image) {
            $position++;

            $src = $image['src'] ?? null;

            // <img> tags without a usable src cannot be fetched.
            if (! is_string($src) || $src === '') {
                continue;
            }

            try {
                $response = Http::withOptions(['proxy' => $proxy])->withHeaders(['User-Agent' => $user_agent])->get($src);

                if (! $response->successful()) {
                    continue;
                }

                $imageData = $response->body();
                $interventionImage = Image::make($imageData);

                $width = $interventionImage->width();
                $height = $interventionImage->height();
                $mime = $interventionImage->mime();
                $sizeKb = round(strlen($imageData) / 1024, 2);

                if ($width < 800 || $height < 800 || $sizeKb < 100 || $mime !== 'image/jpeg') {
                    continue;
                }

                $isDuplicate = false;

                foreach ($seen as $attr) {
                    if (
                        $attr['width'] == $width &&
                        $attr['height'] == $height &&
                        $attr['mime'] == $mime &&
                        abs($attr['sizeKb'] - $sizeKb) <= 30 // same picture if the size is within +/- 30 KB
                    ) {
                        $isDuplicate = true;
                        break;
                    }
                }

                if ($isDuplicate) {
                    continue;
                }

                $seen[] = compact('width', 'height', 'mime', 'sizeKb');

                $image['width'] = $width;
                $image['height'] = $height;
                $image['mime'] = $mime;
                $image['sizeKb'] = $sizeKb;
                $image['color_counts'] = self::isMostlyTextBasedOnUniqueColors($interventionImage);

                $costs['count-'.$position] = calculate_smartproxy_cost($sizeKb);

                $candidates[] = $image;

                // NOTE(review): images removed by the colour filter below still stay in this
                // list — confirm whether they should be uploaded.
                $intervention_images[] = (object) [
                    'image' => $interventionImage,
                    'original_name' => pathinfo($src, PATHINFO_BASENAME),
                ];
            } catch (\Exception $e) {
                // Best effort: a broken download or undecodable image just skips this candidate.
                continue;
            }
        }

        // No candidate means no median to compute; indexing the empty list would fail.
        if ($candidates === []) {
            return [];
        }

        $colorCounts = array_column($candidates, 'color_counts');
        sort($colorCounts);

        $total = count($colorCounts);
        $middle = intdiv($total, 2);
        $median = $total % 2 === 0
            ? ($colorCounts[$middle - 1] + $colorCounts[$middle]) / 2
            : $colorCounts[$middle];

        // Drop the low outliers: very few colours suggests a banner or text image.
        $threshold = 0.10 * $median;
        $candidates = array_filter($candidates, function ($image) use ($threshold) {
            return $image['color_counts'] > $threshold;
        });

        usort($candidates, function ($a, $b) {
            return $b['sizeKb'] <=> $a['sizeKb'];
        });

        return $candidates;
    }

    /**
     * Find the image URL of the first JSON-LD "Product" entry.
     *
     * The result is compared (by basename) with the `original_name` stored by getProductImage(),
     * which is derived from the entry's `image`, so the image URL is returned here as well.
     *
     * @param  array  $jsonLdData  Decoded JSON-LD entries.
     * @return string|null  The image URL, or null when no Product entry carries one.
     */
    private static function getProductImageUrl(array $jsonLdData)
    {
        foreach ($jsonLdData as $data) {
            if (! isset($data->{'@type'}) || $data->{'@type'} !== 'Product') {
                continue;
            }

            if (isset($data->image) && is_string($data->image)) {
                return $data->image;
            }
        }

        return null;
    }

    /**
     * Download the image of the first usable JSON-LD "Product" entry and fit it to 1920x1080.
     *
     * @param  array  $jsonLdData  Decoded JSON-LD entries.
     * @param  string  $proxy  Proxy server used for the download.
     * @param  string  $user_agent  User-Agent header sent with the download.
     * @param  array  $costs  Receives the `product_image` proxy cost (by reference).
     * @param  object|null  $main_intervention_image  Receives `{image, original_name}` (by reference).
     * @return array|null  `['url' => ..., 'cost' => ...]` on success, null otherwise.
     */
    private static function getProductImage(array $jsonLdData, string $proxy, string $user_agent, &$costs, &$main_intervention_image)
    {
        foreach ($jsonLdData as $data) {
            if (! isset($data->{'@type'}) || $data->{'@type'} !== 'Product') {
                continue;
            }

            if (! isset($data->url) || ! isset($data->image)) {
                continue;
            }

            try {
                $response = Http::withOptions(['proxy' => $proxy])->withHeaders(['User-Agent' => $user_agent])->get($data->image);

                if (! $response->successful()) {
                    continue;
                }

                $imageData = $response->body();

                $interventionImage = Image::make($imageData);

                // Resize the image to 1920x1080, cropping if needed.
                $interventionImage->fit(1920, 1080, function ($constraint) {
                    $constraint->upsize();
                    $constraint->aspectRatio();
                });

                // Cost is based on the bytes downloaded, the same basis filterImages() uses.
                $cost = calculate_smartproxy_cost(strlen($imageData) / 1024);
                $costs['product_image'] = $cost;

                $main_intervention_image = (object) [
                    'image' => $interventionImage,
                    'original_name' => pathinfo($data->image, PATHINFO_BASENAME),
                ];

                return [
                    'url' => $data->url,
                    'cost' => $cost,
                ];
            } catch (\Exception $e) {
                // A failed download or decode aborts the lookup.
                return null;
            }
        }

        return null;
    }

    /**
     * Count the distinct colours of a downscaled, blurred copy of the image.
     *
     * Despite the name this returns the count itself, not a boolean; filterImages() compares it
     * against the median of all candidates.
     *
     * @param  mixed  $interventionImage  Intervention image; it is cloned and left untouched.
     * @return int  Number of distinct colour values, or 0 when the copy cannot be decoded.
     */
    private static function isMostlyTextBasedOnUniqueColors($interventionImage)
    {
        $img = clone $interventionImage;

        // Shrink to 200px wide (keeping the aspect ratio) and blur before counting colours.
        $img->resize(200, null, function ($constraint) {
            $constraint->aspectRatio();
        });
        $img->blur(10);

        $im = imagecreatefromstring((string) $img->encode());

        if ($im === false) {
            return 0;
        }

        $width = imagesx($im);
        $height = imagesy($im);

        $uniqueColors = [];

        for ($x = 0; $x < $width; $x++) {
            for ($y = 0; $y < $height; $y++) {
                $uniqueColors[imagecolorat($im, $x, $y)] = true;
            }
        }

        imagedestroy($im);

        return count($uniqueColors);
    }
}
|
||||
133
app/Jobs/Tasks/ShopeeSellerTopProductScraperTask.php
Normal file
133
app/Jobs/Tasks/ShopeeSellerTopProductScraperTask.php
Normal file
@@ -0,0 +1,133 @@
|
||||
<?php
|
||||
|
||||
namespace App\Jobs\Tasks;
|
||||
|
||||
use App\Helpers\FirstParty\OSSUploader\OSSUploader;
|
||||
use App\Models\Category;
|
||||
use App\Models\ShopeeSellerScrape;
|
||||
use Exception;
|
||||
|
||||
class ShopeeSellerTopProductScraperTask
{
    /**
     * Scrape a Shopee seller's shop page and its best-rated product, caching the result.
     *
     * When a scrape for the seller/country already exists and its payload can be read back, that
     * payload is returned. Otherwise the shop page is crawled, its JSON-LD products are ranked,
     * the first product page that crawls successfully is stored together with the shop page, and
     * a ShopeeSellerScrape row is created.
     *
     * @param  string  $seller  Seller handle as used in the shop URL.
     * @param  string  $country_iso  Country code; lower-cased before use.
     * @param  Category  $category  Category the new scrape is filed under.
     * @return object|null  Object with `seller_shop_task`, `product_task` and `shopee_seller_scrape`,
     *                      or null when nothing usable could be scraped or saved.
     *
     * @throws Exception  When `$seller` is empty.
     */
    public static function handle(string $seller, string $country_iso, Category $category)
    {
        $country_iso = strtolower($country_iso);

        if (is_empty($seller)) {
            throw new Exception('Missing \'seller\' attribute.');
        }

        $shopee_seller_scrape = ShopeeSellerScrape::where('seller', $seller)
            ->where('country_iso', $country_iso)
            ->first();

        if (! is_null($shopee_seller_scrape)) {
            $serialised = OSSUploader::readFile('r2', 'shopee/seller', $shopee_seller_scrape->filename);

            if (! is_empty($serialised)) {
                // NOTE(review): unserialize() can instantiate arbitrary classes; this is only safe
                // while the bucket contents are written exclusively by this task.
                $cached = unserialize($serialised);

                // A corrupt payload falls through to a fresh scrape, like a missing one does.
                if (is_object($cached)) {
                    $cached->shopee_seller_scrape = $shopee_seller_scrape;

                    return $cached;
                }
            }
        }

        $epoch = epoch_now_timestamp();

        // NOTE(review): the Malaysian domain is used regardless of $country_iso — confirm
        // whether other countries are meant to be supported.
        $seller_shop_url = "https://shopee.com.my/{$seller}?page=0&sortBy=sales";

        $seller_shop_task = UrlCrawlerTask::handle($seller_shop_url, 'shopee/seller', $epoch, true, false);

        if (! isset($seller_shop_task->response->jsonld)) {
            return null;
        }

        $top_rank_products = self::getSortedData($seller_shop_task->response->jsonld, 100);

        // Walk the ranking until one product page crawls with a non-negative status code.
        $product_task = null;

        foreach ($top_rank_products as $product) {
            if (is_empty($product->url)) {
                continue;
            }

            $attempt = UrlCrawlerTask::handle($product->url, 'shopee/seller', $epoch, true, true);

            if ($attempt->response->status_code >= 0) {
                $product_task = $attempt;
                break;
            }
        }

        // Do not persist a scrape without a usable product page; consumers of the payload read
        // the product HTML.
        if (is_null($product_task)) {
            return null;
        }

        $scraped = (object) [
            'seller_shop_task' => (object) [
                'response' => $seller_shop_task->response,
            ],
            'product_task' => (object) [
                'response' => $product_task->response,
            ],
        ];

        $filename = $seller.'-'.$epoch.'-'.$country_iso.'.txt';

        OSSUploader::uploadFile('r2', 'shopee/seller', $filename, serialize($scraped));

        $shopee_seller_scrape = new ShopeeSellerScrape;
        $shopee_seller_scrape->seller = $seller;
        $shopee_seller_scrape->country_iso = $country_iso;
        $shopee_seller_scrape->epoch = $epoch;
        $shopee_seller_scrape->filename = $filename;
        $shopee_seller_scrape->category_id = $category->id;

        if ($shopee_seller_scrape->save()) {
            return (object) compact('seller_shop_task', 'product_task', 'shopee_seller_scrape');
        }

        return null;
    }

    /**
     * Rank the JSON-LD "Product" entries priced above a minimum.
     *
     * Keeps entries whose `@type` is "Product" and whose offer `lowPrice` or `price` exceeds
     * `$minValue`, sorts them by rating count and then rating value (both descending) and maps
     * each to a flat object.
     *
     * @param  array  $data  Decoded JSON-LD entries; non-objects and other types are ignored.
     * @param  int|float  $minValue  Exclusive lower price bound.
     * @return array  List of objects with name, description, url, image, prices and ratings.
     */
    private static function getSortedData($data, $minValue)
    {
        $products = array_filter($data, function ($item) use ($minValue) {
            if (($item->{'@type'} ?? null) !== 'Product') {
                return false;
            }

            $lowPrice = floatval($item->offers?->lowPrice ?? 0);
            $price = floatval($item->offers?->price ?? 0);

            // Either price field may carry the amount; the Product check applies to both.
            return $lowPrice > $minValue || $price > $minValue;
        });

        usort($products, function ($a, $b) {
            $ratingCountA = intval($a->aggregateRating?->ratingCount ?? 0);
            $ratingCountB = intval($b->aggregateRating?->ratingCount ?? 0);

            if ($ratingCountA !== $ratingCountB) {
                return $ratingCountB <=> $ratingCountA;
            }

            $ratingValueA = floatval($a->aggregateRating?->ratingValue ?? 0);
            $ratingValueB = floatval($b->aggregateRating?->ratingValue ?? 0);

            return $ratingValueB <=> $ratingValueA;
        });

        return array_map(function ($item) {
            return (object) [
                'name' => $item->name ?? null,
                'description' => $item->description ?? null,
                'url' => $item->url ?? null,
                'image' => $item->image ?? null,
                'lowPrice' => floatval($item->offers?->lowPrice ?? 0),
                'highPrice' => floatval($item->offers?->highPrice ?? 0),
                'price' => floatval($item->offers?->price ?? 0),
                'priceCurrency' => $item->offers?->priceCurrency ?? null,
                'ratingCount' => intval($item->aggregateRating?->ratingCount ?? 0),
                'ratingValue' => floatval($item->aggregateRating?->ratingValue ?? 0),
            ];
        }, $products);
    }
}
|
||||
448
app/Jobs/Tasks/UrlCrawlerTask.php
Normal file
448
app/Jobs/Tasks/UrlCrawlerTask.php
Normal file
@@ -0,0 +1,448 @@
|
||||
<?php
|
||||
|
||||
namespace App\Jobs\Tasks;
|
||||
|
||||
use App\Helpers\FirstParty\OSSUploader\OSSUploader;
|
||||
use Exception;
|
||||
use Illuminate\Support\Facades\Http;
|
||||
use Intervention\Image\Facades\Image;
|
||||
use Minifier\TinyMinify;
|
||||
use Spatie\Browsershot\Browsershot;
|
||||
use Spatie\Browsershot\Exceptions\UnsuccessfulResponse;
|
||||
use Symfony\Component\DomCrawler\Crawler;
|
||||
use thiagoalessio\TesseractOCR\TesseractOCR;
|
||||
|
||||
class UrlCrawlerTask
|
||||
{
|
||||
/**
 * Fetch a page (from the 'r2' cache or via headless Chrome) and extract its JSON-LD and images.
 *
 * Status codes in the response: 0 = served from the cache, 1 = freshly crawled and stored,
 * -1 = not cached and the crawl produced no HTML, -3 = the crawl threw.
 *
 * @param  string  $url  Page to fetch; the Google-cached variant of it is what gets crawled.
 * @param  string  $directory  Storage directory for the cached HTML.
 * @param  mixed  $postfix  Suffix of the cache file name (cast to string).
 * @param  bool  $strip_html  Not used by this method.
 * @param  bool  $parse_images  Whether to download and filter the page's <img> elements.
 * @return object  Object with `intervention` (decoded images) and `response` (url, postfix,
 *                 filename, raw_html, jsonld, main_image, images, status_code, costs, total_cost).
 */
public static function handle(string $url, $directory, $postfix = null, $strip_html = false, $parse_images = false)
{
    $slug = str_slug($url);

    $cached_url = self::getGoogleCachedUrl($url, false);

    $postfix = strval($postfix);

    $driver = 'r2';
    $filename = $slug.'-'.$postfix.'.html';
    $user_agent = config('platform.proxy.user_agent');
    // NOTE(review): directory and file name are joined without a separator — confirm this
    // matches how OSSUploader builds the stored path.
    $disk_url = $directory.$filename;

    $status_code = 0;
    $costs = [];

    $main_intervention_image = null;
    $intervention_images = [];

    $proxy_server = get_smartproxy_server();

    // 1. Try the stored copy first.
    try {
        $raw_html = OSSUploader::readFile($driver, $directory, $filename);
    } catch (Exception $e) {
        $raw_html = null;
    }

    // 2. Nothing usable stored: crawl the page.
    if (is_empty($raw_html)) {
        $raw_html = null;
        $status_code = -1;

        try {
            // setOption('args', ...) replaces any earlier 'args' value, so a single call
            // carries the Chromium arguments.
            $browsershot = (new Browsershot())
                ->setUrl($cached_url)
                ->noSandbox()
                ->setOption('args', ['--disable-web-security'])
                ->userAgent($user_agent)
                ->ignoreHttpsErrors()
                ->preventUnsuccessfulResponse()
                ->timeout(10);

            if (app()->environment() == 'local') {
                $browsershot->setNodeBinary(config('platform.general.node_binary'))->setNpmBinary(config('platform.general.npm_binary'));
            }

            $raw_html = $browsershot->bodyHtml();
        } catch (UnsuccessfulResponse|Exception $e) {
            $raw_html = null;
            $status_code = -3;
        }

        if (is_empty($raw_html)) {
            // An empty document is a failed crawl, not a page worth parsing or caching.
            $raw_html = null;
        } else {
            OSSUploader::uploadFile($driver, $directory, $filename, $raw_html);
            $status_code = 1;
        }
    }

    $intervention = (object) compact('main_intervention_image', 'intervention_images');

    // 3. Failure: same response shape as success, with empty values.
    if (is_null($raw_html)) {
        return (object) [
            'intervention' => $intervention,
            'response' => (object) [
                'url' => $url,
                'postfix' => $postfix,
                'filename' => null,
                'raw_html' => null,
                'jsonld' => [],
                'main_image' => null,
                'images' => [],
                'status_code' => $status_code,
                'costs' => $costs,
                'total_cost' => 0,
            ],
        ];
    }

    // 4. Success: clean the HTML and extract structured data and images.
    $raw_html = self::minifyAndCleanHtml($raw_html);

    $jsonld = self::getJsonLd($raw_html);

    $images = [];

    if ($parse_images) {
        $images = self::filterImages(self::getImages($raw_html), $proxy_server, $user_agent, $costs, $intervention_images);
    }

    $main_image = self::getProductImage($jsonld, $proxy_server, $user_agent, $costs, $main_intervention_image);

    return (object) [
        // Built after the by-reference calls above so it carries the decoded images.
        'intervention' => (object) compact('main_intervention_image', 'intervention_images'),
        'response' => (object) [
            'url' => $url,
            'postfix' => $postfix,
            'filename' => $disk_url,
            'raw_html' => $raw_html,
            'jsonld' => $jsonld,
            'main_image' => $main_image,
            'images' => $images,
            'status_code' => $status_code,
            'costs' => $costs,
            'total_cost' => array_sum($costs),
        ],
    ];
}
|
||||
|
||||
/**
 * Extract and decode every JSON-LD <script> block of the document.
 *
 * Snippets that are not valid JSON are skipped. A snippet whose top level is a list
 * contributes each of its object entries, so callers always receive a flat list of objects.
 *
 * @param  string  $raw_html  Page HTML.
 * @return array  Decoded JSON-LD objects (stdClass), possibly empty.
 */
private static function getJsonLd(string $raw_html)
{
    $crawler = new Crawler($raw_html);

    try {
        $snippets = $crawler->filter('script[type="application/ld+json"]')->each(function (Crawler $node) {
            return $node->text();
        });
    } catch (Exception $e) {
        return [];
    }

    $contents = [];

    foreach ($snippets as $snippet) {
        try {
            // json_decode() only throws when asked to; without the flag invalid JSON would
            // silently become a null entry.
            $decoded = json_decode($snippet, false, 512, JSON_THROW_ON_ERROR);
        } catch (\JsonException $e) {
            continue;
        }

        $entries = is_array($decoded) ? $decoded : [$decoded];

        foreach ($entries as $entry) {
            if (is_object($entry)) {
                $contents[] = $entry;
            }
        }
    }

    return $contents;
}
|
||||
|
||||
/**
 * List the `src` and `alt` attributes of every <img> element found in the HTML.
 *
 * @param  string  $raw_html  Page HTML.
 * @return array  One `['src' => ..., 'alt' => ...]` entry per <img>, in document order.
 */
private static function getImages(string $raw_html)
{
    $document = new Crawler($raw_html);

    // each() gathers the callback results, so no by-reference accumulator is needed.
    return $document->filter('img')->each(function ($img) {
        return [
            'src' => $img->attr('src'),
            'alt' => $img->attr('alt'),
        ];
    });
}
|
||||
|
||||
/**
 * Download candidate images and keep the large, distinct, photo-like JPEGs.
 *
 * An image is kept when it is at least 800x800, at least 100 KB and JPEG, and is not a
 * near-duplicate (same dimensions and mime, size within 30 KB) of an earlier one. Images whose
 * unique-colour count is at most 10% of the median are then dropped from the returned list.
 *
 * @param  array  $images  Items with at least a `src` key.
 * @param  string  $proxy  Proxy server used for the downloads.
 * @param  string  $user_agent  User-Agent header sent with each download.
 * @param  array  $costs  Receives one `count-N` proxy cost per kept image (by reference).
 * @param  array  $intervention_images  Receives `{image, original_name}` objects (by reference).
 * @return array  Kept image descriptors, largest first.
 */
private static function filterImages(array $images, string $proxy, string $user_agent, &$costs, &$intervention_images)
{
    $candidates = [];
    $seen = []; // width / height / mime / size of every image kept so far
    $position = 0;

    foreach ($images as $image) {
        $position++;

        $src = $image['src'] ?? null;

        // <img> tags without a usable src cannot be fetched.
        if (! is_string($src) || $src === '') {
            continue;
        }

        try {
            $response = Http::withOptions(['proxy' => $proxy])->withHeaders(['User-Agent' => $user_agent])->get($src);

            if (! $response->successful()) {
                continue;
            }

            $imageData = $response->body();
            $interventionImage = Image::make($imageData);

            $width = $interventionImage->width();
            $height = $interventionImage->height();
            $mime = $interventionImage->mime();
            $sizeKb = round(strlen($imageData) / 1024, 2);

            if ($width < 800 || $height < 800 || $sizeKb < 100 || $mime !== 'image/jpeg') {
                continue;
            }

            $isDuplicate = false;

            foreach ($seen as $attr) {
                if (
                    $attr['width'] == $width &&
                    $attr['height'] == $height &&
                    $attr['mime'] == $mime &&
                    abs($attr['sizeKb'] - $sizeKb) <= 30 // same picture if the size is within +/- 30 KB
                ) {
                    $isDuplicate = true;
                    break;
                }
            }

            if ($isDuplicate) {
                continue;
            }

            $seen[] = compact('width', 'height', 'mime', 'sizeKb');

            $image['width'] = $width;
            $image['height'] = $height;
            $image['mime'] = $mime;
            $image['sizeKb'] = $sizeKb;
            $image['color_counts'] = self::isMostlyTextBasedOnUniqueColors($interventionImage);

            $costs['count-'.$position] = calculate_smartproxy_cost($sizeKb);

            $candidates[] = $image;

            // NOTE(review): images removed by the colour filter below still stay in this
            // list — confirm whether that is intended.
            $intervention_images[] = (object) [
                'image' => $interventionImage,
                'original_name' => pathinfo($src, PATHINFO_BASENAME),
            ];
        } catch (\Exception $e) {
            // Best effort: a broken download or undecodable image just skips this candidate.
            continue;
        }
    }

    // No candidate means no median to compute; indexing the empty list would fail.
    if ($candidates === []) {
        return [];
    }

    $colorCounts = array_column($candidates, 'color_counts');
    sort($colorCounts);

    $total = count($colorCounts);
    $middle = intdiv($total, 2);
    $median = $total % 2 === 0
        ? ($colorCounts[$middle - 1] + $colorCounts[$middle]) / 2
        : $colorCounts[$middle];

    // Drop the low outliers: very few colours suggests a banner or text image.
    $threshold = 0.10 * $median;
    $candidates = array_filter($candidates, function ($image) use ($threshold) {
        return $image['color_counts'] > $threshold;
    });

    usort($candidates, function ($a, $b) {
        return $b['sizeKb'] <=> $a['sizeKb'];
    });

    return $candidates;
}
|
||||
|
||||
// private static function isImageMostlyText($imageData, $mime) {
|
||||
// try {
|
||||
// $text = (new TesseractOCR)->imageData($imageData, $mime)->run();
|
||||
// $textLength = strlen($text);
|
||||
|
||||
// // This is a basic check. Adjust the threshold as needed.
|
||||
// return $textLength > 50;
|
||||
// } catch (\Exception $e) {
|
||||
// // Handle any exceptions related to Tesseract OCR
|
||||
// return false;
|
||||
// }
|
||||
// }
|
||||
|
||||
/**
 * Downloads the image of the first usable JSON-LD "Product" entry and fits it to 1920x1080.
 *
 * Entries are tried in order. An entry is skipped when it is not a Product, lacks a url or
 * image, its download is not successful, or downloading/decoding throws; the next entry is
 * then tried.
 *
 * @param  array  $jsonLdData  Decoded JSON-LD objects.
 * @param  string  $proxy  Proxy passed to the HTTP client.
 * @param  string  $user_agent  User-Agent header sent with the download.
 * @param  array  $costs  Receives 'product_image' => proxy cost on success.
 * @param  object|null  $main_intervention_image  Receives an {image, original_name} object on success.
 * @return array|null ['url' => product url, 'cost' => proxy cost], or null when no entry yields an image.
 */
private static function getProductImage(array $jsonLdData, string $proxy, string $user_agent, &$costs, &$main_intervention_image)
{
    foreach ($jsonLdData as $data) {
        // Only "Product" entries that carry both a url and an image are usable
        $isProduct = isset($data->{'@type'}) && $data->{'@type'} === 'Product';
        if (! $isProduct || ! isset($data->url, $data->image)) {
            continue;
        }

        // JSON-LD allows "image" to be a list of URLs; use the first one in that case
        $imageUrl = $data->image;
        if (is_array($imageUrl)) {
            $imageUrl = reset($imageUrl);
        }
        if (! is_string($imageUrl)) {
            continue;
        }

        try {
            $response = Http::withOptions(['proxy' => $proxy])->withHeaders(['User-Agent' => $user_agent])->get($imageUrl);

            if (! $response->successful()) {
                continue;
            }

            $imageData = $response->body();

            // Create an Intervention Image instance from the response data
            $interventionImage = Image::make($imageData);

            // Crop/resize towards 1920x1080. upsize() keeps the image from being enlarged,
            // so a smaller source is not scaled up to the full target size.
            $interventionImage->fit(1920, 1080, function ($constraint) {
                $constraint->upsize();
                $constraint->aspectRatio();
            });

            // NOTE(review): the cost is derived from the re-encoded, fitted image rather than
            // from the downloaded bytes (filterImages uses the downloaded size) — confirm intended.
            $sizeInKb = strlen($interventionImage->encode()) / 1024; // Convert bytes to kilobytes

            $cost = calculate_smartproxy_cost($sizeInKb);

            $costs['product_image'] = $cost;

            $main_intervention_image = (object) [
                'image' => $interventionImage,
                'original_name' => pathinfo($imageUrl, PATHINFO_BASENAME),
            ];

            return [
                'url' => $data->url,
                'cost' => $cost,
            ];
        } catch (\Exception $e) {
            // Best effort: treat a failed download/decode like an unsuccessful response
            // and fall through to the next entry instead of abandoning the search.
            continue;
        }
    }

    return null;
}
|
||||
|
||||
/**
 * Counts the distinct pixel colours of a downscaled, blurred copy of the image.
 *
 * Despite the name, this returns the raw count rather than a boolean; the caller decides
 * what counts as "mostly text" by comparing the counts of several images.
 *
 * @param  mixed  $interventionImage  Intervention image instance; it is cloned, so the original is not modified.
 * @return int Number of distinct values returned by imagecolorat() over all pixels.
 *
 * @throws \RuntimeException When the processed copy cannot be decoded by GD.
 */
private static function isMostlyTextBasedOnUniqueColors($interventionImage)
{
    // Work on a copy so the caller's image keeps its size and sharpness
    $img = clone $interventionImage;

    // Resize to a smaller dimension for faster processing (maintaining aspect ratio)
    $img->resize(200, null, function ($constraint) {
        $constraint->aspectRatio();
    });

    // Blur so that noise and compression artefacts do not inflate the colour count
    $img->blur(10);

    $im = imagecreatefromstring((string) $img->encode());

    // imagecreatefromstring() reports failure by returning false
    if ($im === false) {
        throw new \RuntimeException('Unable to decode image data for unique colour counting.');
    }

    $width = imagesx($im);
    $height = imagesy($im);

    // Colour values are used as array keys so each distinct one is stored once
    $uniqueColors = [];

    for ($x = 0; $x < $width; $x++) {
        for ($y = 0; $y < $height; $y++) {
            $rgb = imagecolorat($im, $x, $y);
            $uniqueColors[$rgb] = true;
        }
    }

    imagedestroy($im);

    return count($uniqueColors);
}
|
||||
|
||||
/**
 * Minifies the HTML and strips presentation-only markup from it.
 *
 * Removes the 'class', 'id' and 'style' attributes from every descendant element and
 * deletes all <style> elements together with their content.
 *
 * @param  string  $raw_html  HTML to clean.
 * @return string The crawler's html() output after cleaning, or an empty string when the
 *                input produces no DOM nodes.
 */
private static function minifyAndCleanHtml(string $raw_html)
{
    $raw_html = TinyMinify::html($raw_html);

    $crawler = new Crawler($raw_html);

    // Nothing was parsed: there is no node to clean or to render
    if ($crawler->count() === 0) {
        return '';
    }

    foreach ($crawler as $domElement) {
        // Strip 'class', 'id' and 'style' attributes from every descendant element
        /** @var \DOMNodeList $nodes */
        $nodes = $domElement->getElementsByTagName('*');
        foreach ($nodes as $node) {
            /** @var \DOMElement $node */
            $node->removeAttribute('class');
            $node->removeAttribute('id');
            $node->removeAttribute('style');
        }

        // Remove <style> tags and their content. The node list is live, so walk it
        // backwards to keep the remaining indexes valid while removing.
        $styleTags = $domElement->getElementsByTagName('style');
        for ($i = $styleTags->length - 1; $i >= 0; $i--) {
            $styleNode = $styleTags->item($i);
            $styleNode->parentNode->removeChild($styleNode);
        }
    }

    // Output the manipulated HTML
    return $crawler->html();
}
|
||||
|
||||
/**
 * Builds the Google web-cache lookup URL for a page.
 *
 * The page URL is reduced to scheme, host and path before being embedded: the query string
 * is removed by stripUrlQueryParameters() and the fragment is dropped here.
 *
 * NOTE(review): Google appears to have retired its public web cache in 2024 — confirm this
 * endpoint still returns content before relying on it.
 *
 * @param  string  $url  Page URL to look up.
 * @param  bool  $stripHtml  When true, appends '&strip=1' to the lookup URL.
 * @return string The cache lookup URL.
 */
private static function getGoogleCachedUrl(string $url, $stripHtml = false)
{
    $url = self::stripUrlQueryParameters($url);

    // Drop the fragment: left in place, anything appended below (such as '&strip=1')
    // would land after the '#' and become part of the fragment instead of the query.
    $url = explode('#', $url, 2)[0];

    $cached_url = "https://webcache.googleusercontent.com/search?q=cache:{$url}";

    if ($stripHtml) {
        $cached_url .= '&strip=1';
    }

    return $cached_url;
}
|
||||
|
||||
/**
 * Returns the URL with its query string removed.
 *
 * Scheme, user info, host, port, path and fragment are preserved. Input that parse_url()
 * rejects, or that has no scheme/host (e.g. a relative URL), has its query removed
 * textually instead of being rebuilt from components.
 *
 * @param  string  $url  URL to clean.
 * @return string The URL without the '?...' part.
 */
private static function stripUrlQueryParameters(string $url)
{
    // Parse the URL into its components
    $parts = parse_url($url);

    // Without a scheme and host there is nothing to rebuild from: cut the query out of the
    // part before the fragment and put the fragment back untouched.
    if ($parts === false || ! isset($parts['scheme'], $parts['host'])) {
        $pieces = explode('#', $url, 2);
        $withoutQuery = explode('?', $pieces[0], 2)[0];

        return isset($pieces[1]) ? $withoutQuery.'#'.$pieces[1] : $withoutQuery;
    }

    // Rebuild the URL without the query component
    $newUrl = $parts['scheme'].'://';

    if (isset($parts['user'])) {
        $newUrl .= $parts['user'];

        if (isset($parts['pass'])) {
            $newUrl .= ':'.$parts['pass'];
        }

        $newUrl .= '@';
    }

    $newUrl .= $parts['host'];

    // Keep a non-default port; dropping it would point the URL at a different server
    if (isset($parts['port'])) {
        $newUrl .= ':'.$parts['port'];
    }

    if (isset($parts['path'])) {
        $newUrl .= $parts['path'];
    }

    if (isset($parts['fragment'])) {
        $newUrl .= '#'.$parts['fragment'];
    }

    return $newUrl;
}
|
||||
}
|
||||
Reference in New Issue
Block a user