Add (ai gen)

This commit is contained in:
2023-10-01 20:39:18 +08:00
parent c68bda42c5
commit b03ceb3868
4 changed files with 105 additions and 342 deletions

View File

@@ -20,7 +20,7 @@ public static function writeProductArticle($excerpt, $photos)
- Use the following sections whenever applicable:\n
-- ### Introduction\n
-- ### Overview\n
-- ### Specifications (use valid Markdown table format with header and seperator when possible) and explanation\n
-- ### Specifications (use valid Markdown table format with header and seperator) + explanation\n
-- ### Price\n
-- ### Should I Buy?\n
- do not make up facts, use facts provided by excerpt only\n

View File

@@ -34,6 +34,8 @@ public static function handle(ShopeeSellerScrape $shopee_seller_scrape)
$shopee_task->shopee_seller_scrape = $shopee_seller_scrape;
}
// dd($shopee_task);
// dd($shopee_task->product_task->response);
@@ -42,6 +44,8 @@ public static function handle(ShopeeSellerScrape $shopee_seller_scrape)
$excerpt = self::stripHtml($raw_html);
$excerpt .= self::getProductPricingExcerpt($shopee_task->product_task->response->jsonld);
$photos = ShopeeSellerScrapedImage::where('shopee_seller_scrape_id', $shopee_seller_scrape->id)->where('featured', false)->orderByRaw('RAND()')->take(3)->get()->pluck('image')->toArray();
$ai_writeup = AiWriteup::where('source', 'shopee')->where('source_url', $shopee_task->product_task->response->url)->first();
@@ -120,12 +124,38 @@ public static function handle(ShopeeSellerScrape $shopee_seller_scrape)
return $post;
}
/**
 * Build a one-sentence pricing summary from decoded JSON-LD entries.
 *
 * Only the first entry whose "@type" is exactly "Product" is considered.
 * A low/high price pair takes precedence over a single price; when neither
 * is present the sentence points the reader at the seller instead.
 *
 * @param array $jsonLdData Decoded JSON-LD entries (objects are expected; anything else is skipped).
 *
 * @return string The pricing sentence, or an empty string when no Product entry exists.
 */
private static function getProductPricingExcerpt(array $jsonLdData): string
{
    foreach ($jsonLdData as $data) {
        // Ensure the type is "Product" before proceeding
        if (! isset($data->{'@type'}) || $data->{'@type'} !== 'Product') {
            continue;
        }

        // `??` keeps the chained property reads safe when `offers` (or any part of it) is absent.
        $lowPrice = $data->offers->lowPrice ?? null;
        $highPrice = $data->offers->highPrice ?? null;
        $price = $data->offers->price ?? null;
        $currency = $data->offers->priceCurrency ?? null;
        $sellerName = $data->offers->seller->name ?? 'online store'; // default when the seller name is not set

        // Only emit the currency when it is known, so the sentence never contains a stray double space.
        $currencyPrefix = $currency ? "{$currency} " : '';

        if ($lowPrice && $highPrice) {
            return "Price Range from {$currencyPrefix}{$lowPrice} to {$highPrice} in {$sellerName}";
        }

        if ($price) {
            return "Priced at {$currencyPrefix}{$price} in {$sellerName}";
        }

        return "Price not stated, refer to {$sellerName}";
    }

    // No Product entry: return an explicit empty excerpt rather than an implicit null.
    return '';
}
private static function getTotalServiceCost($shopee_task)
{
$cost = 0.00;
$cost += 0.06; // chatgpt-3.5-turbo $0.03 for 1k, writing for 2k tokens
$cost += 0.09; // chatgpt-3.5-turbo $0.03 for 1k, writing for 2k tokens
// Shopee Seller Scraping
if (isset($shopee_task?->seller_shop_task?->response?->total_cost)) {

View File

@@ -8,68 +8,46 @@
use Illuminate\Support\Str;
use Intervention\Image\Facades\Image;
use Symfony\Component\DomCrawler\Crawler;
use Masterminds\HTML5;
class SaveShopeeSellerImagesTask
{
public static function handle($shopee_task)
{
$main_intervention_image = null;
$intervention_images = [];
$costs = [];
$main_image_url = null;
$unblocker_proxy_server = get_smartproxy_unblocker_server();
$rotating_proxy_server = get_smartproxy_rotating_server();
$costs = [];
$user_agent = config('platform.proxy.user_agent');
///////// PART 1
// If there is a main intervention image, then set in, else get the url only.
if (isset($shopee_task?->product_task?->intervention?->main_intervention_image)) {
$main_intervention_image = $shopee_task->product_task->intervention->main_intervention_image;
} else {
$main_image_url = self::getProductImageUrl($shopee_task->product_task->response->jsonld);
}
// If there is other image interventions set, then set in, else get the image urls only.
if (isset($shopee_task?->product_task?->intervention?->intervention_images)) {
$intervention_images = $shopee_task->product_task->intervention->intervention_images;
} else {
$images = self::getImages($shopee_task->product_task->response->raw_html);
$images = self::filterImages($images, $rotating_proxy_server, $user_agent, $costs, $intervention_images);
}
///////// PART 2
// Check existence and upload if image intervention is set
if (! is_null($main_intervention_image)) {
$scraped_image = ShopeeSellerScrapedImage::where('original_name', $main_intervention_image->original_name)->where('shopee_seller_scrape_id', $shopee_task->shopee_seller_scrape->id)->first();
if (is_null($scraped_image)) {
$scraped_image = self::uploadAndSaveScrapedImage($shopee_task->shopee_seller_scrape, $main_intervention_image, true);
}
}
// if there is no main image intervention but the main image url is provided
elseif (! is_empty($main_image_url)) {
if (! is_empty($main_image_url)) {
$scraped_image = ShopeeSellerScrapedImage::where('original_name', pathinfo($main_image_url, PATHINFO_BASENAME))->where('shopee_seller_scrape_id', $shopee_task->shopee_seller_scrape->id)->first();
if (is_null($scraped_image)) {
$main_image = self::getProductImage($shopee_task->product_task->response->jsonld, $rotating_proxy_server, $user_agent, $costs, $main_intervention_image);
$main_image = self::getProductImage($shopee_task->product_task->response->jsonld, $rotating_proxy_server, $user_agent, $costs);
$scraped_image = self::uploadAndSaveScrapedImage($shopee_task->shopee_seller_scrape, $main_intervention_image, true);
$scraped_image = self::uploadAndSaveScrapedImage($shopee_task->shopee_seller_scrape, $main_image, true);
}
}
/////// PART 3
/////// PART 2
if (! is_null($intervention_images) && is_array($intervention_images) && count($intervention_images) > 0) {
foreach ($intervention_images as $intervention_image) {
$scraped_image = ShopeeSellerScrapedImage::where('original_name', $intervention_image->original_name)->where('shopee_seller_scrape_id', $shopee_task->shopee_seller_scrape->id)->first();
$images = self::getFilteredImages($shopee_task->product_task->response->raw_html, $rotating_proxy_server, $user_agent, $costs);
//dd($images);
if (! is_null($images) && is_array($images) && count($images) > 0) {
foreach ($images as $image_obj) {
$scraped_image = ShopeeSellerScrapedImage::where('original_name', $image_obj->original_name)->where('shopee_seller_scrape_id', $shopee_task->shopee_seller_scrape->id)->first();
if (is_null($scraped_image)) {
$scraped_image = self::uploadAndSaveScrapedImage($shopee_task->shopee_seller_scrape, $intervention_image, false);
$scraped_image = self::uploadAndSaveScrapedImage($shopee_task->shopee_seller_scrape, $image_obj, false);
}
}
}
@@ -78,7 +56,7 @@ public static function handle($shopee_task)
}
private static function uploadAndSaveScrapedImage($shopee_seller_scrape, $intervention_image, $featured = false)
private static function uploadAndSaveScrapedImage($shopee_seller_scrape, $image_obj, $featured = false)
{
// Generate a unique filename for the uploaded file and LQIP version
$uuid = Str::uuid()->toString();
@@ -86,7 +64,7 @@ private static function uploadAndSaveScrapedImage($shopee_seller_scrape, $interv
$lqipFileName = time().'_'.$uuid.'_lqip.jpg';
// Convert the file to JPEG format using Intervention Image library
$image = $intervention_image->image;
$image = $image_obj->intervention;
// Get the original image width and height
$originalWidth = $image->width();
@@ -102,7 +80,7 @@ private static function uploadAndSaveScrapedImage($shopee_seller_scrape, $interv
// Save the original image to a temporary file and open it again
$tempImagePath = tempnam(sys_get_temp_dir(), 'temp_image');
file_put_contents($tempImagePath, $intervention_image->image->encode());
file_put_contents($tempImagePath, $image_obj->intervention->encode());
$clonedImage = Image::make($tempImagePath);
// Create the LQIP version of the image using a small size while maintaining the aspect ratio
@@ -120,7 +98,7 @@ private static function uploadAndSaveScrapedImage($shopee_seller_scrape, $interv
$scraped_image = new ShopeeSellerScrapedImage;
$scraped_image->shopee_seller_scrape_id = $shopee_seller_scrape->id;
$scraped_image->original_name = $intervention_image->original_name;
$scraped_image->original_name = $image_obj->original_name;
$scraped_image->image = $url;
$scraped_image->featured = $featured;
@@ -131,32 +109,36 @@ private static function uploadAndSaveScrapedImage($shopee_seller_scrape, $interv
return null;
}
private static function getImages(string $raw_html)
private static function getImageUrls(string $raw_html)
{
$crawler = new Crawler($raw_html);
$images = [];
$crawler->filter('img')->each(function ($node) use (&$images) {
$src = $node->attr('src');
$alt = $node->attr('alt') ?? null; // Setting a default value if alt is not present
// Pattern for extracting src and alt attributes from img tags
$pattern = '/<img\s.*?(?:src=["\'](.*?)["\']).*?(?:alt=["\'](.*?)["\'])?[^>]*>/is';
$blacklist_domain = [];
if (preg_match_all($pattern, $raw_html, $matches, PREG_SET_ORDER)) {
foreach ($matches as $match) {
$src = $match[1];
// Check if image file name ends with '_tn' and remove it
$src = preg_replace('/_tn(\.[a-z]+)?$/i', '$1', $src);
foreach ($blacklist_domain as $blacklist) {
if (! str_contains($src, $blacklist)) {
$images[] = [
'src' => $src,
'alt' => $alt,
'alt' => isset($match[2]) ? $match[2] : null,
];
}
}
});
return $images;
}
private static function filterImages(array $images, string $proxy, string $user_agent, &$costs, &$intervention_images)
private static function getFilteredImages(string $raw_html, string $proxy, string $user_agent, &$costs)
{
$images = self::getImageUrls($raw_html);
//dd($images);
$filteredImages = [];
$uniqueAttributes = []; // This array will track unique width, height, mime, and size combinations
@@ -188,7 +170,7 @@ private static function filterImages(array $images, string $proxy, string $user_
$sizeKb = round(strlen($imageData) / 1024, 2);
// Check constraints
if ($width < 300 || $height < 300) {
if ($width < 800 || $height < 800 || $sizeKb < 100 || $mime !== 'image/jpeg') {
continue;
}
@@ -225,16 +207,15 @@ private static function filterImages(array $images, string $proxy, string $user_
'mime' => $mime,
'sizeKb' => $sizeKb,
];
$image['color_counts'] = self::isMostlyTextBasedOnUniqueColors($interventionImage);
$image['color_counts'] = self::getImageColorCounts($interventionImage);
$image['intervention'] = $interventionImage;
$image['original_name'] = pathinfo($src, PATHINFO_BASENAME);
//$image['img'] = $interventionImage;
$costs['count-'.$count] = calculate_smartproxy_cost($sizeKb, 'rotating_global');
$filteredImages[] = $image;
$intervention_images[] = (object) [
'image' => $interventionImage,
'original_name' => pathinfo($src, PATHINFO_BASENAME),
];
}
} catch (\Exception $e) {
// Handle exceptions related to the HTTP request
@@ -269,7 +250,14 @@ private static function filterImages(array $images, string $proxy, string $user_
return $b['sizeKb'] <=> $a['sizeKb']; // Using the spaceship operator to sort in descending order
});
return $filteredImages;
$final_images = [];
foreach ($filteredImages as $image_obj)
{
$final_images[] = (object) $image_obj;
}
return $final_images;
}
private static function getProductImageUrl(array $jsonLdData)
@@ -284,7 +272,7 @@ private static function getProductImageUrl(array $jsonLdData)
}
}
private static function getProductImage(array $jsonLdData, string $proxy, string $user_agent, &$costs, &$main_intervention_image)
private static function getProductImage(array $jsonLdData, string $proxy, string $user_agent, &$costs)
{
foreach ($jsonLdData as $data) {
// Ensure the type is "Product" before proceeding
@@ -313,14 +301,10 @@ private static function getProductImage(array $jsonLdData, string $proxy, string
$costs['product_image'] = $cost;
$main_intervention_image = (object) [
'image' => $interventionImage,
'original_name' => pathinfo($data->image, PATHINFO_BASENAME),
];
return [
return (object) [
'url' => $data->url,
//'img' => $interventionImage,
'intervention' => $interventionImage,
'original_name' => pathinfo($data->image, PATHINFO_BASENAME),
'cost' => $cost,
];
}
@@ -335,7 +319,7 @@ private static function getProductImage(array $jsonLdData, string $proxy, string
return null;
}
private static function isMostlyTextBasedOnUniqueColors($interventionImage)
private static function getImageColorCounts($interventionImage)
{
// Use Intervention to manipulate the image
$img = clone $interventionImage;

View File

@@ -32,9 +32,6 @@ public static function handle(string $url, $directory, $postfix = null, $strip_h
$costs = [];
$main_intervention_image = null;
$intervention_images = [];
$unblocker_proxy_server = get_smartproxy_unblocker_server();
$rotating_proxy_server = get_smartproxy_rotating_server();
@@ -63,14 +60,13 @@ public static function handle(string $url, $directory, $postfix = null, $strip_h
])
->get($cached_url);
if ($response->successful()) {
//$costs['html_proxy'] = calculate_smartproxy_cost()
if ($response->successful())
{
$raw_html = $response->body();
// ... your logic here ...
$costs['unblocker'] = calculate_smartproxy_cost(round(strlen($raw_html) / 1024, 2), 'unblocker');
} else {
$raw_html = null;
$status_code = -3;
//throw new Exception('Http response failed');
$response->throw();
}
@@ -114,29 +110,17 @@ public static function handle(string $url, $directory, $postfix = null, $strip_h
if (! is_null($raw_html)) {
$raw_html = self::minifyAndCleanHtml($raw_html);
//$raw_html = self::minifyAndCleanHtml($raw_html);
$jsonld = self::getJsonLd($raw_html);
if ($parse_images) {
$images = self::getImages($raw_html);
$images = self::filterImages($images, $rotating_proxy_server, $user_agent, $costs, $intervention_images);
} else {
$images = [];
}
$main_image = self::getProductImage($jsonld, $rotating_proxy_server, $user_agent, $costs, $main_intervention_image);
return (object) [
'intervention' => (object) compact('main_intervention_image', 'intervention_images'),
'response' => (object) [
'url' => $url,
'postfix' => $postfix,
'filename' => $disk_url,
'raw_html' => $raw_html,
'jsonld' => $jsonld,
'main_image' => $main_image,
'images' => $images,
'status_code' => $status_code,
'costs' => $costs,
'total_cost' => array_sum(array_values($costs)),
@@ -151,8 +135,6 @@ public static function handle(string $url, $directory, $postfix = null, $strip_h
'filename' => null,
'raw_html' => null,
'jsonld' => [],
'main_image' => null,
'images' => [],
'status_code' => $status_code,
'costs' => $costs,
'total_cost' => 0,
@@ -184,252 +166,9 @@ private static function getJsonLd(string $raw_html)
return $contents;
}
/**
 * Collect the `src` and `alt` attributes of every <img> element in the HTML.
 *
 * Images whose `src` contains any entry of the (currently empty) domain
 * blacklist are skipped, as are <img> elements without a usable `src`.
 *
 * @param string $raw_html HTML document or fragment to scan.
 *
 * @return array List of ['src' => string, 'alt' => string|null] entries in document order.
 */
private static function getImages(string $raw_html)
{
    $crawler = new Crawler($raw_html);

    // Substrings of image URLs that must be ignored; empty means "accept everything".
    $blacklist_domain = [];

    $images = [];

    $crawler->filter('img')->each(function ($node) use (&$images, $blacklist_domain) {
        $src = $node->attr('src');

        // Nothing can be downloaded without a source URL.
        if ($src === null || $src === '') {
            return;
        }

        // Reject on the first blacklist hit. The image is appended once, after the
        // loop, so an empty blacklist no longer results in zero images and several
        // blacklist entries no longer produce duplicates.
        foreach ($blacklist_domain as $blacklist) {
            if (str_contains($src, $blacklist)) {
                return;
            }
        }

        $images[] = [
            'src' => $src,
            'alt' => $node->attr('alt'), // null when the attribute is absent
        ];
    });

    return $images;
}
/**
 * Download each candidate image, keep the usable ones, and return them sorted by size.
 *
 * Per image: fetch it through the proxy, skip failed responses and images under
 * 300px on either side, resize to 800px wide, drop near-duplicates (same resized
 * dimensions and mime, download size within 30 KB), then record a color count and
 * the proxy cost. Afterwards, images whose color count is at or below 10% of the
 * median are removed and the rest are sorted by download size, largest first.
 *
 * @param array  $images              Entries with at least a 'src' key.
 * @param string $proxy               Proxy passed to the HTTP client's 'proxy' option.
 * @param string $user_agent          Value sent as the User-Agent header.
 * @param array  $costs               By reference; receives a 'count-N' entry per kept image.
 * @param array  $intervention_images By reference; receives an object {image, original_name} per kept image.
 *
 * @return array The surviving $images entries, each extended with width, height, mime, sizeKb and color_counts.
 */
private static function filterImages(array $images, string $proxy, string $user_agent, &$costs, &$intervention_images)
{
$filteredImages = [];
$uniqueAttributes = []; // This array will track unique width, height, mime, and size combinations
// 1-based position of the current candidate; used to build the 'count-N' cost key
$count = 0;
foreach ($images as $image) {
$count++;
$src = $image['src'];
try {
// NOTE(review): 'verify' => false presumably disables TLS certificate verification — confirm this is intended
$response = Http::withOptions(['proxy' => $proxy, 'verify' => false])->withHeaders(['User-Agent' => $user_agent])->get($src);
// Check if the request was successful
if (! $response->successful()) {
continue;
}
$imageData = $response->body();
// Create an Intervention Image instance from the response data
$interventionImage = Image::make($imageData);
$width = $interventionImage->width();
$height = $interventionImage->height();
$mime = $interventionImage->mime();
// Size of the downloaded payload in KB (measured before any resizing)
$sizeKb = round(strlen($imageData) / 1024, 2);
// Skip images smaller than 300px on either side
if ($width < 300 || $height < 300) {
continue;
}
// Resize to 800px wide, keeping the aspect ratio
// NOTE(review): no upsize constraint is set, so images narrower than 800px are presumably enlarged — confirm
$interventionImage->resize(800, null, function ($constraint) {
$constraint->aspectRatio();
});
// Re-read the dimensions: from here on they describe the resized image
$width = $interventionImage->width();
$height = $interventionImage->height();
$mime = $interventionImage->mime();
$image['width'] = $width;
$image['height'] = $height;
$image['mime'] = $mime;
$image['sizeKb'] = $sizeKb;
// Check for duplicates by searching through uniqueAttributes
$isDuplicate = false;
foreach ($uniqueAttributes as $attr) {
if (
$attr['width'] == $width &&
$attr['height'] == $height &&
$attr['mime'] == $mime &&
abs($attr['sizeKb'] - $sizeKb) <= 30 // Download sizes within +/- 30 KB count as the same image
) {
$isDuplicate = true;
break;
}
}
if (! $isDuplicate) {
$uniqueAttributes[] = [
'width' => $width,
'height' => $height,
'mime' => $mime,
'sizeKb' => $sizeKb,
];
// Despite the helper's name, the value stored here is a number of unique colors, not a boolean
$image['color_counts'] = self::isMostlyTextBasedOnUniqueColors($interventionImage);
//$image['img'] = $interventionImage;
// Cost is recorded only for images that are kept, not for every download attempted above
$costs['count-'.$count] = calculate_smartproxy_cost($sizeKb, 'rotating_global');
$filteredImages[] = $image;
$intervention_images[] = (object) [
'image' => $interventionImage,
'original_name' => pathinfo($src, PATHINFO_BASENAME),
];
}
} catch (\Exception $e) {
// Best effort: any exception raised while fetching or decoding this image just skips it
continue;
}
}
// Collect all the color counts
$colorCounts = [];
foreach ($filteredImages as $image) {
$colorCounts[] = $image['color_counts'];
}
if (! empty($colorCounts)) {
// Compute the median of the color counts
sort($colorCounts);
// $count is reused here for the number of kept images (the loop counter is no longer needed)
$count = count($colorCounts);
$middleIndex = floor($count / 2);
$median = $count % 2 === 0 ? ($colorCounts[$middleIndex - 1] + $colorCounts[$middleIndex]) / 2 : $colorCounts[$middleIndex];
// Use the median to filter out the low outliers
$threshold = 0.10 * $median; // Adjust this percentage as needed
// Note: entries appended to $intervention_images above are not removed by this filter
$filteredImages = array_filter($filteredImages, function ($image) use ($threshold) {
return $image['color_counts'] > $threshold;
});
} else {
// No images found
$filteredImages = []; // Clear the array or take any other appropriate action
}
// usort also reindexes the array, so the key gaps left by array_filter disappear
usort($filteredImages, function ($a, $b) {
return $b['sizeKb'] <=> $a['sizeKb']; // Using the spaceship operator to sort in descending order
});
return $filteredImages;
}
// private static function isImageMostlyText($imageData, $mime) {
// try {
// $text = (new TesseractOCR)->imageData($imageData, $mime)->run();
// $textLength = strlen($text);
// // This is a basic check. Adjust the threshold as needed.
// return $textLength > 50;
// } catch (\Exception $e) {
// // Handle any exceptions related to Tesseract OCR
// return false;
// }
// }
/**
 * Download the image of the first usable JSON-LD Product entry and prepare it as the main image.
 *
 * An entry is usable when its "@type" is "Product" and both `url` and `image`
 * are set. Entries whose download does not succeed are skipped; an exception
 * during download or decoding aborts the search and yields null.
 *
 * @param array       $jsonLdData              Decoded JSON-LD entries (objects).
 * @param string      $proxy                   Proxy passed to the HTTP client's 'proxy' option.
 * @param string      $user_agent              Value sent as the User-Agent header.
 * @param array       $costs                   By reference; receives the 'product_image' cost.
 * @param object|null $main_intervention_image By reference; receives an object {image, original_name}.
 *
 * @return array|null ['url' => ..., 'cost' => ...] on success, null otherwise.
 */
private static function getProductImage(array $jsonLdData, string $proxy, string $user_agent, &$costs, &$main_intervention_image)
{
    foreach ($jsonLdData as $data) {
        // Ensure the type is "Product" and that the fields we need are present
        $isProduct = isset($data->{'@type'}) && $data->{'@type'} === 'Product';
        if (! $isProduct || ! isset($data->url) || ! isset($data->image)) {
            continue;
        }

        try {
            $response = Http::withOptions(['proxy' => $proxy, 'verify' => false])
                ->withHeaders(['User-Agent' => $user_agent])
                ->get($data->image);

            // An unsuccessful response is not fatal: try the next Product entry
            if (! $response->successful()) {
                continue;
            }

            $imageData = $response->body();

            // Proxy traffic is billed on what was downloaded, so measure the raw
            // payload (as filterImages does) rather than the re-encoded, cropped image.
            $sizeInKb = strlen($imageData) / 1024; // Convert bytes to kilobytes

            // Create an Intervention Image instance from the response data
            $interventionImage = Image::make($imageData);

            // Crop/resize towards 1920x1080.
            // NOTE(review): the upsize() constraint presumably prevents enlarging smaller images — confirm against the Intervention Image docs
            $interventionImage->fit(1920, 1080, function ($constraint) {
                $constraint->upsize();
                $constraint->aspectRatio();
            });

            $cost = calculate_smartproxy_cost($sizeInKb, 'rotating_global');
            $costs['product_image'] = $cost;

            $main_intervention_image = (object) [
                'image' => $interventionImage,
                'original_name' => pathinfo($data->image, PATHINFO_BASENAME),
            ];

            return [
                'url' => $data->url,
                //'img' => $interventionImage,
                'cost' => $cost,
            ];
        } catch (\Exception $e) {
            // Download or decoding failed: give up instead of trying further entries
            return null;
        }
    }

    return null;
}
/**
 * Count the distinct pixel values of a shrunken, blurred copy of the image.
 *
 * Despite the name, this returns a count and not a boolean; callers compare
 * the counts against each other to spot low-color images.
 * The passed image is not modified: all work happens on a clone.
 *
 * @param mixed $interventionImage Intervention Image instance (must support clone, resize, blur, encode).
 *
 * @return int Number of distinct values returned by imagecolorat() across all pixels.
 *
 * @throws \RuntimeException When the encoded image cannot be decoded by GD.
 */
private static function isMostlyTextBasedOnUniqueColors($interventionImage)
{
    // Work on a copy so the caller's image keeps its size and sharpness
    $img = clone $interventionImage;

    // Resize to a smaller dimension for faster processing (maintaining aspect ratio)
    $img->resize(200, null, function ($constraint) {
        $constraint->aspectRatio();
    });

    // Blur so that noise and compression artefacts do not inflate the count
    $img->blur(10);

    $im = imagecreatefromstring((string) $img->encode());

    // imagecreatefromstring() returns false on undecodable data. Passing that to
    // imagesx() raises a TypeError, which `catch (\Exception)` in callers does not
    // catch, so raise a regular exception instead.
    if ($im === false) {
        throw new \RuntimeException('Unable to decode image data for color counting.');
    }

    $width = imagesx($im);
    $height = imagesy($im);

    $uniqueColors = [];

    for ($x = 0; $x < $width; $x++) {
        for ($y = 0; $y < $height; $y++) {
            // Use the pixel value as an array key: duplicates collapse automatically
            $uniqueColors[imagecolorat($im, $x, $y)] = true;
        }
    }

    imagedestroy($im);

    // No threshold is applied here; deciding what counts as "mostly text" is left to the caller.
    return count($uniqueColors);
}
private static function minifyAndCleanHtml(string $raw_html)
{
$raw_html = TinyMinify::html($raw_html);
$raw_html = self::minifyHTML($raw_html);
$crawler = new Crawler($raw_html);
@@ -456,6 +195,16 @@ private static function minifyAndCleanHtml(string $raw_html)
return $crawler->html();
}
/**
 * Strip whitespace between adjacent tags and remove HTML comments.
 *
 * Whitespace is collapsed first, then comments are removed. If PCRE fails on
 * either step (preg_replace() returns null), that step is skipped and the
 * string from the previous step is kept, so callers never receive null for
 * valid input.
 *
 * @param string $input HTML markup.
 *
 * @return string The minified markup.
 */
private static function minifyHTML($input) {
    // Remove extra white space between HTML tags
    $withoutGaps = preg_replace('/>\s+</', '><', $input) ?? $input;

    // Remove comments. `.` with the `s` flag matches newlines too; the previous
    // `(.|\s)*?` alternation matched the same text but could exhaust PCRE's
    // backtrack/JIT limits on long comments.
    $withoutComments = preg_replace('/<!--.*?-->/s', '', $withoutGaps) ?? $withoutGaps;

    return $withoutComments;
}
private static function getGoogleCachedUrl(string $url, $stripHtml = false)
{
$url = self::stripUrlQueryParameters($url);