Add (ai gen)

This commit is contained in:
2023-10-01 04:17:49 +08:00
parent 5fcfa75d97
commit 5b4a02778e
7 changed files with 191 additions and 84 deletions

View File

@@ -17,6 +17,7 @@
use LaravelFreelancerNL\LaravelIndexNow\Facades\IndexNow;
use LaravelGoogleIndexing;
use Masterminds\HTML5;
use Symfony\Component\DomCrawler\Crawler;
class GenerateShopeeAIArticleTask
{
@@ -48,6 +49,8 @@ public static function handle(ShopeeSellerScrape $shopee_seller_scrape)
if (is_null($ai_writeup)) {
$ai_output = OpenAI::writeProductArticle($excerpt, $photos);
//dd($ai_output);
if (is_null($ai_output)) {
$e = new Exception('Failed to write: Missing ai_output');
@@ -140,41 +143,58 @@ private static function getTotalServiceCost($shopee_task)
/**
 * Reduce a raw HTML document to a single plain-text string.
 *
 * Primary path: run the markup through Readability, normalise whitespace,
 * strip any remaining tags, and prefix the result with the page's <title>
 * text and its meta description (each followed by one space).
 *
 * Fallback path: if Readability or the crawler throws, re-serialise the
 * whole document with the HTML5 parser and return it with tags stripped.
 *
 * @param  string  $raw_html  Full HTML source of the page.
 * @return string  Tag-free text; "<title> <description> <content>" on the primary path.
 */
private static function stripHtml(string $raw_html): string
{
    try {
        $r_configuration = new ReadabilityConfiguration();
        $r_configuration->setWordThreshold(20);

        $readability = new Readability($r_configuration);
        $readability->parse($raw_html);

        $article_text = (string) $readability->getContent();

        // Tabs are removed outright (not turned into spaces); every remaining
        // whitespace run, newlines and carriage returns included, collapses to one space.
        $article_text = str_replace("\t", '', $article_text);
        // preg_replace() returns null on a PCRE error; keep the uncollapsed text in that case.
        $article_text = preg_replace('/\s+/', ' ', $article_text) ?? $article_text;
        $article_text = strip_tags(trim($article_text));

        $crawler = new Crawler($raw_html);

        // Guard the lookup the same way as the description below: calling text()
        // on an empty node list throws, which would discard the Readability result.
        $title_node = $crawler->filter('title');
        $title = $title_node->count() > 0 ? $title_node->text() : '';

        $description_node = $crawler->filter('meta[name="description"]');
        $description = $description_node->count() > 0 ? (string) $description_node->attr('content') : '';

        return $title.' '.$description.' '.$article_text;
    } catch (ReadabilityParseException|Exception $e) {
        $html5 = new HTML5(['preserveWhiteSpace' => true]);

        // Parse the HTML into a DOM tree, serialise it back, and drop the tags.
        $dom = $html5->loadHTML($raw_html);

        return strip_tags($html5->saveHTML($dom));
    }
}

View File

@@ -19,7 +19,9 @@ public static function handle($shopee_task)
$main_image_url = null;
$proxy_server = get_smartproxy_server();
$unblocker_proxy_server = get_smartproxy_unblocker_server();
$rotating_proxy_server = get_smartproxy_rotating_server();
$user_agent = config('platform.proxy.user_agent');
///////// PART 1
@@ -36,7 +38,7 @@ public static function handle($shopee_task)
$intervention_images = $shopee_task->product_task->intervention->intervention_images;
} else {
$images = self::getImages($shopee_task->product_task->response->raw_html);
$images = self::filterImages($images, $proxy_server, $user_agent, $costs, $intervention_images);
$images = self::filterImages($images, $rotating_proxy_server, $user_agent, $costs, $intervention_images);
}
///////// PART 2
@@ -54,7 +56,7 @@ public static function handle($shopee_task)
$scraped_image = ShopeeSellerScrapedImage::where('original_name', pathinfo($main_image_url, PATHINFO_BASENAME))->where('shopee_seller_scrape_id', $shopee_task->shopee_seller_scrape->id)->first();
if (is_null($scraped_image)) {
$main_image = self::getProductImage($shopee_task->product_task->response->jsonld, $proxy_server, $user_agent, $costs, $main_intervention_image);
$main_image = self::getProductImage($shopee_task->product_task->response->jsonld, $rotating_proxy_server, $user_agent, $costs, $main_intervention_image);
$scraped_image = self::uploadAndSaveScrapedImage($shopee_task->shopee_seller_scrape, $main_intervention_image, true);
}
@@ -137,16 +139,18 @@ private static function getImages(string $raw_html)
// Collect the src/alt pair of every <img> element, skipping any image whose
// src contains a blacklisted domain fragment.
$crawler->filter('img')->each(function ($node) use (&$images) {
    $src = $node->attr('src');
    $alt = $node->attr('alt'); // null when the attribute is absent

    // Substrings of image URLs that must never be collected.
    // An empty list excludes nothing.
    $blacklist_domain = [];

    foreach ($blacklist_domain as $blacklist) {
        // src may be null for an <img> without a src attribute; cast for the comparison only.
        if (str_contains((string) $src, $blacklist)) {
            return; // one match is enough to reject the image
        }
    }

    // Appended exactly once, after every blacklist entry has been checked.
    $images[] = [
        'src' => $src,
        'alt' => $alt,
    ];
});
return $images;
}
@@ -164,7 +168,7 @@ private static function filterImages(array $images, string $proxy, string $user_
$src = $image['src'];
try {
$response = Http::withOptions(['proxy' => $proxy])->withHeaders(['User-Agent' => $user_agent])->get($src);
$response = Http::withOptions(['proxy' => $proxy, 'verify' => false])->withHeaders(['User-Agent' => $user_agent])->get($src);
// Check if the request was successful
if (! $response->successful()) {
@@ -274,7 +278,7 @@ private static function getProductImage(array $jsonLdData, string $proxy, string
if (isset($data->{'@type'}) && $data->{'@type'} === 'Product') {
if (isset($data->url) && isset($data->image)) {
try {
$response = Http::withOptions(['proxy' => $proxy])->withHeaders(['User-Agent' => $user_agent])->get($data->image);
$response = Http::withOptions(['proxy' => $proxy, 'verify' => false])->withHeaders(['User-Agent' => $user_agent])->get($data->image);
// Check if the request was successful
if ($response->successful()) {

View File

@@ -41,7 +41,7 @@ public static function handle(string $seller, string $country_iso, Category $cat
//dd($seller_shop_task);
if (isset($seller_shop_task->response->jsonld)) {
$top_rank_products = self::getSortedData($seller_shop_task->response->jsonld, 100);
$top_rank_products = self::getSortedData($seller_shop_task->response->jsonld, 400);
if (count($top_rank_products) > 0) {

View File

@@ -18,7 +18,7 @@ public static function handle(string $url, $directory, $postfix = null, $strip_h
{
$slug = str_slug($url);
$cached_url = self::getGoogleCachedUrl($url, false);
$cached_url = $url; // self::getGoogleCachedUrl($url, false);
$postfix = strval($postfix);
@@ -35,7 +35,8 @@ public static function handle(string $url, $directory, $postfix = null, $strip_h
$main_intervention_image = null;
$intervention_images = [];
$proxy_server = get_smartproxy_server();
$unblocker_proxy_server = get_smartproxy_unblocker_server();
$rotating_proxy_server = get_smartproxy_rotating_server();
try {
$raw_html = OSSUploader::readFile($driver, $directory, $filename);
@@ -51,26 +52,47 @@ public static function handle(string $url, $directory, $postfix = null, $strip_h
if (is_null($raw_html)) {
try {
$browsershot = new Browsershot();
$browsershot->setUrl($cached_url)
->setOption('args', ['headless: "new"'])
->noSandbox()
->setOption('args', ['--disable-web-security'])
->userAgent($user_agent)
->ignoreHttpsErrors()
->preventUnsuccessfulResponse()
->timeout(10)
//->setProxyServer($proxy_server)
->userAgent($user_agent);
$response = Http::withHeaders([
'User-Agent' => $user_agent,
])
->withOptions([
'proxy' => $unblocker_proxy_server,
'timeout' => 1000,
'verify' => false,
])
->get($cached_url);
if (app()->environment() == 'local') {
$browsershot->setNodeBinary(config('platform.general.node_binary'))->setNpmBinary(config('platform.general.npm_binary'));
if ($response->successful()) {
$raw_html = $response->body();
// ... your logic here ...
} else {
$raw_html = null;
$status_code = -3;
//throw new Exception('Http response failed');
$response->throw();
}
//dump($browsershot);
// $browsershot = new Browsershot();
$raw_html = $browsershot->bodyHtml();
// $browsershot->setUrl($cached_url)
// ->setOption('args', ['headless: "new"'])
// ->noSandbox()
// ->setOption('args', ['--disable-web-security'])
// ->userAgent($user_agent)
// ->ignoreHttpsErrors()
// ->preventUnsuccessfulResponse()
// ->timeout(10)
// ->setProxyServer($proxy_server)
// ->userAgent($user_agent);
// if (app()->environment() == 'local') {
// $browsershot->setNodeBinary(config('platform.general.node_binary'))->setNpmBinary(config('platform.general.npm_binary'));
// }
// //dump($browsershot);
// $raw_html = $browsershot->bodyHtml();
// $sizeInKb = strlen($raw_html) / 1024; // Convert bytes to kilobytes
// $browsershot_cost = round(calculate_smartproxy_cost($sizeInKb)) ;
@@ -97,12 +119,12 @@ public static function handle(string $url, $directory, $postfix = null, $strip_h
if ($parse_images) {
$images = self::getImages($raw_html);
$images = self::filterImages($images, $proxy_server, $user_agent, $costs, $intervention_images);
$images = self::filterImages($images, $rotating_proxy_server, $user_agent, $costs, $intervention_images);
} else {
$images = [];
}
$main_image = self::getProductImage($jsonld, $proxy_server, $user_agent, $costs, $main_intervention_image);
$main_image = self::getProductImage($jsonld, $rotating_proxy_server, $user_agent, $costs, $main_intervention_image);
return (object) [
'intervention' => (object) compact('main_intervention_image', 'intervention_images'),
@@ -169,16 +191,20 @@ private static function getImages(string $raw_html)
// Collect the src/alt pair of every <img> element, skipping any image whose
// src contains a blacklisted domain fragment.
$crawler->filter('img')->each(function ($node) use (&$images) {
    $src = $node->attr('src');
    $alt = $node->attr('alt'); // null when the attribute is absent

    // Substrings of image URLs that must never be collected.
    // An empty list excludes nothing.
    $blacklist_domain = [];

    foreach ($blacklist_domain as $blacklist) {
        // src may be null for an <img> without a src attribute; cast for the comparison only.
        if (str_contains((string) $src, $blacklist)) {
            return; // one match is enough to reject the image
        }
    }

    // Appended exactly once, after every blacklist entry has been checked.
    $images[] = [
        'src' => $src,
        'alt' => $alt,
    ];
});
// if (count($images) > 4)
// {
// return $images;
// }
//dd($images);
return $images;
}
@@ -196,7 +222,8 @@ private static function filterImages(array $images, string $proxy, string $user_
$src = $image['src'];
try {
$response = Http::withOptions(['proxy' => $proxy])->withHeaders(['User-Agent' => $user_agent])->get($src);
$response = Http::withOptions(['proxy' => $proxy, 'verify' => false])->withHeaders(['User-Agent' => $user_agent])->get($src);
// Check if the request was successful
if (! $response->successful()) {
@@ -216,7 +243,7 @@ private static function filterImages(array $images, string $proxy, string $user_
$sizeKb = round(strlen($imageData) / 1024, 2);
// Check constraints
if ($width < 800 || $height < 800 || $sizeKb < 100 || $mime !== 'image/jpeg') {
if ($width < 800 || $height < 800 || $sizeKb < 100) {
continue;
}
$image['width'] = $width;
@@ -268,17 +295,22 @@ private static function filterImages(array $images, string $proxy, string $user_
$colorCounts[] = $image['color_counts'];
}
// Compute the median of the color counts
sort($colorCounts);
$count = count($colorCounts);
$middleIndex = floor($count / 2);
$median = $count % 2 === 0 ? ($colorCounts[$middleIndex - 1] + $colorCounts[$middleIndex]) / 2 : $colorCounts[$middleIndex];
if (! empty($colorCounts)) {
// Compute the median of the color counts
sort($colorCounts);
$count = count($colorCounts);
$middleIndex = floor($count / 2);
$median = $count % 2 === 0 ? ($colorCounts[$middleIndex - 1] + $colorCounts[$middleIndex]) / 2 : $colorCounts[$middleIndex];
// Use the median to filter out the low outliers
$threshold = 0.10 * $median; // Adjust this percentage as needed
$filteredImages = array_filter($filteredImages, function ($image) use ($threshold) {
return $image['color_counts'] > $threshold;
});
// Use the median to filter out the low outliers
$threshold = 0.10 * $median; // Adjust this percentage as needed
$filteredImages = array_filter($filteredImages, function ($image) use ($threshold) {
return $image['color_counts'] > $threshold;
});
} else {
// No images found
$filteredImages = []; // Clear the array or take any other appropriate action
}
usort($filteredImages, function ($a, $b) {
return $b['sizeKb'] <=> $a['sizeKb']; // Using the spaceship operator to sort in descending order
@@ -307,7 +339,7 @@ private static function getProductImage(array $jsonLdData, string $proxy, string
if (isset($data->{'@type'}) && $data->{'@type'} === 'Product') {
if (isset($data->url) && isset($data->image)) {
try {
$response = Http::withOptions(['proxy' => $proxy])->withHeaders(['User-Agent' => $user_agent])->get($data->image);
$response = Http::withOptions(['proxy' => $proxy, 'verify' => false])->withHeaders(['User-Agent' => $user_agent])->get($data->image);
// Check if the request was successful
if ($response->successful()) {