This commit is contained in:
2023-11-26 18:56:40 +08:00
parent be14f5fdb1
commit 64431e7a73
144 changed files with 497072 additions and 3730 deletions

View File

@@ -0,0 +1,37 @@
<?php
namespace App\Jobs;
use App\Jobs\Tasks\GetAIToolScreenshotTask;
use Illuminate\Bus\Queueable;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Bus\Dispatchable;
use Illuminate\Queue\InteractsWithQueue;
use Illuminate\Queue\SerializesModels;
/**
 * Queued job that forwards a crawled-URL id and an AI-tool id to
 * GetAIToolScreenshotTask.
 */
class GetAIToolScreenshotJob implements ShouldQueue
{
    use Dispatchable;
    use InteractsWithQueue;
    use Queueable;
    use SerializesModels;

    /**
     * Capture the identifiers the task will be called with.
     *
     * @param  mixed  $url_to_crawl_id  Passed through as the task's first argument.
     * @param  mixed  $ai_tool_id  Passed through as the task's second argument.
     */
    public function __construct(
        protected $url_to_crawl_id,
        protected $ai_tool_id,
    ) {
    }

    /**
     * Run the screenshot task for the stored identifiers.
     */
    public function handle(): void
    {
        GetAIToolScreenshotTask::handle($this->url_to_crawl_id, $this->ai_tool_id);
    }
}

View File

@@ -0,0 +1,35 @@
<?php
namespace App\Jobs;
use App\Jobs\Tasks\GetUrlBodyTask;
use Illuminate\Bus\Queueable;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Bus\Dispatchable;
use Illuminate\Queue\InteractsWithQueue;
use Illuminate\Queue\SerializesModels;
/**
 * Queued job that forwards a crawled-URL id to GetUrlBodyTask.
 */
class GetUrlBodyJob implements ShouldQueue
{
    use Dispatchable;
    use InteractsWithQueue;
    use Queueable;
    use SerializesModels;

    /** Job timeout picked up by the Laravel queue worker (seconds). */
    public $timeout = 60;

    /**
     * Capture the identifier the task will be called with.
     *
     * @param  mixed  $url_to_crawl_id  Passed through unchanged to GetUrlBodyTask::handle().
     */
    public function __construct(protected $url_to_crawl_id)
    {
    }

    /**
     * Run the fetch task for the stored identifier.
     */
    public function handle(): void
    {
        GetUrlBodyTask::handle($this->url_to_crawl_id);
    }
}

View File

@@ -0,0 +1,35 @@
<?php
namespace App\Jobs;
use App\Jobs\Tasks\ParseUrlBodyTask;
use Illuminate\Bus\Queueable;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Bus\Dispatchable;
use Illuminate\Queue\InteractsWithQueue;
use Illuminate\Queue\SerializesModels;
/**
 * Queued job that forwards a crawled-URL id to ParseUrlBodyTask.
 */
class ParseUrlBodyJob implements ShouldQueue
{
    use Dispatchable;
    use InteractsWithQueue;
    use Queueable;
    use SerializesModels;

    /** Job timeout picked up by the Laravel queue worker (seconds). */
    public $timeout = 60;

    /**
     * Capture the identifier the task will be called with.
     *
     * @param  mixed  $url_to_crawl_id  Passed through unchanged to ParseUrlBodyTask::handle().
     */
    public function __construct(protected $url_to_crawl_id)
    {
    }

    /**
     * Run the parse task for the stored identifier.
     */
    public function handle(): void
    {
        ParseUrlBodyTask::handle($this->url_to_crawl_id);
    }
}

View File

@@ -1,55 +0,0 @@
<?php
namespace App\Jobs;
use App\Jobs\Tasks\GenerateShopeeAIArticleTask;
use App\Jobs\Tasks\SaveShopeeSellerImagesTask;
use App\Jobs\Tasks\ShopeeSellerTopProductScraperTask;
use App\Models\Category;
use Illuminate\Bus\Queueable;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Bus\Dispatchable;
use Illuminate\Queue\InteractsWithQueue;
use Illuminate\Queue\SerializesModels;
/**
 * Queued job chaining three tasks: scrape a seller's top product, store its
 * images, then generate the AI article.
 */
class ShopeeSellerTopProductScraperJob implements ShouldQueue
{
    use Dispatchable;
    use InteractsWithQueue;
    use Queueable;
    use SerializesModels;

    /** Job timeout picked up by the Laravel queue worker (seconds). */
    public $timeout = 1000;

    protected $seller;

    protected $country_iso;

    protected $category;

    /**
     * Capture the scrape parameters.
     *
     * @param  string  $seller  Forwarded to the scraper task as its first argument.
     * @param  string  $country_iso  Forwarded to the scraper task as its second argument.
     * @param  Category  $category  Forwarded to the scraper task as its third argument.
     */
    public function __construct(string $seller, string $country_iso, Category $category)
    {
        $this->category = $category;
        $this->country_iso = $country_iso;
        $this->seller = $seller;
    }

    /**
     * Run the scraper; the follow-up tasks only run when it returns a result.
     */
    public function handle(): void
    {
        $scrape_result = ShopeeSellerTopProductScraperTask::handle(
            $this->seller,
            $this->country_iso,
            $this->category
        );

        // A null result means there is nothing to store or write about.
        if (is_null($scrape_result)) {
            return;
        }

        SaveShopeeSellerImagesTask::handle($scrape_result);
        GenerateShopeeAIArticleTask::handle($scrape_result->shopee_seller_scrape);
    }
}

View File

@@ -0,0 +1,65 @@
<?php
namespace App\Jobs;
use App\Helpers\FirstParty\Aictio\Aictio;
use App\Models\SearchEmbedding;
use Exception;
use Illuminate\Bus\Queueable;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Bus\Dispatchable;
use Illuminate\Queue\InteractsWithQueue;
use Illuminate\Queue\SerializesModels;
/**
 * Queued job that stores a vector embedding for a search query, keyed by
 * (type, category_id, ai_tool_id, query).
 */
class StoreSearchEmbeddingJob implements ShouldQueue
{
    use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;

    protected $type;

    protected $category_id;

    protected $ai_tool_id;

    protected $query;

    /**
     * Create a new job instance.
     *
     * @param  mixed  $type  Value stored in the `type` column.
     * @param  mixed  $category_id  Value stored in the `category_id` column.
     * @param  mixed  $ai_tool_id  Value stored in the `ai_tool_id` column.
     * @param  mixed  $query  Text to embed; stored verbatim in the `query` column.
     */
    public function __construct($type, $category_id, $ai_tool_id, $query)
    {
        $this->type = $type;
        $this->category_id = $category_id;
        $this->ai_tool_id = $ai_tool_id;
        $this->query = $query;
    }

    /**
     * Execute the job.
     *
     * Does nothing when a matching row already exists. Otherwise requests an
     * embedding for the lower-cased query and saves a new SearchEmbedding.
     *
     * @throws Exception When no embedding is returned for the query.
     */
    public function handle(): void
    {
        // Look for an existing row BEFORE calling the embedding service, so a
        // duplicate dispatch does not trigger a remote call whose result is discarded.
        $existing = SearchEmbedding::where('type', $this->type)
            ->where('category_id', $this->category_id)
            ->where('ai_tool_id', $this->ai_tool_id)
            ->where('query', $this->query)
            ->first();

        if (! is_null($existing)) {
            return;
        }

        $embedding = Aictio::getVectorEmbedding(strtolower($this->query));

        if (is_null($embedding)) {
            throw new Exception('Failed vector embedding: '.$this->query);
        }

        $search_embedding = new SearchEmbedding;
        $search_embedding->type = $this->type;
        $search_embedding->category_id = $this->category_id;
        $search_embedding->ai_tool_id = $this->ai_tool_id;
        $search_embedding->query = $this->query;
        $search_embedding->embedding = $embedding;
        $search_embedding->save();
    }
}

View File

@@ -1,278 +0,0 @@
<?php
namespace App\Jobs\Tasks;
use App\Helpers\FirstParty\OpenAI\OpenAI;
use App\Helpers\FirstParty\OSSUploader\OSSUploader;
use App\Models\AiWriteup;
use App\Models\Category;
use App\Models\Post;
use App\Models\PostCategory;
use App\Models\ShopeeSellerCategory;
use App\Models\ShopeeSellerScrape;
use App\Models\ShopeeSellerScrapedImage;
use Exception;
use fivefilters\Readability\Configuration as ReadabilityConfiguration;
use fivefilters\Readability\ParseException as ReadabilityParseException;
use fivefilters\Readability\Readability;
use Illuminate\Support\Facades\Log;
use LaravelFreelancerNL\LaravelIndexNow\Facades\IndexNow;
use LaravelGoogleIndexing;
use Masterminds\HTML5;
use Symfony\Component\DomCrawler\Crawler;
/**
 * Turns a stored Shopee seller scrape into an AI-written article and
 * publishes it as a Post.
 */
class GenerateShopeeAIArticleTask
{
    /**
     * Generate and publish an article for the given scrape.
     *
     * Reads the serialized scrape payload from storage, builds an excerpt from
     * the product HTML and pricing data, asks OpenAI for an article, then saves
     * an AiWriteup, a Post and the related category/seller bookkeeping rows.
     *
     * @param  ShopeeSellerScrape  $shopee_seller_scrape  Scrape record whose `filename` points at the stored payload.
     * @return Post|null The created post (null if the write-up could not be saved).
     *
     * @throws Exception When the payload is missing/unreadable, when OpenAI returns
     *                   nothing, or when a write-up for the product URL already exists.
     */
    public static function handle(ShopeeSellerScrape $shopee_seller_scrape)
    {
        $serialised = OSSUploader::readFile('r2', 'shopee/seller', $shopee_seller_scrape->filename);

        $post = null;

        $shopee_seller_scrape->load('category');

        // Fail with a clear message rather than continuing with an undefined $shopee_task.
        if (is_empty($serialised)) {
            throw new Exception('Failed to write: scraped Shopee task payload is empty');
        }

        // NOTE(review): unserialize() can instantiate arbitrary classes; this is only
        // safe while the bucket content is written exclusively by this application.
        $shopee_task = unserialize($serialised);

        if (! is_object($shopee_task)) {
            throw new Exception('Failed to write: scraped Shopee task payload could not be unserialized');
        }

        $shopee_task->shopee_seller_scrape = $shopee_seller_scrape;

        $raw_html = $shopee_task->product_task->response->raw_html;

        $excerpt = self::stripHtml($raw_html);

        // Keep the prompt within ~1500 bytes (plus ~1500 output tokens) because of the
        // OpenAI model limit. mb_strcut() honours the byte budget without splitting a
        // multibyte character, which substr() could do.
        $excerpt = mb_strcut($excerpt, 0, 1500, 'UTF-8');

        $excerpt .= self::getProductPricingExcerpt($shopee_task->product_task->response->jsonld);

        $photos = ShopeeSellerScrapedImage::where('shopee_seller_scrape_id', $shopee_seller_scrape->id)->where('featured', false)->orderByRaw('RAND()')->take(3)->get()->pluck('image')->toArray();

        $ai_writeup = AiWriteup::where('source', 'shopee')->where('source_url', $shopee_task->product_task->response->url)->first();

        if (is_null($ai_writeup)) {
            $categories = [
                'Beauty',
                'Technology',
                'Home & Living',
                'Health',
                'Fitness',
            ];

            $ai_output = OpenAI::writeProductArticle($excerpt, $photos, $categories);

            if (is_null($ai_output)) {
                $e = new Exception('Failed to write: Missing ai_output');
                Log::error(serialize($ai_writeup?->toArray()));
                inspector()->reportException($e);
                throw ($e);
            } else {
                // Prefer the category OpenAI picked (within the scrape's locale);
                // fall back to the scrape's own category.
                $picked_category = Category::where('name', $ai_output->category)->where('country_locale_id', $shopee_seller_scrape->category->country_locale_id)->first();

                if (is_null($picked_category)) {
                    $picked_category = $shopee_seller_scrape->category;
                }

                // save
                $ai_writeup = new AiWriteup;
                $ai_writeup->source = 'shopee';
                $ai_writeup->source_url = $shopee_task->product_task->response->url;
                $ai_writeup->category_id = $picked_category->id;
                $ai_writeup->title = $ai_output->title;
                $ai_writeup->excerpt = $ai_output->excerpt;
                $ai_writeup->featured_image = '';
                $ai_writeup->body = $ai_output->body;
                $ai_writeup->cost = self::getTotalServiceCost($shopee_task);
                $ai_writeup->editor_format = 'markdown';

                if ($ai_writeup->save()) {
                    $featured_photo = ShopeeSellerScrapedImage::where('shopee_seller_scrape_id', $shopee_seller_scrape->id)->where('featured', true)->first();

                    // new post
                    $post_data = [
                        'publish_date' => now(),
                        'title' => $ai_writeup->title,
                        'slug' => str_slug($ai_writeup->title),
                        'excerpt' => $ai_writeup->excerpt,
                        'cliffhanger' => $ai_writeup->cliffhanger,
                        'author_id' => 1,
                        'featured' => false,
                        // No featured photo may have been saved for this scrape.
                        'featured_image' => $featured_photo?->image,
                        'editor' => 'markdown',
                        'body' => $ai_writeup->body,
                        'post_format' => 'standard',
                        'status' => 'publish',
                    ];

                    $post = Post::create($post_data);

                    if (! is_null($post)) {
                        $shopee_seller_scrape->write_counts = $shopee_seller_scrape->write_counts + 1;
                        $shopee_seller_scrape->last_ai_written_at = now();
                        $shopee_seller_scrape->save();

                        $shopee_seller_category = ShopeeSellerCategory::where('seller', $shopee_seller_scrape->seller)->first();

                        if (is_null($shopee_seller_category)) {
                            $shopee_seller_category = new ShopeeSellerCategory;
                            $shopee_seller_category->seller = $shopee_seller_scrape->seller;
                            $shopee_seller_category->category_id = $shopee_seller_scrape->category_id;
                        }

                        $shopee_seller_category->last_ai_written_at = $shopee_seller_scrape->last_ai_written_at;
                        $shopee_seller_category->write_counts = $shopee_seller_scrape->write_counts;
                        $shopee_seller_category->save();

                        PostCategory::create([
                            'post_id' => $post->id,
                            'category_id' => $picked_category->id,
                        ]);

                        // Search-engine pings only happen for published posts in production.
                        if (app()->environment() == 'production') {
                            if ($post->status == 'publish') {
                                $post_url = route('home.country.post', ['country' => $post->post_category?->category?->country_locale_slug, 'post_slug' => $post->slug]);

                                LaravelGoogleIndexing::create()->update($post_url);
                                IndexNow::submit($post_url);
                            }
                        }
                    }
                }
            }
        } else {
            $e = new Exception('Failed to write: ai_writeup found');
            Log::error(serialize($ai_writeup?->toArray()));
            inspector()->reportException($e);
            throw ($e);
        }

        return $post;
    }

    /**
     * Build a one-sentence pricing blurb from the first JSON-LD "Product" entry.
     *
     * @param  array  $jsonLdData  Decoded JSON-LD entries (objects).
     * @return string The pricing sentence, or an empty string when no Product entry exists.
     */
    private static function getProductPricingExcerpt(array $jsonLdData)
    {
        foreach ($jsonLdData as $data) {
            // Ensure the type is "Product" before proceeding
            if (isset($data->{'@type'}) && $data->{'@type'} === 'Product') {
                // Extract necessary data
                $lowPrice = $data->offers->lowPrice ?? null;
                $highPrice = $data->offers->highPrice ?? null;
                $price = $data->offers->price ?? null;
                $currency = $data->offers->priceCurrency ?? null;
                $sellerName = $data->offers->seller->name ?? 'online store'; // default to "online store" if name is not set

                // Show Malaysian Ringgit with its local symbol.
                if (! is_empty($currency)) {
                    if ($currency == 'MYR') {
                        $currency = 'RM';
                    }
                }

                // Determine and format pricing sentence
                if ($lowPrice && $highPrice) {
                    $lowPrice = number_format($lowPrice, 0);
                    $highPrice = number_format($highPrice, 0);

                    return "Price Range from {$currency} {$lowPrice} to {$highPrice} in {$sellerName} online store";
                } elseif ($price) {
                    $price = number_format($price, 0);

                    return "Priced at {$currency} {$price} in {$sellerName} online store";
                } else {
                    return "Price not stated, refer to {$sellerName} online store";
                }
            }
        }

        // No Product entry: contribute nothing to the excerpt.
        return '';
    }

    /**
     * Sum the fixed writing cost and any scraping costs recorded on the task.
     *
     * @param  object|null  $shopee_task  Unserialized scrape task.
     * @return float Total cost.
     */
    private static function getTotalServiceCost($shopee_task)
    {
        $cost = 0.00;

        $cost += 0.09; // chatgpt-3.5-turbo $0.03 for 1k, writing for 2k tokens

        // Shopee Seller Scraping
        if (isset($shopee_task?->seller_shop_task?->response?->total_cost)) {
            $cost += $shopee_task?->seller_shop_task?->response?->total_cost;
        }

        // Shopee Product Scraping
        if (isset($shopee_task?->product_task?->response?->total_cost)) {
            $cost += $shopee_task?->product_task?->response?->total_cost;
        }

        return $cost;
    }

    /**
     * Reduce a product page to plain text: title, meta description, then the
     * Readability-extracted main content with whitespace collapsed.
     *
     * If Readability or the DOM crawler throws, falls back to the tag-stripped
     * text of the whole document.
     *
     * @param  string  $raw_html  Full HTML of the product page.
     * @return string Plain-text content.
     */
    private static function stripHtml(string $raw_html)
    {
        $html_content = '';

        try {
            $r_configuration = new ReadabilityConfiguration();
            $r_configuration->setCharThreshold(20);

            $readability = new Readability($r_configuration);
            $readability->parse($raw_html);

            $temp_html_content = $readability->getContent();

            // Remove tabs
            $temp_html_content = str_replace("\t", '', $temp_html_content);

            // Replace newlines with spaces
            $temp_html_content = str_replace(["\n", "\r\n"], ' ', $temp_html_content);

            // Replace multiple spaces with a single space
            $temp_html_content = preg_replace('/\s+/', ' ', $temp_html_content);

            // Remove leading/trailing whitespace, then drop the remaining markup
            $temp_html_content = trim($temp_html_content);
            $temp_html_content = strip_tags($temp_html_content);

            $crawler = new Crawler($raw_html);

            // Extract meta title (throws when the page has no <title>, which triggers the fallback)
            $title = $crawler->filter('title')->text();

            // Extract meta description
            $metaDescriptionNode = $crawler->filter('meta[name="description"]');
            $description = $metaDescriptionNode->count() > 0 ? $metaDescriptionNode->attr('content') : null;

            $html_content .= $title.' ';
            $html_content .= $description.' ';
            $html_content .= $temp_html_content;
        } catch (ReadabilityParseException|Exception $e) {
            $html5 = new HTML5(['preserveWhiteSpace' => true]);

            // Parse the HTML into a DOM tree.
            $dom = $html5->loadHTML($raw_html);

            // Serialize the DOM tree back to a string and strip the tags.
            $html_content = strip_tags($html5->saveHTML($dom));
        }

        return $html_content;
    }
}

View File

@@ -0,0 +1,87 @@
<?php
namespace App\Jobs\Tasks;
use App\Helpers\FirstParty\OSSUploader\OSSUploader;
use App\Models\AiTool;
use App\Models\BusinessProfile;
use App\Models\SerpUrl;
use App\Models\UrlToCrawl;
use Exception;
use Image;
use Spatie\Browsershot\Browsershot;
/**
 * Captures a browser screenshot of a crawled URL and stores it as the AI
 * tool's screenshot image.
 */
class GetAIToolScreenshotTask
{
    /**
     * Take, convert and upload a screenshot for the given tool.
     *
     * @param  mixed  $url_to_crawl_id  Primary key looked up via UrlToCrawl::find().
     * @param  mixed  $ai_tool_id  Primary key looked up via AiTool::find().
     * @return AiTool|null The tool record, or null when either record is missing.
     *
     * @throws Exception When the screenshot attempt(s) fail.
     */
    public static function handle($url_to_crawl_id, $ai_tool_id)
    {
        $url_to_crawl = UrlToCrawl::find($url_to_crawl_id);
        if ($url_to_crawl === null) {
            return;
        }

        $ai_tool = AiTool::find($ai_tool_id);
        if ($ai_tool === null) {
            return;
        }

        $userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36";

        // Desktop-like 16:9 viewport; wait for the network to go idle before capturing.
        $browsershot = Browsershot::url($url_to_crawl->url)
            ->timeout(30)
            ->emulateMedia('screen')
            ->userAgent($userAgent)
            ->windowSize(1024, 576)
            ->waitUntilNetworkIdle();

        // Local machines point Browsershot at explicitly configured node/npm binaries.
        if ('local' == app()->environment()) {
            $browsershot
                ->setNodeBinary(config('platform.general.node_binary'))
                ->setNpmBinary(config('platform.general.npm_binary'));
        }

        $maxRetries = 1;

        for ($retries = 0; $retries < $maxRetries;) {
            try {
                $image_content = $browsershot->screenshot();
                break; // first success ends the loop
            } catch (Exception $e) {
                $retries++;

                // Out of attempts: surface the last failure, keeping it as the previous exception.
                if ($retries === $maxRetries) {
                    throw new Exception("Failed to take a screenshot after $maxRetries attempts: ".$e->getMessage(), 0, $e);
                }
            }
        }

        // Normalise to 1024px wide (aspect ratio kept) and encode as WebP at quality 80.
        $keepAspectRatio = fn ($constraint) => $constraint->aspectRatio();
        $image = Image::make($image_content)
            ->resize(1024, null, $keepAspectRatio)
            ->stream('webp', 80);

        $image_file_name = str_slug($ai_tool->tool_name).'-'.epoch_now_timestamp().'.webp';

        $upload_status = OSSUploader::uploadFile(
            config('platform.uploads.ai_tools.screenshot.driver'),
            config('platform.uploads.ai_tools.screenshot.path'),
            $image_file_name,
            $image
        );

        // Only record the file name once the upload reported success.
        if ($upload_status) {
            $ai_tool->screenshot_img = $image_file_name;
        }

        if ($ai_tool->isDirty()) {
            $ai_tool->save();
        }

        return $ai_tool;
    }
}

View File

@@ -0,0 +1,204 @@
<?php
namespace App\Jobs\Tasks;
use App\Jobs\ParseUrlBodyJob;
use App\Models\ServiceCostUsage;
use App\Models\UrlToCrawl;
use Exception;
use fivefilters\Readability\Configuration as ReadabilityConfiguration;
use fivefilters\Readability\ParseException as ReadabilityParseException;
use fivefilters\Readability\Readability;
use Illuminate\Support\Facades\Http;
use League\HTMLToMarkdown\HtmlConverter;
use Symfony\Component\DomCrawler\Crawler;
/**
 * Downloads the body of a UrlToCrawl through the proxy, converts it to plain
 * text and queues the parsing job.
 */
class GetUrlBodyTask
{
    /**
     * Fetch and store the page content for one UrlToCrawl row.
     *
     * On success the converted text is stored in `output` and ParseUrlBodyJob is
     * dispatched. On any fetch/convert failure the row is marked `blocked` with
     * the output 'EMPTY CONTENT'.
     *
     * @param  int  $url_to_crawl_id  Primary key of the UrlToCrawl row.
     * @return null
     */
    public static function handle(int $url_to_crawl_id)
    {
        $url_to_crawl = UrlToCrawl::find($url_to_crawl_id);

        if (is_null($url_to_crawl)) {
            return null;
        }

        // NOTE(review): is_crawling is set here and never cleared in this task — confirm
        // whether another component resets it.
        $url_to_crawl->is_crawling = true;
        $url_to_crawl->save();
        $url_to_crawl->refresh();

        $raw_html = null;
        $markdown = null;

        try {
            $user_agent = config('platform.proxy.user_agent');

            // NOTE(review): 'verify' => false disables TLS certificate verification.
            $response = Http::withHeaders([
                'User-Agent' => $user_agent,
            ])
                ->withOptions([
                    'proxy' => get_smartproxy_rotating_server(),
                    'timeout' => 10,
                    'verify' => false,
                ])
                ->get($url_to_crawl->url);

            if ($response->successful()) {
                $raw_html = $response->body();

                // Cost is derived from the response size in KB.
                $cost = calculate_smartproxy_cost(round(strlen($raw_html) / 1024, 2), 'rotating_global');

                // Convert once and reuse the result for both the cost log and the crawl row.
                $markdown = self::getMarkdownFromHtml($raw_html);

                $service_cost_usage = new ServiceCostUsage;
                $service_cost_usage->cost = $cost;
                $service_cost_usage->name = 'smartproxy-GetUrlBodyTask';
                $service_cost_usage->reference_1 = 'url_to_crawl';
                $service_cost_usage->reference_2 = strval($url_to_crawl_id);
                $service_cost_usage->output = $markdown;
                $service_cost_usage->save();
            } else {
                $response->throw();
            }
        } catch (Exception $e) {
            // Best effort: any failure is recorded below as a blocked URL instead of failing the job.
            $raw_html = null;
            $markdown = null;
        }

        if (! is_empty($raw_html)) {
            $url_to_crawl->output_type = 'markdown';
            $url_to_crawl->output = $markdown;
        } else {
            $url_to_crawl->output = 'EMPTY CONTENT';
            $url_to_crawl->status = 'blocked';
        }

        $url_to_crawl->is_crawled = true;

        if ($url_to_crawl->save()) {
            if (! in_array($url_to_crawl->status, ['blocked', 'trashed'], true)) {
                ParseUrlBodyJob::dispatch($url_to_crawl->id)->onQueue('default')->onConnection('default');
            }
        }
    }

    /**
     * Return the main image Readability detects in the HTML, or null when parsing fails.
     *
     * @param  string  $html  Page HTML.
     * @return string|null
     */
    private static function getMainImageFromHtml($html)
    {
        $r_configuration = new ReadabilityConfiguration();
        $r_configuration->setCharThreshold(20);

        $readability = new Readability($r_configuration);

        try {
            $readability->parse($html);

            return $readability->getImage();
        } catch (ReadabilityParseException $e) {
            // Unparseable page: report "no image".
        }

        return null;
    }

    /**
     * Convert page HTML to de-duplicated plain text via an intermediate Markdown form.
     *
     * @param  string  $html  Page HTML.
     * @return string
     */
    private static function getMarkdownFromHtml($html)
    {
        $converter = new HtmlConverter([
            'strip_tags' => true,
            'strip_placeholder_links' => true,
        ]);

        $html = self::cleanHtml($html);

        $markdown = $converter->convert($html);

        $markdown = self::reverseLTGT($markdown);

        $markdown = self::normalizeNewLines($markdown);

        $markdown = self::removeDuplicateLines($markdown);

        return html_entity_decode(markdown_to_plaintext($markdown));
    }

    /**
     * Turn the `&lt;` / `&gt;` entities back into literal angle brackets.
     *
     * @param  string  $input
     * @return string
     */
    private static function reverseLTGT($input)
    {
        $output = str_replace('&lt;', '<', $input);
        $output = str_replace('&gt;', '>', $output);

        return $output;
    }

    /**
     * Keep only the first occurrence of every line.
     *
     * @param  string  $string  Newline-separated text.
     * @return string
     */
    private static function removeDuplicateLines($string)
    {
        $lines = explode("\n", $string);
        $uniqueLines = array_unique($lines);

        return implode("\n", $uniqueLines);
    }

    /**
     * Trim every line, drop blank lines, pull the line following an image onto the
     * image's line, and replace runs of spaces with " - ".
     *
     * @param  string  $content  Markdown text.
     * @return string
     */
    private static function normalizeNewLines($content)
    {
        // Split the content by lines
        $lines = explode("\n", $content);
        $lineCount = count($lines);
        $processedLines = [];

        for ($i = 0; $i < $lineCount; $i++) {
            $line = trim($lines[$i]);

            // If the line is an image markdown
            if (preg_match("/^!\[.*\]\(.*\)$/", $line)) {
                // And if the next line is not empty and not another markdown structure
                $nextLine = isset($lines[$i + 1]) ? trim($lines[$i + 1]) : '';

                if ($nextLine !== '' && ! preg_match('/^[-=#*&_]+$/', $nextLine)) {
                    $line .= ' '.$nextLine;
                    $i++; // Skip the next line as we're merging it
                }
            }

            // Compare against '' rather than using empty(), which would also drop a line that is just "0".
            if ($line !== '') {
                $processedLines[] = $line;
            }
        }

        // Collapse excessive newlines
        $result = preg_replace("/\n{3,}/", "\n\n", implode("\n", $processedLines));

        // Join an image line with the text line that follows it
        $result = preg_replace('/^(!\[.*?\]\(.*?\))\s*\n\s*([^\n!]+)/m', '$1 $2', $result);

        // Replace multiple spaces with a dash separator
        $result = preg_replace('/ {2,}/', ' - ', $result);

        return $result;
    }

    /**
     * Remove non-content elements and unwrap <span> tags before conversion.
     *
     * @param  string  $htmlContent  Page HTML.
     * @return string Cleaned HTML.
     */
    private static function cleanHtml($htmlContent)
    {
        $crawler = new Crawler($htmlContent);

        // Define tags to remove completely
        $tagsToRemove = ['script', 'style', 'svg', 'picture', 'form', 'footer', 'nav', 'aside'];

        foreach ($tagsToRemove as $tag) {
            $crawler->filter($tag)->each(function ($node) {
                foreach ($node as $child) {
                    $child->parentNode->removeChild($child);
                }
            });
        }

        // Replace <span> tags with their inner content
        $crawler->filter('span')->each(function ($node) {
            $replacement = new \DOMText($node->text());

            foreach ($node as $child) {
                $child->parentNode->replaceChild($replacement, $child);
            }
        });

        return $crawler->outerHtml();
    }
}

View File

@@ -0,0 +1,197 @@
<?php
namespace App\Jobs\Tasks;
use App\Helpers\FirstParty\OpenAI\OpenAI;
use App\Jobs\GetAIToolScreenshotJob;
use App\Jobs\ParseUrlBodyJob;
use App\Jobs\StoreSearchEmbeddingJob;
use App\Models\AiTool;
use App\Models\AiToolKeyword;
use App\Models\Category;
use App\Models\ServiceCostUsage;
use App\Models\UrlToCrawl;
use Exception;
/**
 * Turns the crawled text of a UrlToCrawl into an AiTool record (plus keywords
 * and search-embedding jobs) using the OpenAI site summary.
 */
class ParseUrlBodyTask
{
    /**
     * Parse one crawled URL into an AiTool.
     *
     * Skips missing, blocked and trashed rows. Uses the row's existing
     * `metadata` when present, otherwise asks OpenAI for a summary and logs
     * its cost. Creates or updates the AiTool, then dispatches the embedding
     * and screenshot jobs.
     *
     * @param  int  $url_to_crawl_id  Primary key of the UrlToCrawl row.
     * @return void
     *
     * @throws Exception When the summary has no output or no tool name.
     */
    public static function handle(int $url_to_crawl_id)
    {
        $url_to_crawl = UrlToCrawl::find($url_to_crawl_id);

        if (is_null($url_to_crawl)) {
            return;
        }

        if (in_array($url_to_crawl->status, ['blocked', 'trashed'], true)) {
            return;
        }

        if (is_empty($url_to_crawl->output)) {
            // NOTE(review): this re-queues the parse job itself and then carries on with the
            // empty output; re-queuing the fetch job and returning may be what was intended — confirm.
            ParseUrlBodyJob::dispatch($url_to_crawl->id)->onQueue('default')->onConnection('default');
        }

        $url_meta_response = null;

        if (! is_null($url_to_crawl->metadata)) {
            $url_meta_response = $url_to_crawl->metadata;
        } else {
            // The category list is only needed for the OpenAI prompt, so load it only here.
            $parent_categories = Category::whereNull('parent_id')->orderBy('name', 'ASC')->get();

            $url_meta_response = OpenAI::getSiteSummary($parent_categories, $url_to_crawl->output, 1536, 30, true);

            if ((isset($url_meta_response->output)) && (! is_null($url_meta_response->output))) {
                $service_cost_usage = new ServiceCostUsage;
                $service_cost_usage->cost = $url_meta_response->cost;
                $service_cost_usage->name = 'openai-getSiteSummary';
                $service_cost_usage->reference_1 = 'url_to_crawl';
                $service_cost_usage->reference_2 = strval($url_to_crawl->id);
                $service_cost_usage->output = $url_meta_response;
                $service_cost_usage->save();
            }
        }

        // `??` also covers a null response object.
        $output = $url_meta_response->output ?? null;

        if (is_null($output)) {
            throw new Exception('OpenAI::getSiteSummary failed. Empty object');
        }

        // NOTE(review): the metadata is assigned but $url_to_crawl is not saved in this
        // method, so it is not persisted here — confirm whether that is intended.
        $url_to_crawl->metadata = $url_meta_response;

        // Check AI Tool
        $ai_tool = AiTool::where('url_to_crawl_id', $url_to_crawl->id)->first();

        if (is_null($ai_tool)) {
            $ai_tool = new AiTool;
            $ai_tool->url_to_crawl_id = $url_to_crawl->id;
        }

        // Tool Name
        if ((isset($output->tool_name)) && (! is_empty($output->tool_name))) {
            $ai_tool->tool_name = $output->tool_name;
        } else {
            throw new Exception('OpenAI::getSiteSummary failed, no tool name');
        }

        // Is AI Tool: take the model's answer when it is a real boolean, otherwise assume true.
        if (isset($output->is_ai_tool) && is_bool($output->is_ai_tool)) {
            $ai_tool->is_ai_tool = $output->is_ai_tool;
        } else {
            $ai_tool->is_ai_tool = true;
        }

        // Is App/Web/Both: a single string out of the allowed values.
        if (isset($output->is_app_web_both) && is_string($output->is_app_web_both) && in_array($output->is_app_web_both, ['app', 'web', 'both'], true)) {
            $ai_tool->is_app_web_both = $output->is_app_web_both;
        }

        // Tagline
        if ((isset($output->tagline)) && (! is_empty($output->tagline))) {
            $ai_tool->tagline = $output->tagline;
        }

        // Summary
        if ((isset($output->summary)) && (! is_empty($output->summary))) {
            $ai_tool->summary = $output->summary;
        }

        // Pricing Type: a single string out of the allowed values, defaulting to 'Free'.
        if (isset($output->pricing_type) && is_string($output->pricing_type) && in_array($output->pricing_type, ['Free', 'Free Trial', 'Freemium', 'Subscription', 'Usage Based'], true)) {
            $ai_tool->pricing_type = $output->pricing_type;
        } else {
            $ai_tool->pricing_type = 'Free';
        }

        // Category ID: the category named by the model, else the 'Productivity' category.
        $main_category = null;

        if ((isset($output->main_category)) && (! is_empty($output->main_category))) {
            $main_category = Category::where('name', $output->main_category)->first();
        }

        if (is_null($main_category)) {
            $main_category = Category::where('name', 'Productivity')->first();
        }

        $ai_tool->category_id = $main_category->id;

        // Keyword
        if ((isset($output->keywords)) && (is_array($output->keywords))) {
            $ai_tool->keyword_string = implode(',', $output->keywords);
        }

        // Q&A
        if ((isset($output->qna)) && (is_array($output->qna))) {
            $ai_tool->qna = $output->qna;
        }

        if ($ai_tool->save()) {
            // Embed "<name>: <tagline>" (or just the name) for tool search.
            $query = $ai_tool->tool_name;

            if (! is_empty($ai_tool->tagline)) {
                $query .= ": " . $ai_tool->tagline;
            }

            StoreSearchEmbeddingJob::dispatch(
                'ai_tool',
                $ai_tool->category_id,
                $ai_tool->id,
                $query
            );

            if (is_empty($ai_tool->screenshot_img)) {
                GetAIToolScreenshotJob::dispatch($url_to_crawl->id, $ai_tool->id)->onQueue('default')->onConnection('default');
            }

            // Keyword: store each new keyword once per tool and queue its embedding.
            if ((isset($output->keywords)) && (is_array($output->keywords))) {
                foreach ($output->keywords as $keyword) {
                    $keyword_lowercased = strtolower(trim($keyword));

                    $ai_tool_keyword = AiToolKeyword::where('value_lowercased', $keyword_lowercased)
                        ->where('ai_tool_id', $ai_tool->id)
                        ->first();

                    if (is_null($ai_tool_keyword)) {
                        $ai_tool_keyword = new AiToolKeyword;
                        $ai_tool_keyword->category_id = $ai_tool->category_id;
                        $ai_tool_keyword->ai_tool_id = $ai_tool->id;
                        $ai_tool_keyword->value = trim($keyword);
                        $ai_tool_keyword->value_lowercased = $keyword_lowercased;

                        if ($ai_tool_keyword->save()) {
                            StoreSearchEmbeddingJob::dispatch(
                                'ai_tool_keyword',
                                $ai_tool->category_id,
                                $ai_tool->id,
                                $ai_tool_keyword->value
                            );
                        }
                    }
                }
            }

            // Q&A: embed question and answer together.
            if ((isset($output->qna)) && (is_array($output->qna))) {
                foreach ($output->qna as $qna) {
                    StoreSearchEmbeddingJob::dispatch(
                        'qna',
                        $ai_tool->category_id,
                        $ai_tool->id,
                        ($qna->q . " " . $qna->a)
                    );
                }
            }
        }
    }
}

View File

@@ -1,355 +0,0 @@
<?php
namespace App\Jobs\Tasks;
use App\Models\ShopeeSellerScrapedImage;
use Illuminate\Support\Facades\Http;
use Illuminate\Support\Facades\Storage;
use Illuminate\Support\Str;
use Intervention\Image\Facades\Image;
class SaveShopeeSellerImagesTask
{
/**
 * Store the product's main image and the qualifying images found in the
 * product page, skipping any already recorded for this scrape.
 *
 * @param  object  $shopee_task  Scrape task carrying `product_task->response` and `shopee_seller_scrape`.
 * @return void
 */
public static function handle($shopee_task)
{
    $unblocker_proxy_server = get_smartproxy_unblocker_server();
    $rotating_proxy_server = get_smartproxy_rotating_server();
    $user_agent = config('platform.proxy.user_agent');
    $costs = [];

    // Part 1: the main image, saved as the featured image.
    $main_image_url = self::getProductImageUrl($shopee_task->product_task->response->jsonld);

    if (! is_empty($main_image_url)) {
        $existing_main = ShopeeSellerScrapedImage::where('original_name', pathinfo($main_image_url, PATHINFO_BASENAME))
            ->where('shopee_seller_scrape_id', $shopee_task->shopee_seller_scrape->id)
            ->first();

        if ($existing_main === null) {
            $main_image = self::getProductImage($shopee_task->product_task->response->jsonld, $rotating_proxy_server, $user_agent, $costs);

            self::uploadAndSaveScrapedImage($shopee_task->shopee_seller_scrape, $main_image, true);
        }
    }

    // Part 2: the remaining page images, saved as non-featured.
    $images = self::getFilteredImages($shopee_task->product_task->response->raw_html, $rotating_proxy_server, $user_agent, $costs);

    if (! is_array($images) || count($images) === 0) {
        return;
    }

    foreach ($images as $image_obj) {
        $existing = ShopeeSellerScrapedImage::where('original_name', $image_obj->original_name)
            ->where('shopee_seller_scrape_id', $shopee_task->shopee_seller_scrape->id)
            ->first();

        if ($existing === null) {
            self::uploadAndSaveScrapedImage($shopee_task->shopee_seller_scrape, $image_obj, false);
        }
    }
}
/**
 * Upload an image and a tiny LQIP preview to the 'r2' disk and record the image.
 *
 * @param  object  $shopee_seller_scrape  Scrape the image belongs to (its `id` is stored).
 * @param  object  $image_obj  Carries `intervention` (Intervention image) and `original_name`.
 * @param  bool  $featured  Whether to flag the stored image as featured.
 * @return ShopeeSellerScrapedImage|null The saved record, or null when saving failed.
 *
 * @throws \RuntimeException When no temporary file can be created.
 */
private static function uploadAndSaveScrapedImage($shopee_seller_scrape, $image_obj, $featured = false)
{
    // One shared base name so the image and its LQIP always pair up,
    // even if the clock ticks between the two.
    $baseName = time().'_'.Str::uuid()->toString();
    $filePath = 'uploads/'.$baseName.'.jpg';
    $lqipFilePath = 'uploads/'.$baseName.'_lqip.jpg';

    // Re-encode as JPEG at quality 50 and upload the full-size version.
    $image = $image_obj->intervention;
    $image->encode('jpg', 50);

    Storage::disk('r2')->put($filePath, $image->stream()->detach());

    // Round-trip through a temp file to get an independent copy for the LQIP.
    $tempImagePath = tempnam(sys_get_temp_dir(), 'temp_image');

    if ($tempImagePath === false) {
        throw new \RuntimeException('Unable to create a temporary file for the LQIP image');
    }

    try {
        file_put_contents($tempImagePath, $image_obj->intervention->encode());

        $lqipImage = Image::make($tempImagePath)->fit(10, 10, function ($constraint) {
            $constraint->aspectRatio();
        });
        $lqipImage->encode('jpg', 5);

        Storage::disk('r2')->put($lqipFilePath, $lqipImage->stream()->detach());
    } finally {
        // Always remove the temp file, even when the LQIP step throws.
        unlink($tempImagePath);
    }

    // Get the final URL of the uploaded image (non-LQIP version)
    $url = Storage::disk('r2')->url($filePath);

    $scraped_image = new ShopeeSellerScrapedImage;
    $scraped_image->shopee_seller_scrape_id = $shopee_seller_scrape->id;
    $scraped_image->original_name = $image_obj->original_name;
    $scraped_image->image = $url;
    $scraped_image->featured = $featured;

    if ($scraped_image->save()) {
        return $scraped_image;
    }

    return null;
}
/**
 * Collect the `src` (and `alt`, when present) of every <img> tag in the HTML.
 *
 * Each tag is parsed on its own, so an <img> without a `src` can no longer
 * borrow one from a later tag, and `alt` is found wherever it sits in the tag.
 * A trailing `_tn` before the file extension is removed from `src`.
 *
 * @param  string  $raw_html  Page HTML.
 * @return array<int, array{src: string, alt: string|null}>
 */
private static function getImageUrls(string $raw_html)
{
    $images = [];

    // One match per complete <img ...> tag; quoted attribute values may contain '>'.
    $tagPattern = '/<img\s(?:[^>"\']++|"[^"]*+"|\'[^\']*+\')*+>/i';

    if (! preg_match_all($tagPattern, $raw_html, $tags)) {
        return $images;
    }

    foreach ($tags[0] as $tag) {
        // First src="..." in the tag (this also accepts data-src, as before).
        if (! preg_match('/src=(["\'])(.*?)\1/is', $tag, $srcMatch)) {
            continue;
        }

        // Check if image file name ends with '_tn' and remove it
        $src = preg_replace('/_tn(\.[a-z]+)?$/i', '$1', $srcMatch[2]);

        $hasAlt = preg_match('/(?<![\w-])alt=(["\'])(.*?)\1/is', $tag, $altMatch);

        $images[] = [
            'src' => $src,
            'alt' => $hasAlt ? $altMatch[2] : null,
        ];
    }

    return $images;
}
/**
 * Download the images referenced in the page and keep the ones worth storing.
 *
 * Every <img> src is fetched through the proxy. An image is kept only if it
 * is a JPEG of at least 800x800 and 100 KB and is not taller than wide. Kept
 * images are resized to 800px wide and near-duplicates are dropped. Images
 * whose colour count is at most 10% of the median are then discarded, and
 * the rest are returned largest download first.
 *
 * @param string $raw_html Page HTML to scan for <img> tags.
 * @param string $proxy Proxy passed to the HTTP client.
 * @param string $user_agent User-Agent header for the image requests.
 * @param array $costs By-reference map; receives one 'count-N' cost entry per kept image.
 * @return array List of objects with src, alt, width, height, mime, sizeKb, color_counts, intervention and original_name.
 */
private static function getFilteredImages(string $raw_html, string $proxy, string $user_agent, &$costs)
{
$images = self::getImageUrls($raw_html);
//dd($images);
$filteredImages = [];
$uniqueAttributes = []; // This array will track unique width, height, mime, and size combinations
// 1-based position of the image in the page; used as the key suffix in $costs.
$count = 0;
foreach ($images as $image) {
$count++;
$src = $image['src'];
try {
$response = Http::withOptions(['proxy' => $proxy, 'verify' => false])->withHeaders(['User-Agent' => $user_agent])->get($src);
// Check if the request was successful
if (! $response->successful()) {
continue;
}
$imageData = $response->body();
// Create an Intervention Image instance from the response data
$interventionImage = Image::make($imageData);
$width = $interventionImage->width();
$height = $interventionImage->height();
$mime = $interventionImage->mime();
// Size of the downloaded data in KB (measured before any resizing)
$sizeKb = round(strlen($imageData) / 1024, 2);
// Check constraints: at least 800x800, at least 100 KB, JPEG only
if ($width < 800 || $height < 800 || $sizeKb < 100 || $mime !== 'image/jpeg') {
continue;
}
// Skip portrait images
if ($height > $width) {
continue;
}
// Scale to 800px wide, keeping the aspect ratio
$interventionImage->resize(800, null, function ($constraint) {
$constraint->aspectRatio();
});
// Re-read the dimensions after the resize
$width = $interventionImage->width();
$height = $interventionImage->height();
$mime = $interventionImage->mime();
$image['width'] = $width;
$image['height'] = $height;
$image['mime'] = $mime;
$image['sizeKb'] = $sizeKb;
// Check for duplicates by searching through uniqueAttributes
$isDuplicate = false;
foreach ($uniqueAttributes as $attr) {
if (
$attr['width'] == $width &&
$attr['height'] == $height &&
$attr['mime'] == $mime &&
abs($attr['sizeKb'] - $sizeKb) <= 30 // Check for size within a +/- 30 KB tolerance
) {
$isDuplicate = true;
break;
}
}
if (! $isDuplicate) {
$uniqueAttributes[] = [
'width' => $width,
'height' => $height,
'mime' => $mime,
'sizeKb' => $sizeKb,
];
// getImageColorCounts() is defined outside this view; presumably a per-image colour count — confirm.
$image['color_counts'] = self::getImageColorCounts($interventionImage);
$image['intervention'] = $interventionImage;
$image['original_name'] = pathinfo($src, PATHINFO_BASENAME);
//$image['img'] = $interventionImage;
// Record the proxy cost for this download, keyed by the image's position in the page
$costs['count-'.$count] = calculate_smartproxy_cost($sizeKb, 'rotating_global');
$filteredImages[] = $image;
}
} catch (\Exception $e) {
// Any failure while fetching or decoding this image just skips it
continue;
}
}
// Collect all the color counts
$colorCounts = [];
foreach ($filteredImages as $image) {
$colorCounts[] = $image['color_counts'];
}
if (! empty($colorCounts)) {
// Compute the median of the color counts
sort($colorCounts);
// $count is reused here as the number of kept images
$count = count($colorCounts);
$middleIndex = floor($count / 2);
$median = $count % 2 === 0 ? ($colorCounts[$middleIndex - 1] + $colorCounts[$middleIndex]) / 2 : $colorCounts[$middleIndex];
// Use the median to filter out the low outliers
$threshold = 0.10 * $median; // Adjust this percentage as needed
$filteredImages = array_filter($filteredImages, function ($image) use ($threshold) {
return $image['color_counts'] > $threshold;
});
} else {
// No images found
$filteredImages = []; // Clear the array or take any other appropriate action
}
// Largest download first; usort also re-indexes the array after array_filter
usort($filteredImages, function ($a, $b) {
return $b['sizeKb'] <=> $a['sizeKb']; // Using the spaceship operator to sort in descending order
});
// Callers read the entries as objects (->original_name, ->intervention)
$final_images = [];
foreach ($filteredImages as $image_obj) {
$final_images[] = (object) $image_obj;
}
return $final_images;
}
/**
 * Return the `url` of the first JSON-LD entry whose `@type` is "Product".
 *
 * Entries are scanned in order. An entry is skipped when it has no `@type`,
 * when its `@type` is not exactly "Product", or when it has no `url`.
 * Note that despite the method name, the value returned is the entry's
 * `url` property, not its `image` property.
 *
 * @param array $jsonLdData Decoded JSON-LD entries.
 * @return mixed The matching entry's `url`, or null when no entry qualifies.
 */
private static function getProductImageUrl(array $jsonLdData)
{
    foreach ($jsonLdData as $entry) {
        $isProduct = isset($entry->{'@type'}) && $entry->{'@type'} === 'Product';

        if (! $isProduct || ! isset($entry->url)) {
            continue;
        }

        return $entry->url;
    }

    return null;
}
/**
 * Download the image of the first usable "Product" JSON-LD entry and fit it to 1920x1080.
 *
 * Entries are scanned in order. Only entries whose `@type` is "Product" and that
 * carry both `url` and `image` are attempted. If the download answers with a
 * non-successful status the next entry is tried; if anything throws an
 * \Exception the whole lookup stops and null is returned.
 *
 * @param array  $jsonLdData Decoded JSON-LD entries.
 * @param string $proxy      Proxy passed to the HTTP client's `proxy` option.
 * @param string $user_agent Value sent as the User-Agent header.
 * @param array  $costs      Cost ledger, by reference; receives key 'product_image' on success.
 * @return object|null Object with `url`, `intervention`, `original_name` and `cost`, or null.
 */
private static function getProductImage(array $jsonLdData, string $proxy, string $user_agent, &$costs)
{
    foreach ($jsonLdData as $entry) {
        // Only "Product" entries are of interest.
        if (! isset($entry->{'@type'}) || $entry->{'@type'} !== 'Product') {
            continue;
        }

        // Both the product page URL and the image URL must be present.
        if (! isset($entry->url) || ! isset($entry->image)) {
            continue;
        }

        try {
            // TLS verification is switched off for this request ('verify' => false).
            $request = Http::withOptions(['proxy' => $proxy, 'verify' => false])
                ->withHeaders(['User-Agent' => $user_agent]);
            $response = $request->get($entry->image);

            if (! $response->successful()) {
                // Non-successful status: move on to the next entry.
                continue;
            }

            $picture = Image::make($response->body());

            // Fit into a 1920x1080 frame.
            // NOTE(review): Intervention Image v2 documents upsize() as *preventing*
            // enlargement, so smaller images are presumably not upscaled - confirm.
            $picture->fit(1920, 1080, function ($constraint) {
                $constraint->upsize();
                $constraint->aspectRatio();
            });

            // NOTE(review): the cost is derived from the re-encoded, fitted image,
            // not from the number of bytes downloaded - confirm this is intended.
            $encodedKb = strlen($picture->encode()) / 1024;
            $proxyCost = calculate_smartproxy_cost($encodedKb, 'rotating_global');
            $costs['product_image'] = $proxyCost;

            return (object) [
                'url' => $entry->url,
                'intervention' => $picture,
                'original_name' => pathinfo($entry->image, PATHINFO_BASENAME),
                'cost' => $proxyCost,
            ];
        } catch (\Exception $e) {
            // Any failure while downloading or processing aborts the search.
            return null;
        }
    }

    return null;
}
/**
 * Count the distinct pixel colour values of a shrunken, blurred copy of an image.
 *
 * The input image itself is not modified: a clone is resized to 200px wide
 * (aspect ratio kept), blurred, encoded, and re-read through GD so every pixel
 * can be sampled with imagecolorat(). Callers use the resulting count as a
 * rough "visual richness" score; any threshold is applied by the caller.
 *
 * @param mixed $interventionImage Image object supporting clone, resize(), blur() and encode().
 * @return int Number of distinct values returned by imagecolorat() over all pixels.
 *
 * @throws \RuntimeException When GD cannot decode the encoded image. Previously this
 *                           surfaced as a TypeError from imagesx(), which handlers
 *                           catching \Exception would not intercept.
 */
private static function getImageColorCounts($interventionImage)
{
    // Work on a copy so the caller's image keeps its size and sharpness.
    $sample = clone $interventionImage;

    // Smaller image = far fewer pixels to visit.
    $sample->resize(200, null, function ($constraint) {
        $constraint->aspectRatio();
    });

    // Blurring merges near-identical neighbouring colours.
    $sample->blur(10);

    $gd = imagecreatefromstring((string) $sample->encode());
    if ($gd === false) {
        throw new \RuntimeException('Unable to decode the encoded image for colour counting.');
    }

    $width = imagesx($gd);
    $height = imagesy($gd);

    // Colour values are used as array keys so duplicates collapse automatically.
    $seen = [];
    for ($x = 0; $x < $width; $x++) {
        for ($y = 0; $y < $height; $y++) {
            $seen[imagecolorat($gd, $x, $y)] = true;
        }
    }

    imagedestroy($gd);

    return count($seen);
}
}

View File

@@ -1,133 +0,0 @@
<?php
namespace App\Jobs\Tasks;
use App\Helpers\FirstParty\OSSUploader\OSSUploader;
use App\Models\Category;
use App\Models\ShopeeSellerScrape;
use Exception;
class ShopeeSellerTopProductScraperTask
{
/**
 * Return scraped data for a Shopee seller: the shop page plus one top-ranked product page.
 *
 * Flow:
 *  1. If a ShopeeSellerScrape row exists for (seller, country_iso) and its stored
 *     file can be read and unserialized, that stored object is returned with the
 *     row attached as `shopee_seller_scrape`.
 *  2. Otherwise the seller's shop page is crawled, its JSON-LD products are ranked
 *     via getSortedData() (price floor 400), and products are crawled in rank
 *     order until one returns a non-negative status code.
 *  3. The two responses are serialized, uploaded, and recorded in a new
 *     ShopeeSellerScrape row.
 *
 * @param string   $seller      Seller handle used in the shop URL.
 * @param string   $country_iso Country code; lower-cased before use.
 * @param Category $category    Category whose id is stored on the new scrape row.
 * @return object|null Stored object on a cache hit; object with `seller_shop_task`,
 *                     `product_task` and `shopee_seller_scrape` after a fresh scrape;
 *                     null when no product could be crawled or the row was not saved.
 *
 * @throws Exception When $seller is empty. Exceptions thrown by UrlCrawlerTask::handle
 *                   are not caught here.
 */
public static function handle(string $seller, string $country_iso, Category $category)
{
    $country_iso = strtolower($country_iso);

    if (is_empty($seller)) {
        throw new Exception('Missing \'seller\' attribute.');
    }

    $shopee_seller_scrape = ShopeeSellerScrape::where('seller', $seller)
        ->where('country_iso', $country_iso)->first();

    if (! is_null($shopee_seller_scrape)) {
        $serialised = OSSUploader::readFile('r2', 'shopee/seller', $shopee_seller_scrape->filename);

        if (! is_empty($serialised)) {
            // NOTE(review): unserialize() on stored data is only safe while the bucket
            // is fully trusted; consider a JSON format if that ever changes.
            $obj = unserialize($serialised);

            // A corrupt payload unserializes to false; fall through and re-scrape
            // instead of dereferencing a non-object.
            if (is_object($obj)) {
                $obj->shopee_seller_scrape = $shopee_seller_scrape;

                return $obj;
            }
        }
    }

    $epoch = epoch_now_timestamp();

    // NOTE(review): the host is fixed to shopee.com.my regardless of $country_iso.
    $seller_shop_url = "https://shopee.com.my/{$seller}?page=0&sortBy=sales";
    $seller_shop_task = UrlCrawlerTask::handle($seller_shop_url, 'shopee/seller', $epoch, true, false);

    if (! isset($seller_shop_task->response->jsonld)) {
        return null;
    }

    $top_rank_products = self::getSortedData($seller_shop_task->response->jsonld, 400);

    // Walk the ranking until one product page is crawled with a non-negative status.
    $product_task = null;
    foreach ($top_rank_products as $product) {
        // UrlCrawlerTask::handle() requires a string URL; skip entries without one.
        if (! is_string($product->url) || $product->url === '') {
            continue;
        }

        $candidate = UrlCrawlerTask::handle($product->url, 'shopee/seller', $epoch, true, true);

        if ($candidate->response->status_code >= 0) {
            $product_task = $candidate;
            break;
        }
    }

    // Nothing usable was crawled: do not persist a failed product response.
    if (is_null($product_task)) {
        return null;
    }

    $scraped = (object) [
        'seller_shop_task' => (object) [
            'response' => $seller_shop_task->response,
        ],
        'product_task' => (object) [
            'response' => $product_task->response,
        ],
    ];

    $serialised = serialize($scraped);
    $filename = $seller.'-'.$epoch.'-'.$country_iso.'.txt';

    OSSUploader::uploadFile('r2', 'shopee/seller', $filename, $serialised);

    $shopee_seller_scrape = new ShopeeSellerScrape;
    $shopee_seller_scrape->seller = $seller;
    $shopee_seller_scrape->country_iso = $country_iso;
    $shopee_seller_scrape->epoch = $epoch;
    $shopee_seller_scrape->filename = $filename;
    $shopee_seller_scrape->category_id = $category->id;

    if ($shopee_seller_scrape->save()) {
        return (object) compact('seller_shop_task', 'product_task', 'shopee_seller_scrape');
    }

    return null;
}
/**
 * Pick the "Product" JSON-LD entries priced above a floor and rank them by rating.
 *
 * An entry qualifies when its `@type` is exactly "Product" AND either its
 * `offers->lowPrice` or its `offers->price` is greater than $minValue.
 * (The previous expression `$isProduct && A || B` let any entry through on
 * `price` alone because `&&` binds tighter than `||`.)
 *
 * Qualifying entries are ordered by `aggregateRating->ratingCount` descending,
 * ties broken by `aggregateRating->ratingValue` descending, and then mapped to
 * flat objects with normalised numeric fields.
 *
 * @param array     $data     Decoded JSON-LD entries; non-object entries are ignored.
 * @param int|float $minValue Exclusive lower bound for lowPrice / price.
 * @return array List of objects with name, description, url, image, lowPrice,
 *               highPrice, price, priceCurrency, ratingCount and ratingValue.
 */
private static function getSortedData($data, $minValue): array
{
    // Keep only products whose low price or price exceeds the floor.
    $filtered = array_filter($data, static function ($item) use ($minValue) {
        if (! is_object($item) || ($item->{'@type'} ?? null) !== 'Product') {
            return false;
        }

        $lowPrice = floatval($item->offers?->lowPrice ?? 0);
        $price = floatval($item->offers?->price ?? 0);

        return $lowPrice > $minValue || $price > $minValue;
    });

    // Most-rated first; equal rating counts fall back to the higher rating value.
    usort($filtered, static function ($a, $b) {
        $ratingCountA = intval($a->aggregateRating?->ratingCount ?? 0);
        $ratingCountB = intval($b->aggregateRating?->ratingCount ?? 0);

        if ($ratingCountA !== $ratingCountB) {
            return $ratingCountB <=> $ratingCountA;
        }

        $ratingValueA = floatval($a->aggregateRating?->ratingValue ?? 0);
        $ratingValueB = floatval($b->aggregateRating?->ratingValue ?? 0);

        return $ratingValueB <=> $ratingValueA;
    });

    // Flatten each entry into a predictable shape with numeric defaults.
    return array_map(static function ($item) {
        return (object) [
            'name' => $item->name ?? null,
            'description' => $item->description ?? null,
            'url' => $item->url ?? null,
            'image' => $item->image ?? null,
            'lowPrice' => floatval($item->offers?->lowPrice ?? 0),
            'highPrice' => floatval($item->offers?->highPrice ?? 0),
            'price' => floatval($item->offers?->price ?? 0),
            'priceCurrency' => $item->offers?->priceCurrency ?? null,
            'ratingCount' => intval($item->aggregateRating?->ratingCount ?? 0),
            'ratingValue' => floatval($item->aggregateRating?->ratingValue ?? 0),
        ];
    }, $filtered);
}
}

View File

@@ -1,236 +0,0 @@
<?php
namespace App\Jobs\Tasks;
use App\Helpers\FirstParty\OSSUploader\OSSUploader;
use Exception;
use Illuminate\Support\Facades\Http;
use Spatie\Browsershot\Browsershot;
use Spatie\Browsershot\Exceptions\UnsuccessfulResponse;
use Symfony\Component\DomCrawler\Crawler;
class UrlCrawlerTask
{
/**
 * Obtain the HTML for a URL - from object storage when present, otherwise over
 * HTTP through the unblocker proxy - and extract its JSON-LD blocks.
 *
 * The storage file name is "<slug of url>-<postfix>.html" inside $directory on
 * the 'r2' driver. A freshly fetched, non-empty body is uploaded under that name.
 *
 * `status_code` in the result is set as follows:
 *   0  initial value (left untouched when the stored copy is used),
 *  -1  storage returned null for the file,
 *  -3  the HTTP response was not successful but throw() did not raise,
 *   1  the page was fetched and uploaded.
 *
 * $strip_html and $parse_images are accepted but not used by this method.
 *
 * @param string $url          Page to fetch.
 * @param mixed  $directory    Storage directory.
 * @param mixed  $postfix      Suffix for the stored file name; cast to string.
 * @param bool   $strip_html   Unused.
 * @param bool   $parse_images Unused.
 * @return object Object with a single `response` property holding url, postfix,
 *                filename, raw_html, jsonld, status_code, costs and total_cost.
 *
 * @throws \Throwable Whatever the HTTP client raises (including from throw() on a
 *                    failed response) propagates to the caller unchanged.
 */
public static function handle(string $url, $directory, $postfix = null, $strip_html = false, $parse_images = false)
{
    $driver = 'r2';
    $slug = str_slug($url);
    $cached_url = $url; // self::getGoogleCachedUrl($url, false);
    $postfix = strval($postfix);
    $filename = $slug.'-'.$postfix.'.html';
    $user_agent = config('platform.proxy.user_agent');

    // NOTE(review): directory and file name are joined without a separator - confirm
    // this is the key format consumers of `filename` expect.
    $disk_url = $directory.$filename;

    $unblocker_proxy_server = get_smartproxy_unblocker_server();
    // Not used below; the call is retained as-is.
    $rotating_proxy_server = get_smartproxy_rotating_server();

    $status_code = 0;
    $costs = [];

    // 1) Try the stored copy first. A read error is treated like a miss.
    try {
        $raw_html = OSSUploader::readFile($driver, $directory, $filename);

        if ($raw_html === null) {
            $status_code = -1;
        }
    } catch (Exception $e) {
        $raw_html = null;
    }

    // 2) Nothing stored: fetch through the unblocker proxy. Exceptions raised here
    //    are deliberately left to bubble up to the caller.
    //    (An earlier Browsershot-based fetch was disabled in favour of this request.)
    if ($raw_html === null) {
        $response = Http::withHeaders([
            'User-Agent' => $user_agent,
        ])
            ->withOptions([
                'proxy' => $unblocker_proxy_server,
                'timeout' => 1000,
                'verify' => false,
            ])
            ->get($cached_url);

        if ($response->successful()) {
            $raw_html = $response->body();
            $costs['unblocker'] = calculate_smartproxy_cost(round(strlen($raw_html) / 1024, 2), 'unblocker');
        } else {
            $status_code = -3;
            $response->throw();
        }

        // Only non-empty bodies are persisted and marked as freshly fetched.
        if (! is_empty($raw_html)) {
            OSSUploader::uploadFile($driver, $directory, $filename, $raw_html);
            $status_code = 1;
        }
    }

    // 3) Build the response; fields depending on HTML are blank when there is none.
    $hasHtml = $raw_html !== null;

    return (object) [
        'response' => (object) [
            'url' => $url,
            'postfix' => $postfix,
            'filename' => $hasHtml ? $disk_url : null,
            'raw_html' => $raw_html,
            'jsonld' => $hasHtml ? self::getJsonLd($raw_html) : [],
            'status_code' => $status_code,
            'costs' => $costs,
            'total_cost' => $hasHtml ? array_sum(array_values($costs)) : 0,
        ],
    ];
}
/**
 * Extract and decode every `<script type="application/ld+json">` block in an HTML document.
 *
 * Blocks whose content is not valid JSON are skipped. Previously json_decode()
 * was called without JSON_THROW_ON_ERROR, so the surrounding try/catch never
 * fired and invalid blocks were appended to the result as null.
 *
 * @param string $raw_html HTML document to scan.
 * @return array Decoded JSON-LD values (objects are decoded as stdClass), in document
 *               order; empty when the selector query throws or nothing is found.
 */
private static function getJsonLd(string $raw_html): array
{
    $crawler = new Crawler($raw_html);

    try {
        $snippets = $crawler->filter('script[type="application/ld+json"]')->each(function (Crawler $node) {
            return $node->text();
        });
    } catch (Exception $e) {
        // Selector evaluation failed; treat the page as having no JSON-LD.
        return [];
    }

    $contents = [];
    foreach ($snippets as $snippet) {
        try {
            $contents[] = json_decode($snippet, false, 512, JSON_THROW_ON_ERROR);
        } catch (\JsonException $e) {
            // Malformed block: ignore it and keep the remaining ones.
            continue;
        }
    }

    return $contents;
}
/**
 * Minify an HTML document and strip presentational markup from it.
 *
 * After minifyHTML() runs, every descendant element of each root node loses its
 * `class`, `id` and `style` attributes, and all `<style>` elements are removed.
 *
 * The `<style>` removal now happens inside the loop. Previously it relied on the
 * loop variable surviving the foreach, so it failed on an empty document and
 * only processed the last root node.
 *
 * @param string $raw_html HTML to clean.
 * @return mixed Inner HTML of the first root node after cleaning; the minified
 *               input is returned unchanged when it contains no nodes.
 */
private static function minifyAndCleanHtml(string $raw_html)
{
    $raw_html = self::minifyHTML($raw_html);
    $crawler = new Crawler($raw_html);

    // Nothing parsed (e.g. empty input): html() would throw on an empty node list.
    if ($crawler->count() === 0) {
        return $raw_html;
    }

    foreach ($crawler as $domElement) {
        // Drop 'class', 'id' and 'style' attributes from every descendant element.
        /** @var \DOMNodeList $nodes */
        $nodes = $domElement->getElementsByTagName('*');
        foreach ($nodes as $node) {
            /** @var \DOMElement $node */
            $node->removeAttribute('class');
            $node->removeAttribute('id');
            $node->removeAttribute('style');
        }

        // Remove <style> elements; iterate backwards because the node list is live.
        $styleTags = $domElement->getElementsByTagName('style');
        for ($i = $styleTags->length - 1; $i >= 0; $i--) {
            $styleNode = $styleTags->item($i);
            $styleNode->parentNode->removeChild($styleNode);
        }
    }

    // Output the manipulated HTML
    return $crawler->html();
}
/**
 * Collapse whitespace between tags and remove HTML comments.
 *
 * The comment pattern uses the `s` (dot-all) modifier instead of `(.|\s)*?`.
 * Both match the same text, but the alternation-in-a-group form is prone to
 * hitting PCRE limits on long comments, in which case preg_replace() returns null.
 * If a replacement still fails, the text from the previous step is kept rather
 * than returning null.
 *
 * @param mixed $input HTML text.
 * @return mixed The minified HTML.
 */
private static function minifyHTML($input)
{
    // Remove whitespace runs that sit directly between two tags.
    $collapsed = preg_replace('/>\s+</', '><', $input) ?? $input;

    // Remove comments, including ones spanning several lines.
    return preg_replace('/<!--.*?-->/s', '', $collapsed) ?? $collapsed;
}
/**
 * Build the Google web-cache lookup URL for a page.
 *
 * The query string is removed from $url via stripUrlQueryParameters() and the
 * result is appended, unencoded, after "cache:".
 *
 * @param string $url       Page whose cached copy is wanted.
 * @param bool   $stripHtml When truthy, "&strip=1" is appended to the lookup URL.
 * @return string The cache lookup URL.
 */
private static function getGoogleCachedUrl(string $url, $stripHtml = false)
{
    $target = self::stripUrlQueryParameters($url);
    $suffix = $stripHtml ? '&strip=1' : '';

    return 'https://webcache.googleusercontent.com/search?q=cache:'.$target.$suffix;
}
/**
 * Rebuild a URL without its query string.
 *
 * Scheme, host, port, path and fragment are kept; the query is dropped.
 * User/password components are not carried over, as before.
 *
 * Changes from the earlier version: an explicit port is preserved (it used to be
 * silently lost), URLs without a scheme or host no longer read undefined array
 * keys, and a URL that parse_url() rejects is returned unchanged.
 *
 * @param string $url URL to process.
 * @return string The URL without its query component.
 */
private static function stripUrlQueryParameters(string $url): string
{
    // Parse the URL into its components
    $parts = parse_url($url);

    // Seriously malformed input: nothing reliable to rebuild from.
    if ($parts === false) {
        return $url;
    }

    // Rebuild the URL without the query component
    $newUrl = '';

    if (isset($parts['scheme'])) {
        $newUrl .= $parts['scheme'].'://';
    }

    if (isset($parts['host'])) {
        $newUrl .= $parts['host'];

        if (isset($parts['port'])) {
            $newUrl .= ':'.$parts['port'];
        }
    }

    if (isset($parts['path'])) {
        $newUrl .= $parts['path'];
    }

    if (isset($parts['fragment'])) {
        $newUrl .= '#'.$parts['fragment'];
    }

    return $newUrl;
}
}