Sync
This commit is contained in:
37
app/Jobs/GetAIToolScreenshotJob.php
Normal file
37
app/Jobs/GetAIToolScreenshotJob.php
Normal file
@@ -0,0 +1,37 @@
|
||||
<?php
|
||||
|
||||
namespace App\Jobs;
|
||||
|
||||
use App\Jobs\Tasks\GetAIToolScreenshotTask;
|
||||
use Illuminate\Bus\Queueable;
|
||||
use Illuminate\Contracts\Queue\ShouldQueue;
|
||||
use Illuminate\Foundation\Bus\Dispatchable;
|
||||
use Illuminate\Queue\InteractsWithQueue;
|
||||
use Illuminate\Queue\SerializesModels;
|
||||
|
||||
/**
 * Queued job that delegates to GetAIToolScreenshotTask for one
 * (url_to_crawl, ai_tool) pair.
 */
class GetAIToolScreenshotJob implements ShouldQueue
{
    use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;

    /**
     * Capture the identifiers the task needs; only the ids are stored on the job.
     *
     * @param  mixed  $url_to_crawl_id  id passed to GetAIToolScreenshotTask as the crawl target
     * @param  mixed  $ai_tool_id  id passed to GetAIToolScreenshotTask as the AI tool
     */
    public function __construct(
        protected $url_to_crawl_id,
        protected $ai_tool_id,
    ) {
    }

    /**
     * Run the screenshot task with the stored identifiers.
     */
    public function handle(): void
    {
        GetAIToolScreenshotTask::handle($this->url_to_crawl_id, $this->ai_tool_id);
    }
}
|
||||
35
app/Jobs/GetUrlBodyJob.php
Normal file
35
app/Jobs/GetUrlBodyJob.php
Normal file
@@ -0,0 +1,35 @@
|
||||
<?php
|
||||
|
||||
namespace App\Jobs;
|
||||
|
||||
use App\Jobs\Tasks\GetUrlBodyTask;
|
||||
use Illuminate\Bus\Queueable;
|
||||
use Illuminate\Contracts\Queue\ShouldQueue;
|
||||
use Illuminate\Foundation\Bus\Dispatchable;
|
||||
use Illuminate\Queue\InteractsWithQueue;
|
||||
use Illuminate\Queue\SerializesModels;
|
||||
|
||||
/**
 * Queued job that delegates to GetUrlBodyTask for a single url_to_crawl id.
 */
class GetUrlBodyJob implements ShouldQueue
{
    use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;

    /** Job timeout picked up by the queue worker (Laravel `$timeout` job property). */
    public $timeout = 60;

    /**
     * Remember which url_to_crawl row this job should fetch.
     *
     * @param  mixed  $url_to_crawl_id  id forwarded to GetUrlBodyTask::handle()
     */
    public function __construct(
        protected $url_to_crawl_id,
    ) {
    }

    /**
     * Run the fetch task for the stored id.
     */
    public function handle(): void
    {
        GetUrlBodyTask::handle($this->url_to_crawl_id);
    }
}
|
||||
35
app/Jobs/ParseUrlBodyJob.php
Normal file
35
app/Jobs/ParseUrlBodyJob.php
Normal file
@@ -0,0 +1,35 @@
|
||||
<?php
|
||||
|
||||
namespace App\Jobs;
|
||||
|
||||
use App\Jobs\Tasks\ParseUrlBodyTask;
|
||||
use Illuminate\Bus\Queueable;
|
||||
use Illuminate\Contracts\Queue\ShouldQueue;
|
||||
use Illuminate\Foundation\Bus\Dispatchable;
|
||||
use Illuminate\Queue\InteractsWithQueue;
|
||||
use Illuminate\Queue\SerializesModels;
|
||||
|
||||
/**
 * Queued job that delegates to ParseUrlBodyTask for a single url_to_crawl id.
 */
class ParseUrlBodyJob implements ShouldQueue
{
    use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;

    /** Job timeout picked up by the queue worker (Laravel `$timeout` job property). */
    public $timeout = 60;

    /**
     * Remember which url_to_crawl row this job should parse.
     *
     * @param  mixed  $url_to_crawl_id  id forwarded to ParseUrlBodyTask::handle()
     */
    public function __construct(
        protected $url_to_crawl_id,
    ) {
    }

    /**
     * Run the parse task for the stored id.
     */
    public function handle(): void
    {
        ParseUrlBodyTask::handle($this->url_to_crawl_id);
    }
}
|
||||
@@ -1,55 +0,0 @@
|
||||
<?php
|
||||
|
||||
namespace App\Jobs;
|
||||
|
||||
use App\Jobs\Tasks\GenerateShopeeAIArticleTask;
|
||||
use App\Jobs\Tasks\SaveShopeeSellerImagesTask;
|
||||
use App\Jobs\Tasks\ShopeeSellerTopProductScraperTask;
|
||||
use App\Models\Category;
|
||||
use Illuminate\Bus\Queueable;
|
||||
use Illuminate\Contracts\Queue\ShouldQueue;
|
||||
use Illuminate\Foundation\Bus\Dispatchable;
|
||||
use Illuminate\Queue\InteractsWithQueue;
|
||||
use Illuminate\Queue\SerializesModels;
|
||||
|
||||
/**
 * Queued job that scrapes a Shopee seller's top product, stores its images
 * and then generates an AI article from the scrape.
 */
class ShopeeSellerTopProductScraperJob implements ShouldQueue
{
    use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;

    /** Job timeout picked up by the queue worker (Laravel `$timeout` job property). */
    public $timeout = 1000;

    protected $seller;

    protected $country_iso;

    protected $category;

    /**
     * Capture the scrape target.
     *
     * @param  string  $seller  seller value forwarded to the scraper task
     * @param  string  $country_iso  country code forwarded to the scraper task
     * @param  Category  $category  category forwarded to the scraper task
     */
    public function __construct(string $seller, string $country_iso, Category $category)
    {
        [$this->seller, $this->country_iso, $this->category] = [$seller, $country_iso, $category];
    }

    /**
     * Scrape, then (only when the scraper produced a result) save images and
     * generate the article.
     */
    public function handle(): void
    {
        $scrape_result = ShopeeSellerTopProductScraperTask::handle(
            $this->seller,
            $this->country_iso,
            $this->category
        );

        // Nothing scraped: there is nothing to store or write about.
        if (is_null($scrape_result)) {
            return;
        }

        SaveShopeeSellerImagesTask::handle($scrape_result);

        GenerateShopeeAIArticleTask::handle($scrape_result->shopee_seller_scrape);
    }
}
|
||||
65
app/Jobs/StoreSearchEmbeddingJob.php
Normal file
65
app/Jobs/StoreSearchEmbeddingJob.php
Normal file
@@ -0,0 +1,65 @@
|
||||
<?php
|
||||
|
||||
namespace App\Jobs;
|
||||
|
||||
use App\Helpers\FirstParty\Aictio\Aictio;
|
||||
use App\Models\SearchEmbedding;
|
||||
use Exception;
|
||||
use Illuminate\Bus\Queueable;
|
||||
use Illuminate\Contracts\Queue\ShouldQueue;
|
||||
use Illuminate\Foundation\Bus\Dispatchable;
|
||||
use Illuminate\Queue\InteractsWithQueue;
|
||||
use Illuminate\Queue\SerializesModels;
|
||||
|
||||
/**
 * Queued job that stores a vector embedding for a search query, keyed by
 * (type, category_id, ai_tool_id, query).
 */
class StoreSearchEmbeddingJob implements ShouldQueue
{
    use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;

    protected $type;

    protected $category_id;

    protected $ai_tool_id;

    protected $query;

    /**
     * Create a new job instance.
     *
     * @param  mixed  $type  value stored in the `type` column
     * @param  mixed  $category_id  value stored in the `category_id` column
     * @param  mixed  $ai_tool_id  value stored in the `ai_tool_id` column
     * @param  mixed  $query  text to embed; stored verbatim in the `query` column
     */
    public function __construct($type, $category_id, $ai_tool_id, $query)
    {
        $this->type = $type;
        $this->category_id = $category_id;
        $this->ai_tool_id = $ai_tool_id;
        $this->query = $query;
    }

    /**
     * Execute the job.
     *
     * The existence check runs BEFORE the embedding request so that a row
     * which is already stored does not trigger another remote call.
     *
     * @throws Exception when the embedding service returns null
     */
    public function handle(): void
    {
        $already_stored = SearchEmbedding::where('type', $this->type)
            ->where('category_id', $this->category_id)
            ->where('ai_tool_id', $this->ai_tool_id)
            ->where('query', $this->query)
            ->exists();

        if ($already_stored) {
            return;
        }

        // The embedding is computed on the lowercased query; the stored
        // `query` column keeps the original casing (same as before).
        $embedding = Aictio::getVectorEmbedding(strtolower($this->query));

        if (is_null($embedding)) {
            throw new Exception('Failed vector embedding: '.$this->query);
        }

        $search_embedding = new SearchEmbedding;
        $search_embedding->type = $this->type;
        $search_embedding->category_id = $this->category_id;
        $search_embedding->ai_tool_id = $this->ai_tool_id;
        $search_embedding->query = $this->query;
        $search_embedding->embedding = $embedding;
        $search_embedding->save();
    }
}
|
||||
@@ -1,278 +0,0 @@
|
||||
<?php
|
||||
|
||||
namespace App\Jobs\Tasks;
|
||||
|
||||
use App\Helpers\FirstParty\OpenAI\OpenAI;
|
||||
use App\Helpers\FirstParty\OSSUploader\OSSUploader;
|
||||
use App\Models\AiWriteup;
|
||||
use App\Models\Category;
|
||||
use App\Models\Post;
|
||||
use App\Models\PostCategory;
|
||||
use App\Models\ShopeeSellerCategory;
|
||||
use App\Models\ShopeeSellerScrape;
|
||||
use App\Models\ShopeeSellerScrapedImage;
|
||||
use Exception;
|
||||
use fivefilters\Readability\Configuration as ReadabilityConfiguration;
|
||||
use fivefilters\Readability\ParseException as ReadabilityParseException;
|
||||
use fivefilters\Readability\Readability;
|
||||
use Illuminate\Support\Facades\Log;
|
||||
use LaravelFreelancerNL\LaravelIndexNow\Facades\IndexNow;
|
||||
use LaravelGoogleIndexing;
|
||||
use Masterminds\HTML5;
|
||||
use Symfony\Component\DomCrawler\Crawler;
|
||||
|
||||
class GenerateShopeeAIArticleTask
|
||||
{
|
||||
/**
 * Write and publish an AI-generated article for a scraped Shopee product.
 *
 * Reads the serialised scrape payload from storage, builds a prompt excerpt
 * from the product page, asks OpenAI for an article, stores it as an
 * AiWriteup, publishes a Post, updates the seller write counters and, in
 * production, submits the new URL for indexing.
 *
 * @param  ShopeeSellerScrape  $shopee_seller_scrape  scrape record whose stored payload is used
 * @return Post|null the created post, or null when the write-up could not be saved
 *
 * @throws Exception when the payload is missing/unreadable, a write-up for the
 *                   same source URL already exists, or OpenAI returns nothing
 */
public static function handle(ShopeeSellerScrape $shopee_seller_scrape)
{
    $serialised = OSSUploader::readFile('r2', 'shopee/seller', $shopee_seller_scrape->filename);

    $shopee_seller_scrape->load('category');

    // Previously an empty payload left $shopee_task undefined and the method
    // crashed later with an unrelated TypeError; fail early and clearly.
    if (is_empty($serialised)) {
        throw new Exception('Failed to write: empty scrape payload '.$shopee_seller_scrape->filename);
    }

    // NOTE(review): unserialize() on data read back from object storage can
    // instantiate arbitrary classes; confirm the bucket is write-protected or
    // move the payload to JSON.
    $shopee_task = unserialize($serialised);

    if (! is_object($shopee_task)) {
        throw new Exception('Failed to write: unreadable scrape payload '.$shopee_seller_scrape->filename);
    }

    $shopee_task->shopee_seller_scrape = $shopee_seller_scrape;

    $product_response = $shopee_task->product_task->response;
    $source_url = $product_response->url;

    // One write-up per source URL: check before doing any expensive work.
    $ai_writeup = AiWriteup::where('source', 'shopee')->where('source_url', $source_url)->first();

    if (! is_null($ai_writeup)) {
        $e = new Exception('Failed to write: ai_writeup found');
        Log::error(serialize($ai_writeup->toArray()));
        inspector()->reportException($e);
        throw ($e);
    }

    $excerpt = self::stripHtml($product_response->raw_html);

    // Limit to 1500 characters (+1500 output tokens, ~3k tokens total) because
    // of OpenAI model limits. mb_substr keeps multi-byte characters intact,
    // whereas substr could cut a UTF-8 sequence in half.
    $excerpt = mb_substr($excerpt, 0, 1500);

    $excerpt .= self::getProductPricingExcerpt($product_response->jsonld);

    $photos = ShopeeSellerScrapedImage::where('shopee_seller_scrape_id', $shopee_seller_scrape->id)
        ->where('featured', false)
        ->orderByRaw('RAND()')
        ->take(3)
        ->get()
        ->pluck('image')
        ->toArray();

    $categories = [
        'Beauty',
        'Technology',
        'Home & Living',
        'Health',
        'Fitness',
    ];

    $ai_output = OpenAI::writeProductArticle($excerpt, $photos, $categories);

    if (is_null($ai_output)) {
        $e = new Exception('Failed to write: Missing ai_output');
        // The old code serialised $ai_writeup here, which is always null on
        // this path; log something that identifies the failing product.
        Log::error('Failed to write: Missing ai_output for '.$source_url);
        inspector()->reportException($e);
        throw ($e);
    }

    // Prefer the category OpenAI picked (within the same locale); fall back
    // to the category the scrape was filed under.
    $picked_category = Category::where('name', $ai_output->category)
        ->where('country_locale_id', $shopee_seller_scrape->category->country_locale_id)
        ->first();

    if (is_null($picked_category)) {
        $picked_category = $shopee_seller_scrape->category;
    }

    $ai_writeup = new AiWriteup;
    $ai_writeup->source = 'shopee';
    $ai_writeup->source_url = $source_url;
    $ai_writeup->category_id = $picked_category->id;
    $ai_writeup->title = $ai_output->title;
    $ai_writeup->excerpt = $ai_output->excerpt;
    $ai_writeup->featured_image = '';
    $ai_writeup->body = $ai_output->body;
    $ai_writeup->cost = self::getTotalServiceCost($shopee_task);
    $ai_writeup->editor_format = 'markdown';

    if (! $ai_writeup->save()) {
        return null;
    }

    $featured_photo = ShopeeSellerScrapedImage::where('shopee_seller_scrape_id', $shopee_seller_scrape->id)
        ->where('featured', true)
        ->first();

    $post = Post::create([
        'publish_date' => now(),
        'title' => $ai_writeup->title,
        'slug' => str_slug($ai_writeup->title),
        'excerpt' => $ai_writeup->excerpt,
        'cliffhanger' => $ai_writeup->cliffhanger,
        'author_id' => 1,
        'featured' => false,
        // A scrape may have no featured image; do not crash on it.
        'featured_image' => $featured_photo?->image,
        'editor' => 'markdown',
        'body' => $ai_writeup->body,
        'post_format' => 'standard',
        'status' => 'publish',
    ]);

    if (is_null($post)) {
        return $post;
    }

    $shopee_seller_scrape->write_counts = $shopee_seller_scrape->write_counts + 1;
    $shopee_seller_scrape->last_ai_written_at = now();
    $shopee_seller_scrape->save();

    // Mirror the counters onto the per-seller category row, creating it on first use.
    $shopee_seller_category = ShopeeSellerCategory::where('seller', $shopee_seller_scrape->seller)->first();

    if (is_null($shopee_seller_category)) {
        $shopee_seller_category = new ShopeeSellerCategory;
        $shopee_seller_category->seller = $shopee_seller_scrape->seller;
        $shopee_seller_category->category_id = $shopee_seller_scrape->category_id;
    }

    $shopee_seller_category->last_ai_written_at = $shopee_seller_scrape->last_ai_written_at;
    $shopee_seller_category->write_counts = $shopee_seller_scrape->write_counts;
    $shopee_seller_category->save();

    PostCategory::create([
        'post_id' => $post->id,
        'category_id' => $picked_category->id,
    ]);

    // Search-engine submissions only make sense for live, published posts.
    if (app()->environment() == 'production' && $post->status == 'publish') {
        $post_url = route('home.country.post', [
            'country' => $post->post_category?->category?->country_locale_slug,
            'post_slug' => $post->slug,
        ]);

        LaravelGoogleIndexing::create()->update($post_url);
        IndexNow::submit($post_url);
    }

    return $post;
}
|
||||
|
||||
/**
 * Build a one-sentence pricing summary from the first JSON-LD "Product" node.
 *
 * @param  array  $jsonLdData  decoded JSON-LD nodes (objects) found on the product page
 * @return string the pricing sentence, or '' when no Product node is present
 *                (previously this fell through and returned null implicitly)
 */
private static function getProductPricingExcerpt(array $jsonLdData)
{
    foreach ($jsonLdData as $data) {
        // Only "Product" nodes carry the offer data we need.
        if (! isset($data->{'@type'}) || $data->{'@type'} !== 'Product') {
            continue;
        }

        $lowPrice = $data->offers->lowPrice ?? null;
        $highPrice = $data->offers->highPrice ?? null;
        $price = $data->offers->price ?? null;
        $currency = $data->offers->priceCurrency ?? null;
        $sellerName = $data->offers->seller->name ?? 'online store'; // default when the seller name is not set

        // Display Malaysian Ringgit with its local symbol.
        if ($currency === 'MYR') {
            $currency = 'RM';
        }

        // number_format() throws a TypeError on non-numeric strings such as
        // "N/A", so anything that is not numeric is treated as "not stated".
        $lowPrice = is_numeric($lowPrice) ? (float) $lowPrice : null;
        $highPrice = is_numeric($highPrice) ? (float) $highPrice : null;
        $price = is_numeric($price) ? (float) $price : null;

        if ($lowPrice && $highPrice) {
            $lowPrice = number_format($lowPrice, 0);
            $highPrice = number_format($highPrice, 0);

            return "Price Range from {$currency} {$lowPrice} to {$highPrice} in {$sellerName} online store";
        }

        if ($price) {
            $price = number_format($price, 0);

            return "Priced at {$currency} {$price} in {$sellerName} online store";
        }

        return "Price not stated, refer to {$sellerName} online store";
    }

    return '';
}
|
||||
|
||||
/**
 * Sum the cost attributed to producing one article.
 *
 * Starts from a flat 0.09 for the OpenAI call and adds the `total_cost`
 * reported by the seller-shop and product scraping responses when present.
 *
 * @param  object|null  $shopee_task  scrape payload; missing branches are simply skipped
 * @return float
 */
private static function getTotalServiceCost($shopee_task)
{
    // Flat OpenAI charge (chatgpt-3.5-turbo $0.03 for 1k, writing for 2k tokens).
    $total = 0.00 + 0.09;

    // Shopee seller scraping, then Shopee product scraping.
    $total += $shopee_task?->seller_shop_task?->response?->total_cost ?? 0;
    $total += $shopee_task?->product_task?->response?->total_cost ?? 0;

    return $total;
}
|
||||
|
||||
/**
 * Reduce a raw product page to plain text: "<title> <meta description> <main content>".
 *
 * The main content comes from Readability; if anything in that path throws,
 * the whole document is parsed with HTML5 and stripped of tags as a fallback.
 *
 * @param  string  $raw_html  full HTML of the page
 * @return string plain text with tags removed
 */
private static function stripHtml(string $raw_html)
{
    $html_content = '';

    try {
        $r_configuration = new ReadabilityConfiguration();
        $r_configuration->setCharThreshold(20);

        $readability = new Readability($r_configuration);
        $readability->parse($raw_html);

        $main_text = $readability->getContent();

        // Tabs are dropped outright; every other whitespace run (including
        // \n and \r\n) collapses to a single space.
        $main_text = str_replace("\t", '', $main_text);
        $main_text = preg_replace('/\s+/', ' ', $main_text);
        $main_text = strip_tags(trim($main_text));

        $crawler = new Crawler($raw_html);

        // text() throws on an empty node list. A page without <title> used to
        // land in the catch below and discard the Readability result, so the
        // title is now guarded the same way as the meta description.
        $titleNode = $crawler->filter('title');
        $title = $titleNode->count() > 0 ? $titleNode->text() : '';

        $metaDescriptionNode = $crawler->filter('meta[name="description"]');
        $description = $metaDescriptionNode->count() > 0 ? $metaDescriptionNode->attr('content') : null;

        $html_content .= $title.' ';
        $html_content .= $description.' ';
        $html_content .= $main_text;
    } catch (ReadabilityParseException|Exception $e) {
        // Fallback: strip tags from the whole document.
        $html5 = new HTML5(['preserveWhiteSpace' => true]);

        $dom = $html5->loadHTML($raw_html);

        $html_content = strip_tags($html5->saveHTML($dom));
    }

    return $html_content;
}
|
||||
}
|
||||
87
app/Jobs/Tasks/GetAIToolScreenshotTask.php
Normal file
87
app/Jobs/Tasks/GetAIToolScreenshotTask.php
Normal file
@@ -0,0 +1,87 @@
|
||||
<?php
|
||||
|
||||
namespace App\Jobs\Tasks;
|
||||
|
||||
use App\Helpers\FirstParty\OSSUploader\OSSUploader;
|
||||
use App\Models\AiTool;
|
||||
use App\Models\BusinessProfile;
|
||||
use App\Models\SerpUrl;
|
||||
use App\Models\UrlToCrawl;
|
||||
use Exception;
|
||||
use Image;
|
||||
use Spatie\Browsershot\Browsershot;
|
||||
|
||||
class GetAIToolScreenshotTask
|
||||
{
|
||||
|
||||
/**
 * Take a browser screenshot of a crawled URL, convert it to WebP, upload it
 * and record the file name on the AI tool.
 *
 * @param  mixed  $url_to_crawl_id  id of the UrlToCrawl whose `url` is captured
 * @param  mixed  $ai_tool_id  id of the AiTool that receives `screenshot_img`
 * @return AiTool|null the AI tool, or null when either record is missing
 *
 * @throws Exception when the screenshot cannot be taken
 */
public static function handle($url_to_crawl_id, $ai_tool_id)
{
    $url_to_crawl = UrlToCrawl::find($url_to_crawl_id);

    if (is_null($url_to_crawl)) {
        return;
    }

    $ai_tool = AiTool::find($ai_tool_id);

    if (is_null($ai_tool)) {
        return;
    }

    $userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36';

    $browsershot = Browsershot::url($url_to_crawl->url)
        ->timeout(30)
        ->emulateMedia('screen')
        ->userAgent($userAgent)
        ->windowSize(1024, 576)
        ->waitUntilNetworkIdle();

    // Local machines use explicitly configured node/npm binaries.
    if (app()->environment() == 'local') {
        $browsershot
            ->setNodeBinary(config('platform.general.node_binary'))
            ->setNpmBinary(config('platform.general.npm_binary'));
    }

    // Attempt loop; with the current limit of 1 the first failure is final.
    $maxRetries = 1;

    for ($attempt = 1; $attempt <= $maxRetries; $attempt++) {
        try {
            $image_content = $browsershot->screenshot();
            break;
        } catch (Exception $e) {
            if ($attempt === $maxRetries) {
                throw new Exception("Failed to take a screenshot after $maxRetries attempts: ".$e->getMessage(), 0, $e);
            }
        }
    }

    // Scale to 1024px wide, keeping the aspect ratio, and encode as WebP (quality 80).
    $keep_ratio = function ($constraint) {
        $constraint->aspectRatio();
    };

    $image = Image::make($image_content)->resize(1024, null, $keep_ratio)->stream('webp', 80);

    $image_file_name = str_slug($ai_tool->tool_name).'-'.epoch_now_timestamp().'.webp';

    $uploaded = OSSUploader::uploadFile(
        config('platform.uploads.ai_tools.screenshot.driver'),
        config('platform.uploads.ai_tools.screenshot.path'),
        $image_file_name,
        $image
    );

    // Only point the tool at the new file when the upload succeeded.
    if ($uploaded) {
        $ai_tool->screenshot_img = $image_file_name;
    }

    if ($ai_tool->isDirty()) {
        $ai_tool->save();
    }

    return $ai_tool;
}
|
||||
}
|
||||
204
app/Jobs/Tasks/GetUrlBodyTask.php
Normal file
204
app/Jobs/Tasks/GetUrlBodyTask.php
Normal file
@@ -0,0 +1,204 @@
|
||||
<?php
|
||||
|
||||
namespace App\Jobs\Tasks;
|
||||
|
||||
use App\Jobs\ParseUrlBodyJob;
|
||||
use App\Models\ServiceCostUsage;
|
||||
use App\Models\UrlToCrawl;
|
||||
use Exception;
|
||||
use fivefilters\Readability\Configuration as ReadabilityConfiguration;
|
||||
use fivefilters\Readability\ParseException as ReadabilityParseException;
|
||||
use fivefilters\Readability\Readability;
|
||||
use Illuminate\Support\Facades\Http;
|
||||
use League\HTMLToMarkdown\HtmlConverter;
|
||||
use Symfony\Component\DomCrawler\Crawler;
|
||||
|
||||
class GetUrlBodyTask
|
||||
{
|
||||
/**
 * Fetch a URL through the rotating proxy, store its content as Markdown-derived
 * plain text on the UrlToCrawl row and queue the parsing job.
 *
 * Any failure while fetching or converting is treated as "blocked": the row
 * gets the 'EMPTY CONTENT' marker and no parsing job is dispatched.
 *
 * @param  int  $url_to_crawl_id  id of the UrlToCrawl row to fetch
 * @return null
 */
public static function handle(int $url_to_crawl_id)
{
    $url_to_crawl = UrlToCrawl::find($url_to_crawl_id);

    if (is_null($url_to_crawl)) {
        return null;
    }

    // NOTE(review): is_crawling is set here but never reset in this method —
    // confirm something else clears it.
    $url_to_crawl->is_crawling = true;
    $url_to_crawl->save();
    $url_to_crawl->refresh();

    $raw_html = null;
    $markdown = null;

    try {
        $user_agent = config('platform.proxy.user_agent');

        $response = Http::withHeaders([
            'User-Agent' => $user_agent,
        ])
            ->withOptions([
                'proxy' => get_smartproxy_rotating_server(),
                'timeout' => 10,
                // NOTE(review): TLS verification is disabled for this request;
                // confirm this is required by the proxy setup.
                'verify' => false,
            ])
            ->get($url_to_crawl->url);

        if ($response->successful()) {
            $raw_html = $response->body();
            $cost = calculate_smartproxy_cost(round(strlen($raw_html) / 1024, 2), 'rotating_global');

            // Convert once and reuse: the same text is stored on the cost
            // record and on the crawl row (it used to be computed twice).
            $markdown = self::getMarkdownFromHtml($raw_html);

            $service_cost_usage = new ServiceCostUsage;
            $service_cost_usage->cost = $cost;
            $service_cost_usage->name = 'smartproxy-GetUrlBodyTask';
            $service_cost_usage->reference_1 = 'url_to_crawl';
            $service_cost_usage->reference_2 = strval($url_to_crawl_id);
            $service_cost_usage->output = $markdown;
            $service_cost_usage->save();
        } else {
            $raw_html = null;
            // Raises for client/server errors; caught just below.
            $response->throw();
        }
    } catch (Exception $e) {
        // Deliberate best-effort: a failed fetch/convert is recorded as blocked.
        $raw_html = null;
        $markdown = null;
    }

    if (! is_empty($raw_html)) {
        $url_to_crawl->output_type = 'markdown';
        $url_to_crawl->output = $markdown;
    } else {
        $url_to_crawl->output = 'EMPTY CONTENT';
        $url_to_crawl->status = 'blocked';
    }

    $url_to_crawl->is_crawled = true;

    if ($url_to_crawl->save()) {
        if (! in_array($url_to_crawl->status, ['blocked', 'trashed'])) {
            ParseUrlBodyJob::dispatch($url_to_crawl->id)->onQueue('default')->onConnection('default');
        }
    }
}
|
||||
|
||||
/**
 * Ask Readability for the main image of an HTML document.
 *
 * @param  mixed  $html  HTML passed straight to Readability::parse()
 * @return mixed whatever Readability::getImage() returns, or null when parsing fails
 */
private static function getMainImageFromHtml($html)
{
    $config = new ReadabilityConfiguration();
    $config->setCharThreshold(20);

    $parser = new Readability($config);

    try {
        $parser->parse($html);
        $main_image = $parser->getImage();
    } catch (ReadabilityParseException $e) {
        // Unparseable document: report "no image" instead of failing.
        $main_image = null;
    }

    return $main_image;
}
|
||||
|
||||
/**
 * Convert an HTML document to plain text via Markdown.
 *
 * Pipeline: cleanHtml -> HtmlConverter -> reverseLTGT -> normalizeNewLines ->
 * removeDuplicateLines -> markdown_to_plaintext -> html_entity_decode.
 *
 * @param  mixed  $html  raw HTML
 * @return string
 */
private static function getMarkdownFromHtml($html)
{
    $to_markdown = new HtmlConverter([
        'strip_tags' => true,
        'strip_placeholder_links' => true,
    ]);

    $converted = $to_markdown->convert(self::cleanHtml($html));

    $tidied = self::removeDuplicateLines(
        self::normalizeNewLines(
            self::reverseLTGT($converted)
        )
    );

    return html_entity_decode(markdown_to_plaintext($tidied));
}
|
||||
|
||||
/**
 * Turn the HTML entities `&lt;` and `&gt;` back into literal `<` and `>`.
 *
 * The previous body replaced `<` with `<` and `>` with `>`, which changed
 * nothing; the entity forms are what the method name refers to.
 *
 * @param  string|string[]  $input  text possibly containing `&lt;` / `&gt;`
 * @return string|string[] the input with both entities decoded
 */
private static function reverseLTGT($input)
{
    return str_replace(['&lt;', '&gt;'], ['<', '>'], $input);
}
|
||||
|
||||
/**
 * Drop every line that already appeared earlier in the text, keeping the
 * first occurrence and the original order. Blank lines count as lines, so
 * only the first blank line survives.
 *
 * @param  string  $string  newline-separated text
 * @return string
 */
private static function removeDuplicateLines($string)
{
    $seen = [];
    $kept = [];

    foreach (explode("\n", $string) as $line) {
        if (isset($seen[$line])) {
            continue;
        }

        $seen[$line] = true;
        $kept[] = $line;
    }

    return implode("\n", $kept);
}
|
||||
|
||||
/**
 * Tidy converted Markdown: trim every line, drop blank lines, pull the line
 * after an image onto the image line (as its caption) and turn runs of
 * spaces into " - " separators.
 *
 * Blank-line checks compare against '' rather than using empty(), because
 * empty('0') is true and a line consisting of "0" used to be dropped.
 *
 * @param  string  $content  Markdown text
 * @return string
 */
private static function normalizeNewLines($content)
{
    $lines = explode("\n", $content);
    $lineCount = count($lines);

    $processedLines = [];

    for ($i = 0; $i < $lineCount; $i++) {
        $line = trim($lines[$i]);

        // An image line absorbs the following line unless that line is blank
        // or a pure Markdown rule/heading marker.
        if (preg_match('/^!\[.*\]\(.*\)$/', $line) && isset($lines[$i + 1])) {
            $next = trim($lines[$i + 1]);

            if ($next !== '' && ! preg_match('/^[-=#*&_]+$/', $next)) {
                $line .= ' '.$next;
                $i++; // the next line has been merged; skip it
            }
        }

        if ($line !== '') {
            $processedLines[] = $line;
        }
    }

    // Collapse excessive newlines.
    $result = preg_replace("/\n{3,}/", "\n\n", implode("\n", $processedLines));

    // Join an image that is still followed by text on the next line.
    $result = preg_replace('/^(!\[.*?\]\(.*?\))\s*\n\s*([^\n!]+)/m', '$1 $2', $result);

    // Replace multiple spaces with a dash separator.
    $result = preg_replace('/ {2,}/', ' - ', $result);

    return $result;
}
|
||||
|
||||
/**
 * Strip non-content elements from an HTML document and flatten <span>s.
 *
 * Removes script/style/svg/picture/form/footer/nav/aside elements entirely,
 * replaces each <span> with a text node holding its text, and returns the
 * resulting outer HTML.
 *
 * @param  mixed  $htmlContent  raw HTML handed to the DomCrawler
 * @return string
 */
private static function cleanHtml($htmlContent)
{
    $document = new Crawler($htmlContent);

    // Elements that never carry article content.
    $unwanted = ['script', 'style', 'svg', 'picture', 'form', 'footer', 'nav', 'aside'];

    foreach ($unwanted as $tagName) {
        $document->filter($tagName)->each(function ($match) {
            // each() hands over a crawler wrapping exactly one DOM node.
            $element = $match->getNode(0);
            $element->parentNode->removeChild($element);
        });
    }

    // Replace <span> tags with their inner text.
    $document->filter('span')->each(function ($match) {
        $textNode = new \DOMText($match->text());

        $element = $match->getNode(0);
        $element->parentNode->replaceChild($textNode, $element);
    });

    return $document->outerHtml();
}
|
||||
}
|
||||
197
app/Jobs/Tasks/ParseUrlBodyTask.php
Normal file
197
app/Jobs/Tasks/ParseUrlBodyTask.php
Normal file
@@ -0,0 +1,197 @@
|
||||
<?php
|
||||
|
||||
namespace App\Jobs\Tasks;
|
||||
|
||||
use App\Helpers\FirstParty\OpenAI\OpenAI;
|
||||
use App\Jobs\GetAIToolScreenshotJob;
|
||||
use App\Jobs\ParseUrlBodyJob;
|
||||
use App\Jobs\StoreSearchEmbeddingJob;
|
||||
use App\Models\AiTool;
|
||||
use App\Models\AiToolKeyword;
|
||||
use App\Models\Category;
|
||||
use App\Models\ServiceCostUsage;
|
||||
use App\Models\UrlToCrawl;
|
||||
use Exception;
|
||||
|
||||
class ParseUrlBodyTask
|
||||
{
|
||||
/**
 * Turn the crawled text of a URL into an AiTool record.
 *
 * Uses the cached `metadata` on the UrlToCrawl row when present, otherwise
 * asks OpenAI for a site summary (and records its cost). The summary is then
 * mapped onto the AiTool, and follow-up jobs are queued for search
 * embeddings (tool, keywords, Q&A) and, when missing, a screenshot.
 *
 * @param  int  $url_to_crawl_id  id of the UrlToCrawl row to parse
 * @return void
 *
 * @throws Exception when the summary is empty, has no tool name, or no
 *                   category can be resolved
 */
public static function handle(int $url_to_crawl_id)
{
    $url_to_crawl = UrlToCrawl::find($url_to_crawl_id);

    if (is_null($url_to_crawl)) {
        return;
    }

    if (in_array($url_to_crawl->status, ['blocked', 'trashed'])) {
        return;
    }

    if (is_empty($url_to_crawl->output)) {
        // Nothing to summarise yet: try again later and stop here. Before,
        // execution continued and sent the empty text to OpenAI.
        // NOTE(review): this re-dispatch has no delay or attempt cap — confirm
        // that something eventually fills `output`.
        ParseUrlBodyJob::dispatch($url_to_crawl->id)->onQueue('default')->onConnection('default');

        return;
    }

    if (! is_null($url_to_crawl->metadata)) {
        $url_meta_response = $url_to_crawl->metadata;
    } else {
        // Only needed for the OpenAI prompt, so only queried on this path.
        $parent_categories = Category::whereNull('parent_id')->orderBy('name', 'ASC')->get();

        $url_meta_response = OpenAI::getSiteSummary($parent_categories, $url_to_crawl->output, 1536, 30, true);

        if (isset($url_meta_response->output)) {
            $service_cost_usage = new ServiceCostUsage;
            $service_cost_usage->cost = $url_meta_response->cost;
            $service_cost_usage->name = 'openai-getSiteSummary';
            $service_cost_usage->reference_1 = 'url_to_crawl';
            $service_cost_usage->reference_2 = strval($url_to_crawl->id);
            $service_cost_usage->output = $url_meta_response;
            $service_cost_usage->save();
        }
    }

    $output = $url_meta_response->output ?? null;

    if (is_null($output)) {
        throw new Exception('OpenAI::getSiteSummary failed. Empty object');
    }

    // Tool name is mandatory; validate it before caching the response.
    if (! isset($output->tool_name) || is_empty($output->tool_name)) {
        throw new Exception('OpenAI::getSiteSummary failed, no tool name');
    }

    // Persist the summary so later runs reuse it. The assignment used to be
    // made without ever saving the row, so the cache check above never hit.
    // NOTE(review): assumes `metadata` has an object/json cast on UrlToCrawl,
    // like ServiceCostUsage::output above — confirm.
    $url_to_crawl->metadata = $url_meta_response;
    $url_to_crawl->save();

    $ai_tool = AiTool::where('url_to_crawl_id', $url_to_crawl->id)->first();

    if (is_null($ai_tool)) {
        $ai_tool = new AiTool;
        $ai_tool->url_to_crawl_id = $url_to_crawl->id;
    }

    $ai_tool->tool_name = $output->tool_name;

    // Is AI tool: defaults to true unless the summary carries a real boolean.
    // (The old check read the misspelt `is_at_tool`, so it never passed.)
    $ai_tool->is_ai_tool = (isset($output->is_ai_tool) && is_bool($output->is_ai_tool))
        ? $output->is_ai_tool
        : true;

    // App / web / both: a single string from a fixed list. The old
    // is_array() guard could never be true together with in_array().
    if (isset($output->is_app_web_both)
        && is_string($output->is_app_web_both)
        && in_array($output->is_app_web_both, ['app', 'web', 'both'], true)) {
        $ai_tool->is_app_web_both = $output->is_app_web_both;
    }

    if (isset($output->tagline) && ! is_empty($output->tagline)) {
        $ai_tool->tagline = $output->tagline;
    }

    if (isset($output->summary) && ! is_empty($output->summary)) {
        $ai_tool->summary = $output->summary;
    }

    // Pricing type: same string-from-list rule; falls back to 'Free'.
    $pricing_types = ['Free', 'Free Trial', 'Freemium', 'Subscription', 'Usage Based'];

    if (isset($output->pricing_type)
        && is_string($output->pricing_type)
        && in_array($output->pricing_type, $pricing_types, true)) {
        $ai_tool->pricing_type = $output->pricing_type;
    } else {
        $ai_tool->pricing_type = 'Free';
    }

    // Category: the one named by the summary, else 'Productivity'.
    $main_category = null;

    if (isset($output->main_category) && ! is_empty($output->main_category)) {
        $main_category = Category::where('name', $output->main_category)->first();
    }

    if (is_null($main_category)) {
        $main_category = Category::where('name', 'Productivity')->first();
    }

    if (is_null($main_category)) {
        throw new Exception('ParseUrlBodyTask failed, no category found');
    }

    $ai_tool->category_id = $main_category->id;

    $keywords = (isset($output->keywords) && is_array($output->keywords)) ? $output->keywords : [];
    $qna_items = (isset($output->qna) && is_array($output->qna)) ? $output->qna : null;

    if (isset($output->keywords) && is_array($output->keywords)) {
        $ai_tool->keyword_string = implode(',', $keywords);
    }

    if (! is_null($qna_items)) {
        $ai_tool->qna = $qna_items;
    }

    if (! $ai_tool->save()) {
        return;
    }

    // Embedding for the tool itself: "name" or "name: tagline".
    $query = $ai_tool->tool_name;

    if (! is_empty($ai_tool->tagline)) {
        $query .= ': '.$ai_tool->tagline;
    }

    StoreSearchEmbeddingJob::dispatch('ai_tool', $ai_tool->category_id, $ai_tool->id, $query);

    if (is_empty($ai_tool->screenshot_img)) {
        GetAIToolScreenshotJob::dispatch($url_to_crawl->id, $ai_tool->id)->onQueue('default')->onConnection('default');
    }

    // One AiToolKeyword (plus embedding) per keyword not yet stored for this tool.
    foreach ($keywords as $keyword) {
        if (! is_string($keyword) || trim($keyword) === '') {
            continue;
        }

        $keyword_lowercased = strtolower(trim($keyword));

        $ai_tool_keyword = AiToolKeyword::where('value_lowercased', $keyword_lowercased)
            ->where('ai_tool_id', $ai_tool->id)
            ->first();

        if (! is_null($ai_tool_keyword)) {
            continue;
        }

        $ai_tool_keyword = new AiToolKeyword;
        $ai_tool_keyword->category_id = $ai_tool->category_id;
        $ai_tool_keyword->ai_tool_id = $ai_tool->id;
        $ai_tool_keyword->value = trim($keyword);
        $ai_tool_keyword->value_lowercased = $keyword_lowercased;

        if ($ai_tool_keyword->save()) {
            StoreSearchEmbeddingJob::dispatch(
                'ai_tool_keyword',
                $ai_tool->category_id,
                $ai_tool->id,
                $ai_tool_keyword->value
            );
        }
    }

    // One embedding per question/answer pair ("question answer").
    foreach ($qna_items ?? [] as $qna) {
        if (! isset($qna->q, $qna->a)) {
            continue;
        }

        StoreSearchEmbeddingJob::dispatch(
            'qna',
            $ai_tool->category_id,
            $ai_tool->id,
            $qna->q.' '.$qna->a
        );
    }
}
|
||||
}
|
||||
@@ -1,355 +0,0 @@
|
||||
<?php
|
||||
|
||||
namespace App\Jobs\Tasks;
|
||||
|
||||
use App\Models\ShopeeSellerScrapedImage;
|
||||
use Illuminate\Support\Facades\Http;
|
||||
use Illuminate\Support\Facades\Storage;
|
||||
use Illuminate\Support\Str;
|
||||
use Intervention\Image\Facades\Image;
|
||||
|
||||
class SaveShopeeSellerImagesTask
|
||||
{
|
||||
/**
 * Download and persist the images of a scraped Shopee product.
 *
 * Part 1 stores the product's main image (taken from the JSON-LD data) as
 * the featured image. Part 2 stores every image found in the product page
 * HTML that passes getFilteredImages(). An image is skipped when a
 * ShopeeSellerScrapedImage row with the same original file name already
 * exists for the scrape.
 *
 * @param  object  $shopee_task  Must expose product_task->response->jsonld,
 *                               product_task->response->raw_html and
 *                               shopee_seller_scrape (with an id).
 * @return void
 */
public static function handle($shopee_task)
{
    $rotating_proxy_server = get_smartproxy_rotating_server();
    $user_agent = config('platform.proxy.user_agent');
    $costs = [];

    $response = $shopee_task->product_task->response;
    $shopee_seller_scrape = $shopee_task->shopee_seller_scrape;

    ///////// PART 1: main (featured) product image
    $main_image_url = self::getProductImageUrl($response->jsonld);

    if (! is_empty($main_image_url)) {
        $scraped_image = ShopeeSellerScrapedImage::where('original_name', pathinfo($main_image_url, PATHINFO_BASENAME))
            ->where('shopee_seller_scrape_id', $shopee_seller_scrape->id)
            ->first();

        if (is_null($scraped_image)) {
            $main_image = self::getProductImage($response->jsonld, $rotating_proxy_server, $user_agent, $costs);

            // getProductImage() returns null when the download fails; the
            // uploader dereferences the image object, so skip instead of crashing.
            if (! is_null($main_image)) {
                self::uploadAndSaveScrapedImage($shopee_seller_scrape, $main_image, true);
            }
        }
    }

    ///////// PART 2: images embedded in the product page HTML
    // getFilteredImages() requires a string; a response without HTML has nothing to scan.
    if (! is_string($response->raw_html)) {
        return;
    }

    $images = self::getFilteredImages($response->raw_html, $rotating_proxy_server, $user_agent, $costs);

    if (! is_array($images)) {
        return;
    }

    foreach ($images as $image_obj) {
        $scraped_image = ShopeeSellerScrapedImage::where('original_name', $image_obj->original_name)
            ->where('shopee_seller_scrape_id', $shopee_seller_scrape->id)
            ->first();

        if (is_null($scraped_image)) {
            self::uploadAndSaveScrapedImage($shopee_seller_scrape, $image_obj, false);
        }
    }
}
|
||||
|
||||
/**
 * Encode an image as JPEG, upload it together with a tiny LQIP variant to
 * the 'r2' disk, and record it as a ShopeeSellerScrapedImage.
 *
 * @param  object  $shopee_seller_scrape  Scrape record; only its id is read.
 * @param  object  $image_obj             Must expose `intervention` (the image
 *                                        instance) and `original_name`.
 * @param  bool    $featured              Stored on the record as-is.
 * @return ShopeeSellerScrapedImage|null  The saved model, or null when save() fails.
 *
 * @throws \RuntimeException When no temporary file can be created.
 */
private static function uploadAndSaveScrapedImage($shopee_seller_scrape, $image_obj, $featured = false)
{
    // Build the shared prefix once so the full-size and LQIP files always
    // pair up, even if the clock ticks over between the two names.
    $baseName = time().'_'.Str::uuid()->toString();
    $filePath = 'uploads/'.$baseName.'.jpg';
    $lqipFilePath = 'uploads/'.$baseName.'_lqip.jpg';

    $image = $image_obj->intervention;

    // Re-encode as JPEG at quality 50 and upload the full-size version.
    $image->encode('jpg', 50);
    Storage::disk('r2')->put($filePath, $image->stream()->detach());

    // Round-trip through a temporary file to get an independent copy for the
    // LQIP, leaving the caller's image instance untouched by the resize.
    $tempImagePath = tempnam(sys_get_temp_dir(), 'temp_image');

    if ($tempImagePath === false) {
        throw new \RuntimeException('Unable to create a temporary file for the LQIP image.');
    }

    try {
        file_put_contents($tempImagePath, (string) $image->encode());

        // 10x10 placeholder at very low quality.
        $lqipImage = Image::make($tempImagePath)->fit(10, 10, function ($constraint) {
            $constraint->aspectRatio();
        });
        $lqipImage->encode('jpg', 5);

        Storage::disk('r2')->put($lqipFilePath, $lqipImage->stream()->detach());
    } finally {
        // Always remove the temporary file, even when encoding or upload fails.
        unlink($tempImagePath);
    }

    $scraped_image = new ShopeeSellerScrapedImage;
    $scraped_image->shopee_seller_scrape_id = $shopee_seller_scrape->id;
    $scraped_image->original_name = $image_obj->original_name;
    // Public URL of the full-size (non-LQIP) upload.
    $scraped_image->image = Storage::disk('r2')->url($filePath);
    $scraped_image->featured = $featured;

    return $scraped_image->save() ? $scraped_image : null;
}
|
||||
|
||||
/**
 * Extract the `src` (and `alt`, when present) of every <img> tag in an HTML string.
 *
 * Each tag is matched on its own and the two attributes are then looked up
 * independently, so the result does not depend on attribute order and a
 * match can never run across into a following tag. Tags without a quoted
 * `src` are skipped. A trailing "_tn" marker before the file extension is
 * removed from the src.
 *
 * @param  string  $raw_html
 * @return array<int, array{src: string, alt: string|null}>
 */
private static function getImageUrls(string $raw_html)
{
    $images = [];

    if (! preg_match_all('/<img\b[^>]*>/i', $raw_html, $tags)) {
        return $images;
    }

    foreach ($tags[0] as $tag) {
        // First quoted "src=" inside the tag. This deliberately also matches
        // attributes that end in "src" (e.g. data-src), as the previous
        // single-pattern implementation did.
        if (! preg_match('/src\s*=\s*(["\'])(.*?)\1/is', $tag, $srcMatch)) {
            continue;
        }

        // Drop a "_tn" suffix: "photo_tn.jpg" -> "photo.jpg".
        $src = preg_replace('/_tn(\.[a-z]+)?$/i', '$1', $srcMatch[2]);

        $alt = preg_match('/(?<![\w-])alt\s*=\s*(["\'])(.*?)\1/is', $tag, $altMatch)
            ? $altMatch[2]
            : null;

        $images[] = [
            'src' => $src,
            'alt' => $alt,
        ];
    }

    return $images;
}
|
||||
|
||||
/**
 * Download the images referenced by a page and keep the ones that look like
 * usable product photos.
 *
 * An image survives when it downloads successfully, is a JPEG of at least
 * 800x800 px and 100 KB, and is not taller than it is wide. Survivors are
 * scaled to 800 px wide and de-duplicated on (width, height, mime, download
 * size within 30 KB). Entries whose colour count is not above 10% of the
 * median colour count are then dropped, and the rest are returned ordered by
 * download size, largest first.
 *
 * @param  string  $raw_html    Page HTML to scan for <img> tags.
 * @param  string  $proxy       Proxy passed to the HTTP client.
 * @param  string  $user_agent  User-Agent header sent with each download.
 * @param  array   $costs       By reference; receives a 'count-N' proxy cost
 *                              for each image that passes the size and
 *                              duplicate checks (N = 1-based position in the page).
 * @return object[]  Each with src, alt, width, height, mime, sizeKb,
 *                   color_counts, intervention and original_name.
 */
private static function getFilteredImages(string $raw_html, string $proxy, string $user_agent, &$costs)
{
    $candidates = self::getImageUrls($raw_html);

    $kept = [];
    $seenSignatures = []; // width/height/mime/sizeKb of every image kept so far
    $position = 0;

    foreach ($candidates as $candidate) {
        $position++;
        $src = $candidate['src'];

        try {
            $response = Http::withOptions(['proxy' => $proxy, 'verify' => false])
                ->withHeaders(['User-Agent' => $user_agent])
                ->get($src);

            if (! $response->successful()) {
                continue;
            }

            $bytes = $response->body();
            $picture = Image::make($bytes);

            // Size of the download in KB.
            $sizeKb = round(strlen($bytes) / 1024, 2);

            $bigEnough = $picture->width() >= 800 && $picture->height() >= 800 && $sizeKb >= 100;
            $isJpeg = $picture->mime() === 'image/jpeg';
            $isPortrait = $picture->height() > $picture->width();

            if (! $bigEnough || ! $isJpeg || $isPortrait) {
                continue;
            }

            // Normalise to 800 px wide, keeping the aspect ratio.
            $picture->resize(800, null, function ($constraint) {
                $constraint->aspectRatio();
            });

            $width = $picture->width();
            $height = $picture->height();
            $mime = $picture->mime();

            $candidate['width'] = $width;
            $candidate['height'] = $height;
            $candidate['mime'] = $mime;
            $candidate['sizeKb'] = $sizeKb;

            // Same dimensions and type with a download size within 30 KB
            // of an image already kept counts as a duplicate.
            foreach ($seenSignatures as $seen) {
                $sameShape = $seen['width'] == $width && $seen['height'] == $height && $seen['mime'] == $mime;

                if ($sameShape && abs($seen['sizeKb'] - $sizeKb) <= 30) {
                    continue 2;
                }
            }

            $seenSignatures[] = [
                'width' => $width,
                'height' => $height,
                'mime' => $mime,
                'sizeKb' => $sizeKb,
            ];

            $candidate['color_counts'] = self::getImageColorCounts($picture);
            $candidate['intervention'] = $picture;
            $candidate['original_name'] = pathinfo($src, PATHINFO_BASENAME);

            $costs['count-'.$position] = calculate_smartproxy_cost($sizeKb, 'rotating_global');

            $kept[] = $candidate;
        } catch (\Exception $e) {
            // A failed download or unreadable image just skips this candidate.
            continue;
        }
    }

    $colorCounts = array_column($kept, 'color_counts');

    if ($colorCounts !== []) {
        // Median colour count across the kept images.
        sort($colorCounts);
        $total = count($colorCounts);
        $mid = intdiv($total, 2);

        if ($total % 2 === 0) {
            $median = ($colorCounts[$mid - 1] + $colorCounts[$mid]) / 2;
        } else {
            $median = $colorCounts[$mid];
        }

        // Drop low outliers: anything not above 10% of the median.
        $threshold = 0.10 * $median;
        $kept = array_filter($kept, function ($entry) use ($threshold) {
            return $entry['color_counts'] > $threshold;
        });
    }

    // Largest download first.
    usort($kept, function ($left, $right) {
        return $right['sizeKb'] <=> $left['sizeKb'];
    });

    return array_map(function ($entry) {
        return (object) $entry;
    }, $kept);
}
|
||||
|
||||
/**
 * Return the image URL of the first usable JSON-LD "Product" entry.
 *
 * The caller uses the basename of this value to look up an already stored
 * image, and stored images are named after the basename of the entry's
 * `image`. The image URL is therefore what must be returned here; returning
 * the product page `url` meant the lookup could never match. An entry
 * qualifies under the same condition getProductImage() uses (both `url` and
 * `image` set), and additionally `image` must be a string.
 *
 * @param  array  $jsonLdData  Decoded JSON-LD entries.
 * @return string|null  The image URL, or null when no entry qualifies.
 */
private static function getProductImageUrl(array $jsonLdData)
{
    foreach ($jsonLdData as $data) {
        // Only "Product" entries are of interest.
        if (! isset($data->{'@type'}) || $data->{'@type'} !== 'Product') {
            continue;
        }

        if (isset($data->url, $data->image) && is_string($data->image)) {
            return $data->image;
        }
    }

    return null;
}
|
||||
|
||||
/**
 * Download the main image of the first JSON-LD "Product" entry that has
 * both a `url` and an `image`, and fit it into a 1920x1080 frame.
 *
 * The proxy cost is computed from the number of bytes downloaded, the same
 * basis getFilteredImages() uses, rather than from the size of the
 * re-encoded, resized image.
 *
 * @param  array   $jsonLdData  Decoded JSON-LD entries.
 * @param  string  $proxy       Proxy passed to the HTTP client.
 * @param  string  $user_agent  User-Agent header sent with the download.
 * @param  array   $costs       By reference; receives the 'product_image' cost.
 * @return object|null  Object with url, intervention, original_name and cost;
 *                      null when nothing could be downloaded or the request
 *                      raised an exception.
 */
private static function getProductImage(array $jsonLdData, string $proxy, string $user_agent, &$costs)
{
    foreach ($jsonLdData as $data) {
        // Only "Product" entries carrying both a page URL and an image URL qualify.
        if (! isset($data->{'@type'}) || $data->{'@type'} !== 'Product') {
            continue;
        }

        if (! isset($data->url) || ! isset($data->image)) {
            continue;
        }

        try {
            $response = Http::withOptions(['proxy' => $proxy, 'verify' => false])
                ->withHeaders(['User-Agent' => $user_agent])
                ->get($data->image);

            // An unsuccessful response moves on to the next entry.
            if (! $response->successful()) {
                continue;
            }

            $imageData = $response->body();
            $interventionImage = Image::make($imageData);

            // Crop/resize into a 1920x1080 frame.
            // NOTE(review): the upsize() constraint normally prevents
            // enlarging, so smaller sources may not reach 1920x1080 — confirm.
            $interventionImage->fit(1920, 1080, function ($constraint) {
                $constraint->upsize();
                $constraint->aspectRatio();
            });

            // Cost is based on what actually went through the proxy.
            $sizeInKb = round(strlen($imageData) / 1024, 2);
            $cost = calculate_smartproxy_cost($sizeInKb, 'rotating_global');

            $costs['product_image'] = $cost;

            return (object) [
                'url' => $data->url,
                'intervention' => $interventionImage,
                'original_name' => pathinfo($data->image, PATHINFO_BASENAME),
                'cost' => $cost,
            ];
        } catch (\Exception $e) {
            // A failed request or unreadable image aborts the lookup.
            return null;
        }
    }

    return null;
}
|
||||
|
||||
/**
 * Count the distinct pixel colours of a shrunken, blurred copy of an image.
 *
 * The copy is resized to 200 px wide and blurred before counting; the
 * original instance is not modified. The caller compares the counts of
 * several images to discard low outliers.
 *
 * @param  object  $interventionImage  Image instance supporting clone,
 *                                     resize(), blur() and encode().
 * @return int  Number of distinct colour values found.
 *
 * @throws \RuntimeException When the encoded image cannot be decoded by GD.
 */
private static function getImageColorCounts($interventionImage)
{
    // Work on a copy so the caller's image keeps its size and sharpness.
    $img = clone $interventionImage;

    // Smaller dimensions make the per-pixel scan cheap.
    $img->resize(200, null, function ($constraint) {
        $constraint->aspectRatio();
    });

    $img->blur(10);

    $im = imagecreatefromstring((string) $img->encode());

    // imagecreatefromstring() returns false on undecodable data. Raise an
    // \Exception subclass so callers that catch \Exception can skip the
    // image instead of hitting a TypeError in imagesx().
    if ($im === false) {
        throw new \RuntimeException('Unable to decode image data for colour counting.');
    }

    $width = imagesx($im);
    $height = imagesy($im);

    // Colour values are used as array keys so duplicates collapse.
    $uniqueColors = [];

    for ($x = 0; $x < $width; $x++) {
        for ($y = 0; $y < $height; $y++) {
            $uniqueColors[imagecolorat($im, $x, $y)] = true;
        }
    }

    imagedestroy($im);

    return count($uniqueColors);
}
|
||||
}
|
||||
@@ -1,133 +0,0 @@
|
||||
<?php
|
||||
|
||||
namespace App\Jobs\Tasks;
|
||||
|
||||
use App\Helpers\FirstParty\OSSUploader\OSSUploader;
|
||||
use App\Models\Category;
|
||||
use App\Models\ShopeeSellerScrape;
|
||||
use Exception;
|
||||
|
||||
class ShopeeSellerTopProductScraperTask
|
||||
{
|
||||
/**
 * Scrape a Shopee seller's shop page and its top-ranked product page.
 *
 * When a ShopeeSellerScrape row already exists for the seller and country,
 * the previously stored result is read back from storage and returned.
 * Otherwise the shop page is crawled, the products found in its JSON-LD are
 * ranked, the first product page that crawls successfully is taken, and the
 * combined result is stored and recorded.
 *
 * @param  string    $seller       Seller handle used in the shop URL.
 * @param  string    $country_iso  Country code; lower-cased before use.
 * @param  Category  $category     Category recorded on a new scrape row.
 * @return object|null  Object with seller_shop_task, product_task and
 *                      shopee_seller_scrape; null when no product page
 *                      could be scraped or the record could not be saved.
 *
 * @throws Exception When $seller is empty.
 */
public static function handle(string $seller, string $country_iso, Category $category)
{
    $country_iso = strtolower($country_iso);

    if (is_empty($seller)) {
        throw new Exception('Missing \'seller\' attribute.');
    }

    $shopee_seller_scrape = ShopeeSellerScrape::where('seller', $seller)
        ->where('country_iso', $country_iso)
        ->first();

    if (! is_null($shopee_seller_scrape)) {
        $serialised = OSSUploader::readFile('r2', 'shopee/seller', $shopee_seller_scrape->filename);

        if (! is_empty($serialised)) {
            // NOTE(review): unserialize() instantiates whatever classes the
            // payload names. This is only safe while the bucket contents are
            // written exclusively by this task — confirm, or restrict with
            // the allowed_classes option.
            $obj = unserialize($serialised);
            $obj->shopee_seller_scrape = $shopee_seller_scrape;

            return $obj;
        }
    }

    $epoch = epoch_now_timestamp();

    // NOTE(review): the domain is fixed to shopee.com.my regardless of
    // $country_iso — confirm this is intended.
    $seller_shop_url = "https://shopee.com.my/{$seller}?page=0&sortBy=sales";

    $seller_shop_task = UrlCrawlerTask::handle($seller_shop_url, 'shopee/seller', $epoch, true, false);

    if (! isset($seller_shop_task->response->jsonld)) {
        return null;
    }

    $top_rank_products = self::getSortedData($seller_shop_task->response->jsonld, 400);

    // Take the first ranked product whose page crawls successfully.
    $product_task = null;

    foreach ($top_rank_products as $product) {
        // The crawler requires a string URL; entries without one cannot be crawled.
        if (is_empty($product->url)) {
            continue;
        }

        $candidate = UrlCrawlerTask::handle($product->url, 'shopee/seller', $epoch, true, true);

        if ($candidate->response->status_code >= 0) {
            $product_task = $candidate;
            break;
        }
    }

    // Nothing usable: do not persist a scrape that has no product page.
    if (is_null($product_task)) {
        return null;
    }

    $scraped = (object) [
        'seller_shop_task' => (object) [
            'response' => $seller_shop_task->response,
        ],
        'product_task' => (object) [
            'response' => $product_task->response,
        ],
    ];

    $filename = $seller.'-'.$epoch.'-'.$country_iso.'.txt';

    OSSUploader::uploadFile('r2', 'shopee/seller', $filename, serialize($scraped));

    $shopee_seller_scrape = new ShopeeSellerScrape;
    $shopee_seller_scrape->seller = $seller;
    $shopee_seller_scrape->country_iso = $country_iso;
    $shopee_seller_scrape->epoch = $epoch;
    $shopee_seller_scrape->filename = $filename;
    $shopee_seller_scrape->category_id = $category->id;

    if ($shopee_seller_scrape->save()) {
        return (object) compact('seller_shop_task', 'product_task', 'shopee_seller_scrape');
    }

    return null;
}
|
||||
|
||||
/**
 * Pick the JSON-LD "Product" entries priced above a minimum and rank them.
 *
 * An entry qualifies when its @type is "Product" and either its
 * offers->lowPrice or its offers->price is greater than $minValue. Results
 * are ordered by rating count, then rating value, both descending, and
 * mapped to plain objects with normalised numeric fields.
 *
 * @param  array      $data      Decoded JSON-LD entries; non-objects are ignored.
 * @param  int|float  $minValue  Exclusive lower bound for the price.
 * @return object[]   Objects with name, description, url, image, lowPrice,
 *                    highPrice, price, priceCurrency, ratingCount, ratingValue.
 */
private static function getSortedData($data, $minValue)
{
    $filtered = array_filter($data, function ($item) use ($minValue) {
        if (! is_object($item) || ($item->{'@type'} ?? null) !== 'Product') {
            return false;
        }

        $lowPrice = floatval($item->offers?->lowPrice ?? 0);
        $price = floatval($item->offers?->price ?? 0);

        // Parenthesised on purpose: without the grouping, && binds tighter
        // than || and any non-Product entry with a high price slips through.
        return ($lowPrice > $minValue) || ($price > $minValue);
    });

    // Most-rated first; ties broken by the higher rating value.
    usort($filtered, function ($a, $b) {
        $ratingCountA = intval($a->aggregateRating?->ratingCount ?? 0);
        $ratingCountB = intval($b->aggregateRating?->ratingCount ?? 0);

        if ($ratingCountA !== $ratingCountB) {
            return $ratingCountB <=> $ratingCountA;
        }

        $ratingValueA = floatval($a->aggregateRating?->ratingValue ?? 0);
        $ratingValueB = floatval($b->aggregateRating?->ratingValue ?? 0);

        return $ratingValueB <=> $ratingValueA;
    });

    return array_map(function ($item) {
        return (object) [
            'name' => $item->name ?? null,
            'description' => $item->description ?? null,
            'url' => $item->url ?? null,
            'image' => $item->image ?? null,
            'lowPrice' => floatval($item->offers?->lowPrice ?? 0),
            'highPrice' => floatval($item->offers?->highPrice ?? 0),
            'price' => floatval($item->offers?->price ?? 0),
            'priceCurrency' => $item->offers?->priceCurrency ?? null,
            'ratingCount' => intval($item->aggregateRating?->ratingCount ?? 0),
            'ratingValue' => floatval($item->aggregateRating?->ratingValue ?? 0),
        ];
    }, $filtered);
}
|
||||
}
|
||||
@@ -1,236 +0,0 @@
|
||||
<?php
|
||||
|
||||
namespace App\Jobs\Tasks;
|
||||
|
||||
use App\Helpers\FirstParty\OSSUploader\OSSUploader;
|
||||
use Exception;
|
||||
use Illuminate\Support\Facades\Http;
|
||||
use Spatie\Browsershot\Browsershot;
|
||||
use Spatie\Browsershot\Exceptions\UnsuccessfulResponse;
|
||||
use Symfony\Component\DomCrawler\Crawler;
|
||||
|
||||
class UrlCrawlerTask
|
||||
{
|
||||
/**
 * Return the HTML of a URL, served from storage when present and otherwise
 * fetched through the unblocker proxy and stored.
 *
 * The stored file is named "<slug of url>-<postfix>.html" inside $directory
 * on the 'r2' driver.
 *
 * response->status_code values:
 *    0  HTML was read from storage
 *    1  HTML was fetched and stored
 *   -1  HTML was fetched but the body was empty, so nothing was stored
 *   -3  the request was not successful but did not raise (raw_html is null)
 *
 * @param  string  $url
 * @param  string  $directory     Storage directory for the cached HTML.
 * @param  mixed   $postfix       Cast to string and appended to the file name.
 * @param  bool    $strip_html    Currently unused.
 * @param  bool    $parse_images  Currently unused.
 * @return object  Object whose `response` holds url, postfix, filename,
 *                 raw_html, jsonld, status_code, costs and total_cost.
 *
 * @throws Exception When the HTTP request fails.
 */
public static function handle(string $url, $directory, $postfix = null, $strip_html = false, $parse_images = false)
{
    $driver = 'r2';
    $postfix = strval($postfix);
    $filename = str_slug($url).'-'.$postfix.'.html';

    // Joined with a single "/" so the directory and file name do not run together.
    $disk_url = rtrim((string) $directory, '/').'/'.$filename;

    $status_code = 0;
    $costs = [];

    // A read error is treated the same as "not stored yet".
    try {
        $raw_html = OSSUploader::readFile($driver, $directory, $filename);
    } catch (Exception $e) {
        $raw_html = null;
    }

    if (is_null($raw_html)) {
        $status_code = -1;

        // NOTE(review): TLS verification is disabled and the timeout is
        // 1000 seconds; both look proxy-related — confirm they are intended.
        $response = Http::withHeaders([
            'User-Agent' => config('platform.proxy.user_agent'),
        ])
            ->withOptions([
                'proxy' => get_smartproxy_unblocker_server(),
                'timeout' => 1000,
                'verify' => false,
            ])
            ->get($url);

        if ($response->successful()) {
            $raw_html = $response->body();
            $costs['unblocker'] = calculate_smartproxy_cost(round(strlen($raw_html) / 1024, 2), 'unblocker');
        } else {
            $status_code = -3;

            // Raises for client/server errors; other non-2xx responses fall
            // through to the empty result at the bottom.
            $response->throw();
        }

        if (! is_empty($raw_html)) {
            OSSUploader::uploadFile($driver, $directory, $filename, $raw_html);
            $status_code = 1;
        }
    }

    if (is_null($raw_html)) {
        return (object) [
            'response' => (object) [
                'url' => $url,
                'postfix' => $postfix,
                'filename' => null,
                'raw_html' => null,
                'jsonld' => [],
                'status_code' => $status_code,
                'costs' => $costs,
                'total_cost' => 0,
            ],
        ];
    }

    return (object) [
        'response' => (object) [
            'url' => $url,
            'postfix' => $postfix,
            'filename' => $disk_url,
            'raw_html' => $raw_html,
            'jsonld' => self::getJsonLd($raw_html),
            'status_code' => $status_code,
            'costs' => $costs,
            'total_cost' => array_sum(array_values($costs)),
        ],
    ];
}
|
||||
|
||||
/**
 * Collect the decoded JSON-LD objects embedded in an HTML document.
 *
 * Every <script type="application/ld+json"> element is decoded. Scripts
 * holding invalid JSON are skipped instead of contributing a null entry,
 * and a script whose top level is a list contributes each object in that
 * list, so consumers always iterate over objects.
 *
 * @param  string  $raw_html
 * @return object[]  Decoded entries; empty when none are found.
 */
private static function getJsonLd(string $raw_html)
{
    $crawler = new Crawler($raw_html);

    try {
        $scripts = $crawler->filter('script[type="application/ld+json"]')->each(function (Crawler $node) {
            return $node->text();
        });
    } catch (Exception $e) {
        return [];
    }

    $contents = [];

    foreach ($scripts as $content) {
        try {
            // Without JSON_THROW_ON_ERROR json_decode() never throws and
            // silently yields null for malformed input.
            $decoded = json_decode($content, false, 512, JSON_THROW_ON_ERROR);
        } catch (\JsonException $e) {
            continue;
        }

        // Treat a single object like a one-element list.
        $entries = is_array($decoded) ? $decoded : [$decoded];

        foreach ($entries as $entry) {
            if (is_object($entry)) {
                $contents[] = $entry;
            }
        }
    }

    return $contents;
}
|
||||
|
||||
/**
 * Minify an HTML string and strip presentational markup from it.
 *
 * Removes the class, id and style attributes of every element as well as
 * all <style> elements, then returns the crawler's HTML.
 *
 * @param  string  $raw_html
 * @return string
 */
private static function minifyAndCleanHtml(string $raw_html)
{
    $raw_html = self::minifyHTML($raw_html);

    $crawler = new Crawler($raw_html);

    // Nothing parsed: there is no node to clean or to render.
    if ($crawler->count() === 0) {
        return $raw_html;
    }

    foreach ($crawler as $domElement) {
        /** @var \DOMNodeList $nodes */
        $nodes = $domElement->getElementsByTagName('*');

        foreach ($nodes as $node) {
            /** @var \DOMElement $node */
            $node->removeAttribute('class');
            $node->removeAttribute('id');
            $node->removeAttribute('style');
        }

        // Remove <style> elements of this root. Done inside the loop so
        // every root is handled and the loop variable is never read after
        // the loop. Iterating backwards keeps the node list indexes valid
        // while removing.
        $styleTags = $domElement->getElementsByTagName('style');

        for ($i = $styleTags->length; --$i >= 0;) {
            $styleNode = $styleTags->item($i);
            $styleNode->parentNode->removeChild($styleNode);
        }
    }

    return $crawler->html();
}
|
||||
|
||||
/**
 * Collapse whitespace between tags and remove HTML comments.
 *
 * If a replacement fails (preg_replace() returns null), the text from the
 * previous step is kept rather than turning the whole document into null.
 *
 * @param  string  $input
 * @return string
 */
private static function minifyHTML($input)
{
    // Remove whitespace that sits between a closing ">" and the next "<".
    $collapsed = preg_replace('/>\s+</', '><', $input);

    if ($collapsed !== null) {
        $input = $collapsed;
    }

    // Remove comments. The "s" flag lets "." span newlines; this replaces
    // the alternation "(.|\s)*?", which is prone to exhausting the regex
    // engine's limits on long comments.
    $stripped = preg_replace('/<!--.*?-->/s', '', $input);

    if ($stripped !== null) {
        $input = $stripped;
    }

    return $input;
}
|
||||
|
||||
/**
 * Build the Google web-cache address for a URL.
 *
 * The query string of $url is dropped first; "&strip=1" is appended when
 * $stripHtml is truthy.
 *
 * @param  string  $url
 * @param  bool    $stripHtml
 * @return string
 */
private static function getGoogleCachedUrl(string $url, $stripHtml = false)
{
    $target = self::stripUrlQueryParameters($url);
    $suffix = $stripHtml ? '&strip=1' : '';

    return 'https://webcache.googleusercontent.com/search?q=cache:'.$target.$suffix;
}
|
||||
|
||||
/**
 * Return a URL without its query string.
 *
 * Scheme, host, port, path and fragment are kept when present; user
 * credentials are not carried over. URLs without a scheme or host (for
 * example relative paths) are rebuilt from whatever parts exist instead of
 * triggering undefined-index warnings.
 *
 * @param  string  $url
 * @return string
 */
private static function stripUrlQueryParameters(string $url)
{
    $parts = parse_url($url);

    // parse_url() gives up on seriously malformed URLs; fall back to
    // cutting at the first "?".
    if ($parts === false) {
        return explode('?', $url, 2)[0];
    }

    $newUrl = '';

    if (isset($parts['scheme'])) {
        $newUrl .= $parts['scheme'].':';
    }

    if (isset($parts['host'])) {
        $newUrl .= '//'.$parts['host'];

        // A non-default port is part of the address and must survive.
        if (isset($parts['port'])) {
            $newUrl .= ':'.$parts['port'];
        }
    }

    $newUrl .= $parts['path'] ?? '';

    if (isset($parts['fragment'])) {
        $newUrl .= '#'.$parts['fragment'];
    }

    return $newUrl;
}
|
||||
}
|
||||
Reference in New Issue
Block a user