From 6580463fcb870f3518385a49743cd7f66823f020 Mon Sep 17 00:00:00 2001 From: Charles T Date: Wed, 29 Nov 2023 12:46:37 +0800 Subject: [PATCH] Update (fix): infinite loop --- app/Jobs/Tasks/GetUrlBodyTask.php | 14 ++++++++-- app/Jobs/Tasks/ParseUrlBodyTask.php | 3 +- app/Models/UrlToCrawl.php | 2 ++ ...dd_crawl_counts_to_url_to_crawls_table.php | 28 +++++++++++++++++++ 4 files changed, 44 insertions(+), 3 deletions(-) create mode 100644 database/migrations/2023_11_29_043747_add_crawl_counts_to_url_to_crawls_table.php diff --git a/app/Jobs/Tasks/GetUrlBodyTask.php b/app/Jobs/Tasks/GetUrlBodyTask.php index 2073355..650ff50 100644 --- a/app/Jobs/Tasks/GetUrlBodyTask.php +++ b/app/Jobs/Tasks/GetUrlBodyTask.php @@ -23,12 +23,17 @@ public static function handle(int $url_to_crawl_id) return null; } + if (in_array($url_to_crawl->status, ['blocked', 'trashed'])) { + return; + } + $enable_proxy = false; $url_to_crawl->is_crawling = true; $url_to_crawl->save(); $url_to_crawl->refresh(); + // try { $user_agent = config('platform.proxy.user_agent'); @@ -66,9 +71,13 @@ public static function handle(int $url_to_crawl_id) // //throw $e; // } - if (! is_empty($raw_html)) { + + $markdown_output = self::getMarkdownFromHtml($raw_html); + + + if (! is_empty($markdown_output)) { $url_to_crawl->output_type = 'markdown'; - $url_to_crawl->output = self::getMarkdownFromHtml($raw_html); + $url_to_crawl->output = $markdown_output; } else { $url_to_crawl->output = 'EMPTY CONTENT'; @@ -76,6 +85,7 @@ public static function handle(int $url_to_crawl_id) } $url_to_crawl->is_crawled = true; + $url_to_crawl->crawl_counts = $url_to_crawl->crawl_counts + 1; if ($url_to_crawl->save()) { if (! in_array($url_to_crawl->status, ['blocked', 'trashed'])) { diff --git a/app/Jobs/Tasks/ParseUrlBodyTask.php b/app/Jobs/Tasks/ParseUrlBodyTask.php index 0fb1c2d..be7fbd6 100644 --- a/app/Jobs/Tasks/ParseUrlBodyTask.php +++ b/app/Jobs/Tasks/ParseUrlBodyTask.php @@ -4,6 +4,7 @@ use App\Helpers\FirstParty\OpenAI\OpenAI; use App\Jobs\GetAIToolScreenshotJob; +use App\Jobs\GetUrlBodyJob; use App\Jobs\ParseUrlBodyJob; use App\Jobs\StoreSearchEmbeddingJob; use App\Models\AiTool; @@ -30,7 +31,7 @@ public static function handle(int $url_to_crawl_id) } if (is_empty($url_to_crawl->output)) { - ParseUrlBodyJob::dispatch($url_to_crawl->id)->onQueue('default')->onConnection('default'); + GetUrlBodyJob::dispatch($url_to_crawl->id)->onQueue('default')->onConnection('default'); return ; } diff --git a/app/Models/UrlToCrawl.php b/app/Models/UrlToCrawl.php index e00d9bc..7c11704 100644 --- a/app/Models/UrlToCrawl.php +++ b/app/Models/UrlToCrawl.php @@ -32,6 +32,7 @@ class UrlToCrawl extends Model 'is_crawling' => 'bool', 'is_crawled' => 'bool', 'metadata' => 'object', + 'crawl_counts' => 'integer', ]; protected $fillable = [ @@ -43,5 +44,6 @@ class UrlToCrawl extends Model 'output', 'metadata', 'status', + 'crawl_counts', ]; } diff --git a/database/migrations/2023_11_29_043747_add_crawl_counts_to_url_to_crawls_table.php b/database/migrations/2023_11_29_043747_add_crawl_counts_to_url_to_crawls_table.php new file mode 100644 index 0000000..55988f6 --- /dev/null +++ b/database/migrations/2023_11_29_043747_add_crawl_counts_to_url_to_crawls_table.php @@ -0,0 +1,28 @@ +integer('crawl_counts')->default(0); + }); + } + + /** + * Reverse the migrations. + */ + public function down(): void + { + Schema::table('url_to_crawls', function (Blueprint $table) { + $table->dropColumn('crawl_counts'); + }); + } +};