if (! function_exists('count_words')) {
    /**
     * Count the words in a piece of text.
     *
     * A word is any maximal run of characters containing neither whitespace
     * nor Unicode punctuation (\p{P}). Punctuation therefore separates words:
     * "don't" counts as two words and "3.14" as two.
     *
     * @param  string|null  $string  Text to measure; null is treated as an empty string.
     * @return int  Number of words found (0 for empty or separator-only input).
     */
    function count_words($string): int
    {
        $text = (string) $string;

        if ($text === '') {
            return 0;
        }

        // Punctuation and whitespace are both separators, so split on runs of
        // either in a single pass instead of replacing first and splitting after.
        $words = preg_split('/[\p{P}\s]+/u', $text, -1, PREG_SPLIT_NO_EMPTY);

        if ($words === false) {
            // With the /u modifier PCRE refuses subjects that are not valid
            // UTF-8 and the call fails outright. Reporting 0 words for such text
            // would badly under-count it, so retry byte-wise. The ASCII
            // [:punct:] class is slightly broader than \p{P} (it also covers
            // symbols such as "+" and "$"), so this count is an approximation.
            $words = preg_split('/[[:punct:]\s]+/', $text, -1, PREG_SPLIT_NO_EMPTY);
        }

        return $words === false ? 0 : count($words);
    }
}
function_exists('dmy')) { function dmy(Carbon $carbon) { diff --git a/app/Jobs/Tasks/ParseUrlBodyTask.php b/app/Jobs/Tasks/ParseUrlBodyTask.php index 70172af..0fb1c2d 100644 --- a/app/Jobs/Tasks/ParseUrlBodyTask.php +++ b/app/Jobs/Tasks/ParseUrlBodyTask.php @@ -31,6 +31,15 @@ public static function handle(int $url_to_crawl_id) if (is_empty($url_to_crawl->output)) { ParseUrlBodyJob::dispatch($url_to_crawl->id)->onQueue('default')->onConnection('default'); + return ; + } + + if (count_words($url_to_crawl->output) < 120) + { + $url_to_crawl->status = 'blocked'; + $url_to_crawl->save(); + + return ; } $url_meta_response = null; diff --git a/database/seeders/NewCategorySeeder.php b/database/seeders/NewCategorySeeder.php index da1a6c5..889ed6b 100644 --- a/database/seeders/NewCategorySeeder.php +++ b/database/seeders/NewCategorySeeder.php @@ -14,7 +14,7 @@ public function run(): void { $parent_categories = [ - ['name' => 'NFSW', 'emoji' => '👅', 'is_top' => true], + ['name' => 'NSFW', 'emoji' => '👅', 'is_top' => true], ]; foreach ($parent_categories as $item) {