Update (parsing): skip pages with fewer than 120 words

This commit is contained in:
2023-11-29 12:24:15 +08:00
parent c70b0bccd4
commit 8c0ae63f42
3 changed files with 24 additions and 1 deletions

View File

@@ -3,6 +3,20 @@
use Carbon\Carbon;
use Illuminate\Support\Str;
if (! function_exists('count_words')) {
    /**
     * Count the words in a piece of text.
     *
     * Any run of punctuation and/or whitespace is treated as a single word
     * separator, so "Hello, world!" counts as two words.
     *
     * @param  string|null  $string  Text to count; null is treated as empty.
     * @return int Number of words found (0 for empty input).
     */
    function count_words($string): int
    {
        $text = $string ?? '';

        // Split directly on runs of Unicode punctuation / whitespace instead of
        // first rewriting them to spaces and then splitting a second time.
        $words = preg_split('/[\p{P}\s]+/u', $text, -1, PREG_SPLIT_NO_EMPTY);

        if ($words === false) {
            // The /u modifier makes PCRE reject subjects that are not valid
            // UTF-8. Rather than reporting zero words for such input, retry
            // byte-wise using ASCII punctuation and whitespace as separators.
            $words = preg_split('/[[:punct:]\s]+/', $text, -1, PREG_SPLIT_NO_EMPTY);
        }

        return $words === false ? 0 : count($words);
    }
}
if (! function_exists('dmy')) {
function dmy(Carbon $carbon)
{

View File

@@ -31,6 +31,15 @@ public static function handle(int $url_to_crawl_id)
if (is_empty($url_to_crawl->output)) {
ParseUrlBodyJob::dispatch($url_to_crawl->id)->onQueue('default')->onConnection('default');
return ;
}
if (count_words($url_to_crawl->output) < 120)
{
$url_to_crawl->status = 'blocked';
$url_to_crawl->save();
return ;
}
$url_meta_response = null;

View File

@@ -14,7 +14,7 @@ public function run(): void
{
$parent_categories = [
['name' => 'NFSW', 'emoji' => '👅', 'is_top' => true],
['name' => 'NSFW', 'emoji' => '👅', 'is_top' => true],
];
foreach ($parent_categories as $item) {