Update (parsing): skip pages with fewer than 120 words
This commit is contained in:
@@ -3,6 +3,20 @@
|
||||
use Carbon\Carbon;
|
||||
use Illuminate\Support\Str;
|
||||
|
||||
if (! function_exists('count_words')) {
    /**
     * Count the words in a piece of text.
     *
     * Punctuation and whitespace are both treated as separators, so a token
     * such as "don't" is counted as two words.
     *
     * @param  mixed  $string  Text to measure; null is treated as an empty string.
     * @return int Number of non-empty tokens found in the text.
     */
    function count_words($string): int
    {
        $text = $string ?? '';

        // Unicode-aware pass: turn every punctuation or whitespace character into a space.
        $cleanString = preg_replace('/[\p{P}\s]/u', ' ', $text);

        // With the "u" modifier preg_replace() returns null when the subject is
        // not valid UTF-8. Without a fallback such text would be counted as zero
        // words, so retry byte-wise with ASCII punctuation only.
        if ($cleanString === null) {
            $cleanString = preg_replace('/[[:punct:]\s]/', ' ', $text) ?? '';
        }

        // Split on runs of whitespace, discarding empty pieces.
        $words = preg_split('/\s+/', $cleanString, -1, PREG_SPLIT_NO_EMPTY);

        return $words === false ? 0 : count($words);
    }
}
|
||||
|
||||
|
||||
if (! function_exists('dmy')) {
|
||||
function dmy(Carbon $carbon)
|
||||
{
|
||||
|
||||
@@ -31,6 +31,15 @@ public static function handle(int $url_to_crawl_id)
|
||||
|
||||
if (is_empty($url_to_crawl->output)) {
|
||||
ParseUrlBodyJob::dispatch($url_to_crawl->id)->onQueue('default')->onConnection('default');
|
||||
return ;
|
||||
}
|
||||
|
||||
if (count_words($url_to_crawl->output) < 120)
|
||||
{
|
||||
$url_to_crawl->status = 'blocked';
|
||||
$url_to_crawl->save();
|
||||
|
||||
return ;
|
||||
}
|
||||
|
||||
$url_meta_response = null;
|
||||
|
||||
@@ -14,7 +14,7 @@ public function run(): void
|
||||
{
|
||||
|
||||
$parent_categories = [
|
||||
['name' => 'NFSW', 'emoji' => '👅', 'is_top' => true],
|
||||
['name' => 'NSFW', 'emoji' => '👅', 'is_top' => true],
|
||||
];
|
||||
|
||||
foreach ($parent_categories as $item) {
|
||||
|
||||
Reference in New Issue
Block a user