From 1f7bfb130ee03fdc50299186c679bd511bb8acd7 Mon Sep 17 00:00:00 2001 From: Charles Teh Date: Thu, 23 Nov 2023 21:40:53 +0800 Subject: [PATCH] Update (rss): add domain blacklist --- app/Jobs/Tasks/BrowseRSSLatestNewsTask.php | 14 ++++++ app/Jobs/Tasks/ParseRssPostMetadataTask.php | 53 ++++++++++++++------- config/platform/global.php | 7 +++ routes/tests.php | 2 + 4 files changed, 58 insertions(+), 18 deletions(-) diff --git a/app/Jobs/Tasks/BrowseRSSLatestNewsTask.php b/app/Jobs/Tasks/BrowseRSSLatestNewsTask.php index 8ad1f28..f78de74 100644 --- a/app/Jobs/Tasks/BrowseRSSLatestNewsTask.php +++ b/app/Jobs/Tasks/BrowseRSSLatestNewsTask.php @@ -11,6 +11,7 @@ public static function handleMulti($hours = 3) { $rss_urls = config('platform.global.rss'); + $raw_posts = []; foreach ($rss_urls as $rss_url) { @@ -26,6 +27,8 @@ public static function handleMulti($hours = 3) public static function handleSingle($rss_url, $hours = 3) { + $blacklist_rss_post_domain = config('platform.global.blacklist_rss_post_domain'); + $f = FeedReader::read($rss_url); @@ -41,6 +44,17 @@ public static function handleSingle($rss_url, $hours = 3) $title = trim($item->get_title()); $description = trim($item->get_content()); + + $domain = get_domain_from_url($item->get_link()); + + if (in_array($domain, $blacklist_rss_post_domain)) + { + continue ; + } + + + + $raw_posts[] = (object) [ 'source' => $f->get_title(), 'source_url' => $rss_url, diff --git a/app/Jobs/Tasks/ParseRssPostMetadataTask.php b/app/Jobs/Tasks/ParseRssPostMetadataTask.php index 0d1970a..4b57fcf 100644 --- a/app/Jobs/Tasks/ParseRssPostMetadataTask.php +++ b/app/Jobs/Tasks/ParseRssPostMetadataTask.php @@ -141,34 +141,51 @@ public static function handle(int $rss_post_id) $rss_post->keyword_list = implode(',', $words_to_add_in_keyword_list); - $rss_post->status = 'published'; + + if (is_empty($rss_post->bites)) + { + $rss_post->status = 'blocked'; + } + else + { + $rss_post->status = 'published'; + } + if ($rss_post->save()) { - $has_saved_keyword = false; - $deleted_rpk = RssPostKeyword::where('rss_post_id', $rss_post->id)->delete(); - foreach ($words_to_save as $word_to_save) { + if ($rss_post->status == 'published') + { - $new_rpk = new RssPostKeyword; - $new_rpk->rss_post_id = $rss_post->id; - $new_rpk->type = $word_to_save->type; - $new_rpk->is_main = $word_to_save->is_main; - $new_rpk->value = $word_to_save->value; - $new_rpk->value_lowercased = $word_to_save->value_lowercased; + $has_saved_keyword = false; + + $deleted_rpk = RssPostKeyword::where('rss_post_id', $rss_post->id)->delete(); + + foreach ($words_to_save as $word_to_save) { + + $new_rpk = new RssPostKeyword; + $new_rpk->rss_post_id = $rss_post->id; + $new_rpk->type = $word_to_save->type; + $new_rpk->is_main = $word_to_save->is_main; + $new_rpk->value = $word_to_save->value; + $new_rpk->value_lowercased = $word_to_save->value_lowercased; + + if ($new_rpk->save()) { + if (! $has_saved_keyword) { + $has_saved_keyword = true; + } - if ($new_rpk->save()) { - if (! $has_saved_keyword) { - $has_saved_keyword = true; } - } + + if ($has_saved_keyword) { + $rss_post->keyword_saved = true; + $rss_post->save(); + } + } - if ($has_saved_keyword) { - $rss_post->keyword_saved = true; - $rss_post->save(); - } } } diff --git a/config/platform/global.php b/config/platform/global.php index a84e914..21560e3 100644 --- a/config/platform/global.php +++ b/config/platform/global.php @@ -10,6 +10,13 @@ 'https://u.today', ], + 'blacklist_rss_post_domain' => [ + 'www.techinasia.com', + 'www.marktechpost.com', + 'twitter.com', + 'www.youtube.com', + ], + 'rss' => [ 'http://news.ycombinator.com/rss', // 'http://blog.samaltman.com/posts.atom', diff --git a/routes/tests.php b/routes/tests.php index e0fb71c..d69d0e9 100644 --- a/routes/tests.php +++ b/routes/tests.php @@ -57,6 +57,8 @@ $last_record = RssPost::whereNull('post_domain')->orderBy('id', 'DESC')->first(); + dd($last_record); + for ($i = 1; $i <= $last_record->id; $i++) { $rss_post = RssPost::find($i);