Add (news bites)
This commit is contained in:
60
app/Jobs/Tasks/BrowseRSSLatestNewsTask.php
Normal file
60
app/Jobs/Tasks/BrowseRSSLatestNewsTask.php
Normal file
@@ -0,0 +1,60 @@
|
||||
<?php
|
||||
|
||||
namespace App\Jobs\Tasks;
|
||||
|
||||
use Carbon\Carbon;
|
||||
use Vedmant\FeedReader\Facades\FeedReader;
|
||||
|
||||
class BrowseRSSLatestNewsTask
{
    /**
     * Collect recent posts from every feed listed under the
     * `platform.global.rss` config key.
     *
     * @param  int|float  $hours  Size of the look-back window, in hours.
     * @return array  Flat list of post objects, in feed order, as built by handleSingle().
     */
    public static function handleMulti($hours = 3)
    {
        // A missing config key yields null; the cast turns that into "no feeds"
        // (and a single URL string into a one-element list) instead of a
        // foreach warning.
        $rss_urls = (array) config('platform.global.rss');

        $raw_posts = [];

        foreach ($rss_urls as $rss_url) {
            foreach (self::handleSingle($rss_url, $hours) as $item) {
                $raw_posts[] = $item;
            }
        }

        return $raw_posts;
    }

    /**
     * Read one feed and return the items dated within the last `$hours` hours.
     *
     * Each returned element is a stdClass with the properties `source`,
     * `source_url`, `title`, `link`, `description`, `date` (Carbon instance)
     * and `category`.
     *
     * @param  string  $rss_url  Feed URL handed to FeedReader::read().
     * @param  int|float  $hours  Size of the look-back window, in hours.
     * @return array
     */
    public static function handleSingle($rss_url, $hours = 3)
    {
        $f = FeedReader::read($rss_url);

        // Compute the window once so every item is judged against the same bounds.
        $window_end = now();
        $window_start = $window_end->copy()->subHours($hours);

        $raw_posts = [];

        foreach ($f->get_items() ?? [] as $item) {
            $raw_date = $item->get_date(\DateTime::ATOM);

            // Carbon::parse() on an empty value yields the current time, which
            // would make every undated item look freshly published. Skip them.
            // NOTE(review): assumes get_date() returns an empty value for
            // items without a date — confirm against the feed library.
            if (empty($raw_date)) {
                continue;
            }

            $post_datetime = Carbon::parse($raw_date);

            if (! $post_datetime->isBetween($window_start, $window_end)) {
                continue;
            }

            $raw_posts[] = (object) [
                'source' => $f->get_title(),
                'source_url' => $rss_url,
                // Cast before trim(): passing null to trim() is deprecated.
                'title' => trim((string) $item->get_title()),
                'link' => $item->get_link(),
                'description' => trim((string) $item->get_content()),
                'date' => $post_datetime,
                'category' => $item->get_category()?->term,
            ];
        }

        unset($f);

        return $raw_posts;
    }
}
|
||||
167
app/Jobs/Tasks/CrawlRssPostTask.php
Normal file
167
app/Jobs/Tasks/CrawlRssPostTask.php
Normal file
@@ -0,0 +1,167 @@
|
||||
<?php
|
||||
|
||||
namespace App\Jobs\Tasks;
|
||||
|
||||
use App\Jobs\ParseRssPostMetadataJob;
|
||||
use App\Models\RssPost;
|
||||
use Exception;
|
||||
use Illuminate\Support\Facades\Http;
|
||||
use League\HTMLToMarkdown\HtmlConverter;
|
||||
use Symfony\Component\DomCrawler\Crawler;
|
||||
|
||||
class CrawlRssPostTask
{
    /**
     * Fetch the article page behind an RssPost, store its plain-text body and,
     * when the body looks usable, queue the metadata-parsing job.
     *
     * A post whose fetched body is empty or shorter than 800 bytes is marked
     * `blocked` and is not forwarded to ParseRssPostMetadataJob.
     *
     * @param  int  $rss_post_id  Primary key of the RssPost to crawl.
     * @return null
     */
    public static function handle(int $rss_post_id)
    {
        $rss_post = RssPost::find($rss_post_id);

        if (is_null($rss_post)) {
            return null;
        }

        $raw_html = null;

        try {
            $response = Http::withHeaders([
                'User-Agent' => config('platform.proxy.user_agent'),
            ])
                ->withOptions([
                    'proxy' => get_smartproxy_rotating_server(),
                    'timeout' => 10,
                    // NOTE(review): TLS certificate verification is disabled
                    // here — confirm this is required by the proxy setup.
                    'verify' => false,
                ])
                ->get($rss_post->post_url);

            if ($response->successful()) {
                $raw_html = $response->body();

                // NOTE(review): this cost figure is computed but never stored
                // or returned — confirm whether it should be persisted.
                $costs = [
                    'unblocker' => calculate_smartproxy_cost(round(strlen($raw_html) / 1024, 2), 'rotating_global'),
                ];
            } else {
                // Raise so the failure takes the same path as a transport error.
                $response->throw();
            }
        } catch (Exception $e) {
            // Best effort: any failure is treated as "no content fetched".
            $raw_html = null;
        }

        $rss_post->body = is_empty($raw_html)
            ? 'EMPTY CONTENT'
            : self::getMarkdownFromHtml($raw_html);

        if (is_empty($rss_post->body) || $rss_post->body === 'EMPTY CONTENT' || strlen($rss_post->body) < 800) {
            $rss_post->status = 'blocked';
        }

        if ($rss_post->save() && ! in_array($rss_post->status, ['blocked', 'trashed'], true)) {
            ParseRssPostMetadataJob::dispatch($rss_post->id)->onConnection('default')->onQueue('default');
        }

        return null;
    }

    /**
     * Convert a raw HTML page into plain text: clean the DOM, convert to
     * Markdown, tidy the Markdown, then strip it down to plain text.
     */
    private static function getMarkdownFromHtml(string $html)
    {
        $converter = new HtmlConverter([
            'strip_tags' => true,
            'strip_placeholder_links' => true,
        ]);

        $markdown = $converter->convert(self::cleanHtml($html));

        $markdown = self::reverseLTGT($markdown);
        $markdown = self::normalizeNewLines($markdown);
        $markdown = self::removeDuplicateLines($markdown);

        return html_entity_decode(markdown_to_plaintext($markdown));
    }

    /**
     * Turn the HTML entities `&lt;` and `&gt;` back into literal angle brackets.
     */
    private static function reverseLTGT(string $input): string
    {
        return str_replace(['&lt;', '&gt;'], ['<', '>'], $input);
    }

    /**
     * Drop every line that exactly repeats an earlier line, keeping the first
     * occurrence and the original order.
     */
    private static function removeDuplicateLines(string $string): string
    {
        return implode("\n", array_unique(explode("\n", $string)));
    }

    /**
     * Trim every line, drop blank lines, glue an image line to the text line
     * that follows it, and replace runs of spaces with a ` - ` separator.
     */
    private static function normalizeNewLines(string $content): string
    {
        $lines = explode("\n", $content);
        $total = count($lines);

        $processedLines = [];

        for ($i = 0; $i < $total; $i++) {
            $line = trim($lines[$i]);

            // An image line absorbs the next line unless that line is blank or
            // consists only of Markdown rule/heading characters.
            if (preg_match("/^!\[.*\]\(.*\)$/", $line) && isset($lines[$i + 1])) {
                $next = trim($lines[$i + 1]);

                if ($next !== '' && ! preg_match('/^[-=#*&_]+$/', $next)) {
                    $line .= ' '.$next;
                    $i++; // The next line has been merged; skip it.
                }
            }

            // Compare against '' rather than empty(): a line reading "0" is
            // real content and must not be discarded.
            if ($line !== '') {
                $processedLines[] = $line;
            }
        }

        $result = implode("\n", $processedLines);

        // Collapse excessive newlines.
        $result = preg_replace("/\n{3,}/", "\n\n", $result) ?? $result;

        // Join an image with the text on the following line.
        $result = preg_replace('/^(!\[.*?\]\(.*?\))\s*\n\s*([^\n!]+)/m', '$1 $2', $result) ?? $result;

        // Replace multiple spaces with a dash separator.
        $result = preg_replace('/ {2,}/', ' - ', $result) ?? $result;

        return $result;
    }

    /**
     * Remove non-content elements from the HTML and flatten every <span> to
     * its text, returning the outer HTML of the cleaned document.
     */
    private static function cleanHtml(string $htmlContent)
    {
        $crawler = new Crawler($htmlContent);

        // Tags removed together with everything inside them.
        $tagsToRemove = ['script', 'style', 'svg', 'picture', 'form', 'footer', 'nav', 'aside'];

        foreach ($tagsToRemove as $tag) {
            $crawler->filter($tag)->each(static function ($node) {
                foreach ($node as $child) {
                    $child->parentNode?->removeChild($child);
                }
            });
        }

        // Replace <span> tags with their text content.
        $crawler->filter('span')->each(static function ($node) {
            $replacement = new \DOMText($node->text());

            foreach ($node as $child) {
                $child->parentNode?->replaceChild($replacement, $child);
            }
        });

        return $crawler->outerHtml();
    }
}
|
||||
@@ -176,8 +176,6 @@ private static function setPostImage($post)
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
|
||||
try {
|
||||
|
||||
$main_image_url = $serp_url_research->main_image;
|
||||
@@ -215,7 +213,7 @@ private static function setPostImage($post)
|
||||
$image->destroy();
|
||||
|
||||
break;
|
||||
|
||||
|
||||
} catch (Exception $e) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -96,9 +96,8 @@ public static function handle(NewsSerpResult $news_serp_result, $serp_counts = 1
|
||||
continue;
|
||||
}
|
||||
|
||||
if ($serp_url->picked == true)
|
||||
{
|
||||
continue;
|
||||
if ($serp_url->picked == true) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
117
app/Jobs/Tasks/ParseRssPostMetadataTask.php
Normal file
117
app/Jobs/Tasks/ParseRssPostMetadataTask.php
Normal file
@@ -0,0 +1,117 @@
|
||||
<?php
|
||||
|
||||
namespace App\Jobs\Tasks;
|
||||
|
||||
use App\Helpers\FirstParty\OpenAI\OpenAI;
|
||||
use App\Models\Category;
|
||||
use App\Models\RssPost;
|
||||
use App\Models\ServiceCostUsage;
|
||||
|
||||
class ParseRssPostMetadataTask
{
    /**
     * Enrich an RssPost with AI-generated metadata and publish it.
     *
     * Reuses the metadata already stored on the post when present; otherwise
     * requests it from OpenAI::getRssPostMeta() and records the cost of that
     * call. Title, keywords, entities, summary, impact fields and category
     * are copied from the response when available. Posts with status
     * `blocked` or `trashed` are left untouched.
     *
     * NOTE(review): the post is set to `published` even when no metadata
     * output was obtained — confirm this is intended.
     *
     * @param  int  $rss_post_id  Primary key of the RssPost to process.
     * @return void
     */
    public static function handle(int $rss_post_id)
    {
        $rss_post = RssPost::find($rss_post_id);

        if (is_null($rss_post)) {
            return;
        }

        if (in_array($rss_post->status, ['blocked', 'trashed'], true)) {
            return;
        }

        if (! is_null($rss_post->metadata)) {
            $post_meta_response = $rss_post->metadata;
        } else {
            $post_meta_response = OpenAI::getRssPostMeta($rss_post->body, 1536, 30);

            // Only a response that produced output is billed to the usage log.
            if (isset($post_meta_response->output)) {
                $service_cost_usage = new ServiceCostUsage;
                $service_cost_usage->cost = $post_meta_response->cost;
                $service_cost_usage->name = 'openai-getRssPostMeta';
                $service_cost_usage->reference_1 = 'rss_post';
                $service_cost_usage->reference_2 = strval($rss_post->id);
                $service_cost_usage->output = $post_meta_response;
                $service_cost_usage->save();
            }
        }

        $output = $post_meta_response->output ?? null;

        $words_to_add_in_body = [];

        if (! is_null($output)) {
            $rss_post->metadata = $post_meta_response;

            if (isset($output->title) && ! is_empty($output->title)) {
                $rss_post->title = $output->title;
                // NOTE(review): the slug receives the raw title; confirm a
                // model mutator slugifies it, otherwise this needs a slug helper.
                $rss_post->slug = $output->title;
            }

            $keywords = self::wordList($output->keywords ?? null);

            if ($keywords !== null) {
                $rss_post->keywords = $output->keywords;
                $words_to_add_in_body = array_merge($words_to_add_in_body, $keywords);
            }

            $entities = self::wordList($output->entities ?? null);

            if ($entities !== null) {
                $rss_post->entities = $output->entities;
                $words_to_add_in_body = array_merge($words_to_add_in_body, $entities);
            }

            if (isset($output->summary) && ! is_empty($output->summary)) {
                $rss_post->bites = $output->summary;
            }

            if (isset($output->society_impact) && ! is_empty($output->society_impact)) {
                $rss_post->impact = $output->society_impact;
            }

            if (isset($output->society_impact_level) && ! is_empty($output->society_impact_level)) {
                $rss_post->impact_level = $output->society_impact_level;
            }

            // Category: use the suggested one when it exists, else "Updates".
            $category_name = 'Updates';

            if (isset($output->category) && ! is_empty($output->category)) {
                $category_name = $output->category;
            }

            $category = Category::where('name', $category_name)->first()
                ?? Category::where('name', 'Updates')->first();

            // The fallback category may be absent too; leave category_id as
            // it was rather than dereferencing null.
            if (! is_null($category)) {
                $rss_post->category_id = $category->id;
            }
        }

        // Append the collected words as separate, space-delimited tokens on
        // their own line so they do not fuse with the body text or each other.
        if ($words_to_add_in_body !== []) {
            $rss_post->body = $rss_post->body."\n".implode(' ', $words_to_add_in_body);
        }

        $rss_post->status = 'published';
        $rss_post->save();
    }

    /**
     * Return the scalar entries of a non-empty array as a list, or null when
     * the value is not an array or is empty.
     */
    private static function wordList($value)
    {
        if (! is_array($value) || count($value) === 0) {
            return null;
        }

        // Non-scalar entries cannot be joined into the body text.
        return array_values(array_filter($value, 'is_scalar'));
    }
}
|
||||
@@ -3,11 +3,11 @@
|
||||
namespace App\Jobs\Tasks;
|
||||
|
||||
use App\Models\Post;
|
||||
use App\Notifications\PostWasPublished;
|
||||
use Exception;
|
||||
use Illuminate\Support\Facades\Notification;
|
||||
use LaravelFreelancerNL\LaravelIndexNow\Facades\IndexNow;
|
||||
use LaravelGoogleIndexing;
|
||||
use Illuminate\Support\Facades\Notification;
|
||||
use App\Notifications\PostWasPublished;
|
||||
|
||||
class PublishIndexPostTask
|
||||
{
|
||||
@@ -37,12 +37,7 @@ public static function handle(int $post_id)
|
||||
|
||||
}
|
||||
|
||||
Notification::route('facebook','default')->notify(new PostWasPublished($post));
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Notification::route('facebook', 'default')->notify(new PostWasPublished($post));
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user