Add (initial): futurewalker code
app/Jobs/Tasks/BrowseDFSForResearchTask.php (new file, 132 lines)
@@ -0,0 +1,132 @@
<?php

namespace App\Jobs\Tasks;

use App\Helpers\FirstParty\DFS\DFSSerp;
use App\Helpers\FirstParty\OpenAI\OpenAI;
use App\Helpers\ThirdParty\DFS\SettingSerpLiveAdvanced;
use App\Jobs\CrawlUrlResearchJob;
use App\Models\SerpUrl;
use App\Models\SerpUrlResearch;
use App\Models\ServiceCostUsage;
use Exception;

class BrowseDFSForResearchTask
{
    public static function handle(int $serp_url_id)
    {
        $serp_url = SerpUrl::find($serp_url_id);

        if ((! is_null($serp_url)) && (! is_null($serp_url->suggestion_data))) {
            if (isset($serp_url->suggestion_data->proposed_search_queries)) {
                if (count($serp_url->suggestion_data->proposed_search_queries) > 0) {
                    $search_query = $serp_url->suggestion_data->proposed_search_queries[0];

                    // $serp_model = new SettingSerpLiveAdvanced;

                    // $serp_model->setSe('google');
                    // $serp_model->setSeType('organic');
                    // $serp_model->setKeyword(strtolower($search_query));
                    // $serp_model->setLocationName('United States');
                    // //$serp_model->setDepth(100);
                    // $serp_model->setLanguageCode('en');
                    // $serp_res = $serp_model->getAsJson();

                    // print_r($serp_res);
                    // die();

                    $country_name = get_country_name_by_iso($serp_url->country_iso);

                    $serp_res = DFSSerp::liveAdvanced('google', 'news', $search_query, $country_name, 'en', 100);

                    try {
                        $serp_obj = json_decode($serp_res, false, 512, JSON_THROW_ON_ERROR);

                        if ($serp_obj?->status_code == 20000) {

                            $service_cost_usage = new ServiceCostUsage;
                            $service_cost_usage->cost = $serp_obj->cost;
                            $service_cost_usage->name = 'dataforseo-GoogleSerpApiAdvancedLiveNews';
                            $service_cost_usage->reference_1 = 'google';
                            $service_cost_usage->reference_2 = 'news';
                            $service_cost_usage->output = $serp_obj;
                            $service_cost_usage->input_1 = $country_name;
                            $service_cost_usage->input_2 = $search_query;
                            $service_cost_usage->save();

                            $results = $serp_obj?->tasks[0]?->result[0]?->items ?? [];

                            //$results = $serp_obj?->result[0]?->items;

                            // dump($serp_obj);
                            // exit();

                            $saved_count = 0;

                            $first_serp_url_research = null;

                            foreach ($results as $key => $result) {
                                if ($result->type == 'news_search') {
                                    $serp_url_research = SerpUrlResearch::where('url', $result->url)->where('serp_url_id', $serp_url_id)->first();

                                    if (is_null($serp_url_research)) {
                                        //dump($result->url);

                                        $serp_url_research = new SerpUrlResearch;
                                        $serp_url_research->serp_url_id = $serp_url_id;
                                        $serp_url_research->url = $result->url;
                                        $serp_url_research->query = $search_query;
                                        $serp_url_research->content = null;
                                        if ($serp_url_research->save()) {
                                            $saved_count++;
                                        }
                                    }
                                }
                                if ($saved_count >= 10) {
                                    break;
                                }
                            }

                            $first_serp_url_research = SerpUrlResearch::where('serp_url_id', $serp_url_id)->orderBy('created_at', 'ASC')->whereNull('content')->first();

                            // Only dispatch when there is actually a pending research row.
                            if (! is_null($first_serp_url_research)) {
                                CrawlUrlResearchJob::dispatch($first_serp_url_research->id)->onQueue('default')->onConnection('default');
                            }
                        }
                    } catch (Exception $e) {
                        throw $e;
                    }

                }
            }
        }
    }
}

// Example suggestion_data payload consumed above:
// {
//     "identified_keywords":[
//         "Humane AI Pin",
//         "costs",
//         "OpenAI",
//         "T-Mobile integration"
//     ],
//     "related_keywords":[
//         "artificial intelligence device",
//         "monthly subscription",
//         "OpenAI partnership",
//         "T-Mobile collaboration"
//     ],
//     "proposed_search_queries":[
//         "Humane AI Pin features",
//         "Cost of Humane AI Pin",
//         "Humane AI Pin integration with OpenAI and T-Mobile",
//         "Reviews of Humane AI Pin"
//     ],
//     "writing_tone":[
//         "engaging",
//         "informative"
//     ],
//     "article_headings":[
//         "Introduction to Humane AI Pin",
//         "Features of Humane AI Pin",
//         "Cost and Subscription Details",
//         "OpenAI and T-Mobile Integration"
//     ]
// }
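For context, this task is reached through a queued job: IdentifyCrawlSourcesTask later in this diff dispatches BrowseDFSForResearchJob, whose implementation is not part of the commit. A minimal sketch of what such a wrapper presumably looks like, assuming standard Laravel queued-job boilerplate (the class body below is hypothetical):

```php
<?php

namespace App\Jobs;

use App\Jobs\Tasks\BrowseDFSForResearchTask;
use Illuminate\Bus\Queueable;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Bus\Dispatchable;

// Hypothetical sketch; the real BrowseDFSForResearchJob is not shown in this commit.
class BrowseDFSForResearchJob implements ShouldQueue
{
    use Dispatchable, Queueable;

    public function __construct(public int $serp_url_id)
    {
    }

    public function handle(): void
    {
        // Delegate straight to the task, mirroring the task/job split used throughout this diff.
        BrowseDFSForResearchTask::handle($this->serp_url_id);
    }
}
```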
@@ -2,46 +2,59 @@
 
 namespace App\Jobs\Tasks;
 
+use App\Helpers\FirstParty\DFS\DFSSerp;
 use App\Helpers\FirstParty\OSSUploader\OSSUploader;
 use App\Helpers\ThirdParty\DFS\SettingSerpLiveAdvanced;
 use App\Models\Category;
 use App\Models\NewsSerpResult;
 use App\Models\ServiceCostUsage;
 use DFSClientV3\DFSClient;
 use Exception;
 use Illuminate\Support\Facades\Log;
 
-class GetNewsSerpTask
+class BrowseDFSLatestNewsTask
 {
-    public static function handle(Category $category, $country_iso)
+    public static function handle(string $keyword, $country_iso)
     {
         $country_name = get_country_name_by_iso($country_iso);
 
-        $keyword = strtolower("{$category->name}");
-        $client = new DFSClient(
-            config('dataforseo.login'),
-            config('dataforseo.password'),
-            config('dataforseo.timeout'),
-            config('dataforseo.api_version'),
-            config('dataforseo.url'),
-        );
+        // $client = new DFSClient(
+        //     config('dataforseo.login'),
+        //     config('dataforseo.password'),
+        //     config('dataforseo.timeout'),
+        //     config('dataforseo.api_version'),
+        //     config('dataforseo.url'),
+        // );
 
-        // You will receive SERP data specific to the indicated keyword, search engine, and location parameters
-        $serp_model = new SettingSerpLiveAdvanced();
-        $serp_model->setSe('google');
-        $serp_model->setSeType('news');
-        $serp_model->setKeyword($keyword);
-        $serp_model->setLocationName($country_name);
-        $serp_model->setDepth(100);
-        $serp_model->setLanguageCode('en');
-        $serp_res = $serp_model->getAsJson();
+        // // You will receive SERP data specific to the indicated keyword, search engine, and location parameters
+        // $serp_model = new SettingSerpLiveAdvanced();
+        // $serp_model->setSe('google');
+        // $serp_model->setSeType('news');
+        // $serp_model->setSearchParam('&tbs=qdr:d');
+        // $serp_model->setKeyword($keyword);
+        // $serp_model->setLocationName($country_name);
+        // $serp_model->setDepth(100);
+        // $serp_model->setLanguageCode('en');
+        // $serp_res = $serp_model->getAsJson();
+
+        $serp_res = DFSSerp::liveAdvanced('google', 'news', $keyword, $country_name, 'en', 100, '&tbs=qdr:d');
 
         try {
             $serp_obj = json_decode($serp_res, false, 512, JSON_THROW_ON_ERROR);
 
             if ($serp_obj?->status_code == 20000) {
-                $json_file_name = config('platform.dataset.news.news_serp.file_prefix').str_slug($category->name).'-'.epoch_now_timestamp().'.json';
-
                 $service_cost_usage = new ServiceCostUsage;
                 $service_cost_usage->cost = $serp_obj->cost;
                 $service_cost_usage->name = 'dataforseo-GoogleSerpApiAdvancedLiveNews';
                 $service_cost_usage->reference_1 = 'google';
                 $service_cost_usage->reference_2 = 'news';
                 $service_cost_usage->output = $serp_obj;
                 $service_cost_usage->input_1 = $country_name;
                 $service_cost_usage->input_2 = $keyword;
                 $service_cost_usage->save();
 
+                $json_file_name = config('platform.dataset.news.news_serp.file_prefix').str_slug($keyword).'-'.epoch_now_timestamp().'.json';
+
                 $upload_status = OSSUploader::uploadJson(
                     config('platform.dataset.news.news_serp.driver'),
@@ -50,9 +63,8 @@ public static function handle(Category $category, $country_iso)
                     $serp_obj);
 
         if ($upload_status) {
-
             $news_serp_result = new NewsSerpResult;
             $news_serp_result->category_id = $category->id;
             $news_serp_result->category_name = $category->name;
             $news_serp_result->serp_provider = 'dfs';
             $news_serp_result->serp_se = 'google';
             $news_serp_result->serp_se_type = 'news';
@@ -62,10 +74,7 @@ public static function handle(Category $category, $country_iso)
             $news_serp_result->result_count = $serp_obj?->tasks[0]?->result[0]?->items_count;
             $news_serp_result->filename = $json_file_name;
             $news_serp_result->status = 'initial';
-            if ($news_serp_result->save()) {
-                $category->serp_at = now();
-                $category->save();
-            }
+            $news_serp_result->save();
 
             return $news_serp_result;
         } else {
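Both call sites in this diff replace the hand-built SettingSerpLiveAdvanced flow with a single DFSSerp::liveAdvanced() call. The helper itself is not included in the commit; judging from the two call sites and the commented-out setter sequences, it plausibly wraps the builder like this (a sketch under that assumption, not the actual implementation):

```php
<?php

namespace App\Helpers\FirstParty\DFS;

use App\Helpers\ThirdParty\DFS\SettingSerpLiveAdvanced;

// Hypothetical reconstruction of the first-party wrapper used above.
class DFSSerp
{
    public static function liveAdvanced(
        string $se,
        string $se_type,
        string $keyword,
        string $location_name,
        string $language_code,
        int $depth,
        ?string $search_param = null
    ): string {
        $serp_model = new SettingSerpLiveAdvanced();
        $serp_model->setSe($se);
        $serp_model->setSeType($se_type);
        $serp_model->setKeyword(strtolower($keyword));
        $serp_model->setLocationName($location_name);
        $serp_model->setLanguageCode($language_code);
        $serp_model->setDepth($depth);

        // e.g. '&tbs=qdr:d' to restrict Google News results to the past day.
        if (! is_null($search_param)) {
            $serp_model->setSearchParam($search_param);
        }

        return $serp_model->getAsJson();
    }
}
```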
app/Jobs/Tasks/CrawlUrlResearchTask.php (new file, 207 lines)
@@ -0,0 +1,207 @@
<?php

namespace App\Jobs\Tasks;

use App\Jobs\CrawlUrlResearchJob;
use App\Jobs\WriteWithAIJob;
use App\Models\SerpUrl;
use App\Models\SerpUrlResearch;
use Exception;
use fivefilters\Readability\Configuration as ReadabilityConfiguration;
use fivefilters\Readability\ParseException as ReadabilityParseException;
use fivefilters\Readability\Readability;
use Illuminate\Support\Facades\Http;
use League\HTMLToMarkdown\HtmlConverter;
use Symfony\Component\DomCrawler\Crawler;

class CrawlUrlResearchTask
{
    public static function handle(int $serp_url_research_id)
    {
        $serp_url_research = SerpUrlResearch::find($serp_url_research_id);

        if (is_null($serp_url_research)) {
            return null;
        }

        try {
            $user_agent = config('platform.proxy.user_agent');

            $response = Http::withHeaders([
                'User-Agent' => $user_agent,
            ])
                ->withOptions([
                    'proxy' => get_smartproxy_rotating_server(),
                    'timeout' => 10,
                    'verify' => false,
                ])
                ->get($serp_url_research->url);

            if ($response->successful()) {
                $raw_html = $response->body();
                $costs['unblocker'] = calculate_smartproxy_cost(round(strlen($raw_html) / 1024, 2), 'rotating_global');
            } else {
                $raw_html = null;
                $response->throw();
            }

        } catch (Exception $e) {
            $raw_html = null;
            //throw $e;
        }

        if (! is_empty($raw_html)) {
            //dump(self::getMarkdownFromHtml($raw_html));

            $serp_url_research->content = self::getMarkdownFromHtml($raw_html);
            $serp_url_research->main_image = self::getMainImageFromHtml($raw_html);

            //dump($serp_url_research->content);
        } else {
            $serp_url_research->content = 'EMPTY CONTENT';
        }

        $serp_url_research->save();

        $completed_serp_url_researches_counts = SerpUrlResearch::where('serp_url_id', $serp_url_research->serp_url_id)->where('content', '!=', 'EMPTY CONTENT')->whereNotNull('content')->count();

        if ($completed_serp_url_researches_counts >= 3) {
            $serp_url = SerpUrl::find($serp_url_research->serp_url_id);

            if (! is_null($serp_url)) {
                $serp_url->crawled = true;
                $serp_url->save();

                WriteWithAIJob::dispatch($serp_url->id)->onQueue('default')->onConnection('default');
            }
        } else {
            $next_serp_url_research = SerpUrlResearch::where('serp_url_id', $serp_url_research->serp_url_id)->whereNull('content')->first();

            if (! is_null($next_serp_url_research)) {
                CrawlUrlResearchJob::dispatch($next_serp_url_research->id)->onQueue('default')->onConnection('default');
            }

        }
    }

    private static function getMainImageFromHtml($html)
    {
        $r_configuration = new ReadabilityConfiguration();
        $r_configuration->setCharThreshold(20);

        $readability = new Readability($r_configuration);

        try {
            $readability->parse($html);

            return $readability->getImage();
            //dd($readability);
        } catch (ReadabilityParseException $e) {
        }

        return null;
    }

    private static function getMarkdownFromHtml($html)
    {
        $converter = new HtmlConverter([
            'strip_tags' => true,
            'strip_placeholder_links' => true,
        ]);

        $html = self::cleanHtml($html);

        $markdown = $converter->convert($html);

        //dd($markdown);

        $markdown = self::reverseLTGT($markdown);

        $markdown = self::normalizeNewLines($markdown);

        $markdown = self::removeDuplicateLines($markdown);

        return html_entity_decode(markdown_to_plaintext($markdown));
    }

    private static function reverseLTGT($input)
    {
        // Turn escaped angle brackets back into literal characters.
        $output = str_replace('&lt;', '<', $input);
        $output = str_replace('&gt;', '>', $output);

        return $output;
    }

    private static function removeDuplicateLines($string)
    {
        $lines = explode("\n", $string);
        $uniqueLines = array_unique($lines);

        return implode("\n", $uniqueLines);
    }

    private static function normalizeNewLines($content)
    {
        // Split the content by lines
        $lines = explode("\n", $content);

        $processedLines = [];

        for ($i = 0; $i < count($lines); $i++) {
            $line = trim($lines[$i]);

            // If the line is an image markdown
            if (preg_match("/^!\[.*\]\(.*\)$/", $line)) {
                // And if the next line is not empty and not another markdown structure
                if (isset($lines[$i + 1]) && ! empty(trim($lines[$i + 1])) && ! preg_match('/^[-=#*&_]+$/', trim($lines[$i + 1]))) {
                    $line .= ' '.trim($lines[$i + 1]);
                    $i++; // Skip the next line as we're merging it
                }
            }

            // Add line to processedLines if it's not empty
            if (! empty($line)) {
                $processedLines[] = $line;
            }
        }

        // Collapse excessive newlines
        $result = preg_replace("/\n{3,}/", "\n\n", implode("\n", $processedLines));

        // Detect and replace the pattern
        $result = preg_replace('/^(!\[.*?\]\(.*?\))\s*\n\s*([^\n!]+)/m', '$1 $2', $result);

        // Replace multiple spaces with a dash separator
        $result = preg_replace('/ {2,}/', ' - ', $result);

        return $result;
    }

    private static function cleanHtml($htmlContent)
    {
        $crawler = new Crawler($htmlContent);

        // Define tags to remove completely
        $tagsToRemove = ['script', 'style', 'svg', 'picture', 'form', 'footer', 'nav', 'aside'];

        foreach ($tagsToRemove as $tag) {
            $crawler->filter($tag)->each(function ($node) {
                foreach ($node as $child) {
                    $child->parentNode->removeChild($child);
                }
            });
        }

        // Replace <span> tags with their inner content
        $crawler->filter('span')->each(function ($node) {
            $replacement = new \DOMText($node->text());

            foreach ($node as $child) {
                $child->parentNode->replaceChild($replacement, $child);
            }
        });

        return $crawler->outerHtml();
    }
}
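The extraction pipeline above is the standard fivefilters/readability.php plus league/html-to-markdown combination. An illustrative stand-alone run of the same flow on a small HTML string (not part of the commit):

```php
<?php

use fivefilters\Readability\Configuration;
use fivefilters\Readability\Readability;
use League\HTMLToMarkdown\HtmlConverter;

$html = '<html><body><article><h1>Title</h1><p>First paragraph.</p></article></body></html>';

// Extract the main article node, the same parser getMainImageFromHtml() uses for images.
$readability = new Readability(new Configuration());
$readability->parse($html);

// Convert the extracted HTML to markdown, as getMarkdownFromHtml() does.
$converter = new HtmlConverter(['strip_tags' => true]);

echo $converter->convert($readability->getContent());
```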
app/Jobs/Tasks/FillPostMetadataTask.php (new file, 267 lines)
@@ -0,0 +1,267 @@
<?php

namespace App\Jobs\Tasks;

use App\Helpers\FirstParty\OpenAI\OpenAI;
use App\Helpers\FirstParty\OSSUploader\OSSUploader;
use App\Jobs\SchedulePublishPost;
use App\Models\Category;
use App\Models\Entity;
use App\Models\Post;
use App\Models\PostCategory;
use App\Models\PostEntity;
use App\Models\SerpUrlResearch;
use App\Models\ServiceCostUsage;
use Exception;
use Illuminate\Support\Facades\Http;
use Image;

class FillPostMetadataTask
{
    public static function handle(int $post_id)
    {
        $post = Post::find($post_id);

        if (is_null($post)) {
            return;
        }

        if (! is_null($post->metadata)) {
            $post_meta_response = $post->metadata;
        } else {
            $post_meta_response = OpenAI::getArticleMeta($post->body, 1536, 30);

            if ((isset($post_meta_response->output)) && (! is_null($post_meta_response->output))) {
                $service_cost_usage = new ServiceCostUsage;
                $service_cost_usage->cost = $post_meta_response->cost;
                $service_cost_usage->name = 'openai-getArticleMeta';
                $service_cost_usage->reference_1 = 'post';
                $service_cost_usage->reference_2 = strval($post->id);
                $service_cost_usage->output = $post_meta_response;
                $service_cost_usage->save();
            }
        }

        //dump($post_meta_response);

        if ((isset($post_meta_response->output)) && (! is_null($post_meta_response->output))) {

            $post->metadata = $post_meta_response;

            if (isset($post_meta_response->output->keywords)) {
                if (count($post_meta_response->output->keywords) > 0) {
                    $post->keywords = $post_meta_response->output->keywords;

                    $post->main_keyword = $post_meta_response->output->keywords[0];
                }
            }

            if (isset($post_meta_response->output->title)) {
                if ((is_empty($post->title)) && (! is_empty($post_meta_response->output->title))) {
                    $post->title = $post_meta_response->output->title;
                }
            }

            if (isset($post_meta_response->output->summary)) {
                if (! is_empty($post_meta_response->output->summary)) {
                    $post->bites = $post_meta_response->output->summary;
                }
            }

        }

        if (is_empty($post->slug)) {
            $post->slug = str_slug($post->title);
        }

        if (is_empty($post->featured_image)) {
            $post = self::setPostImage($post);
        }

        if (isset($post_meta_response->output->society_impact)) {
            if (! is_empty($post_meta_response->output->society_impact)) {
                $post->society_impact = $post_meta_response->output->society_impact;
            }
        }

        if (isset($post_meta_response->output->society_impact_level)) {
            if (! is_empty($post_meta_response->output->society_impact_level)) {
                $post->society_impact_level = $post_meta_response->output->society_impact_level;
            }
        }

        if ($post->save()) {

            // Set Category
            $category_name = 'Updates';

            if ((isset($post_meta_response->output->category)) && (! is_empty($post_meta_response->output->category))) {
                $category_name = $post_meta_response?->output?->category;
            }

            $category = Category::where('name', $category_name)->first();

            if (is_null($category)) {
                $category = Category::where('name', 'Updates')->first();
            }

            // Set Post Category
            $post_category = PostCategory::where('post_id', $post->id)->first();

            if (is_null($post_category)) {
                $post_category = new PostCategory;
                $post_category->post_id = $post->id;
            }
            $post_category->category_id = $category->id;

            $post_category->save();

            // Set Post Entities
            if (isset($post_meta_response->output->entities)) {
                $entity_names = [];

                if (is_array($post_meta_response->output->entities)) {
                    $entity_names = $post_meta_response->output->entities;
                }

                if (count($entity_names) > 0) {
                    $previous_post_entities = PostEntity::where('post_id', $post->id)->delete();

                    foreach ($entity_names as $entity_name) {
                        $entity_name = trim($entity_name);

                        $entity = Entity::where('name', $entity_name)->first();

                        if (is_null($entity)) {
                            $entity = new Entity;
                            $entity->name = $entity_name;
                            $entity->slug = str_slug($entity_name);
                            $entity->save();
                        }

                        $post_entity = PostEntity::where('post_id', $post->id)
                            ->where('entity_id', $entity->id)
                            ->first();

                        if (is_null($post_entity)) {
                            $post_entity = new PostEntity;
                            $post_entity->post_id = $post->id;
                            $post_entity->entity_id = $entity->id;
                            $post_entity->save();
                        }
                    }
                }
            }

            // Set Schedule Publish
            SchedulePublishPost::dispatch($post->id, 'future')->onQueue('default')->onConnection('default');
        }

    }

    private static function setPostImage($post)
    {
        $serp_url_researches = SerpUrlResearch::where('serp_url_id', $post->serp_url_id)->get();

        $main_image_url = null;

        foreach ($serp_url_researches as $serp_url_research) {
            if (! is_empty($serp_url_research->main_image)) {
                if (is_valid_url($serp_url_research->main_image)) {

                    $main_image_url = $serp_url_research->main_image;

                    $image_response = Http::timeout(300)->withHeaders([
                        'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
                    ])->get($main_image_url);

                    $image_content = $image_response->body();

                    // Get the size of the image content in KB
                    $imageSizeInKb = strlen($image_response->body()) / 1024;

                    // Skip this iteration if the image exceeds the maximum size
                    if ($imageSizeInKb > 1024) {
                        continue;
                    }

                    $canvas_width = 1080;
                    $canvas_height = 608;

                    $thumb_width = 540;
                    $thumb_height = 78;

                    // Create an image from the fetched content
                    $image = Image::make($image_content);

                    // Check if the image is wider than it is tall (landscape orientation)
                    if ($image->width() > $image->height()) {
                        // Resize the image to fill the canvas width while maintaining aspect ratio
                        $image->resize($canvas_width, null, function ($constraint) {
                            $constraint->aspectRatio();
                        });
                    } else {
                        // Resize the image to fill the canvas height while maintaining aspect ratio
                        $image->resize(null, $canvas_height, function ($constraint) {
                            $constraint->aspectRatio();
                        });
                    }

                    // Fit the image to the canvas size, without gaps
                    $image->fit($canvas_width, $canvas_height, function ($constraint) {
                        $constraint->upsize();
                    });

                    // Create a thumbnail by cloning the original image and resizing it
                    $thumb = clone $image;
                    $thumb->resize($thumb_width, $thumb_height, function ($constraint) {
                        $constraint->aspectRatio();
                        $constraint->upsize();
                    });

                    // Create an image filename
                    $epoch_now_timestamp = epoch_now_timestamp();
                    $filename = $post->slug.'-'.$epoch_now_timestamp.'.jpg';
                    $thumb_filename = $post->slug.'-'.$epoch_now_timestamp.'_thumb.jpg';

                    OSSUploader::uploadFile('r2', 'post_images_2/', $filename, (string) $image->stream('jpeg', 75));
                    OSSUploader::uploadFile('r2', 'post_images_2/', $thumb_filename, (string) $thumb->stream('jpeg', 50));

                    $post->featured_image = 'post_images_2/'.$filename;

                    $image->destroy();

                    break;
                }
            }
        }

        return $post;
    }
}
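The image handling above matches the Intervention Image v2 API (an `Image` facade with make()/fit()/resize()/stream()). A condensed sketch of the same cover treatment, assuming that library and a local `example.jpg` (filenames here are illustrative):

```php
<?php

use Intervention\Image\Facades\Image;

// 1080x608 cover plus a thumbnail, mirroring the constants used above.
$image = Image::make(file_get_contents('example.jpg'));

$image->fit(1080, 608, function ($constraint) {
    $constraint->upsize(); // never scale the source up
});

$thumb = clone $image;
$thumb->resize(540, null, function ($constraint) {
    $constraint->aspectRatio();
});

file_put_contents('cover.jpg', (string) $image->stream('jpeg', 75));
file_put_contents('cover_thumb.jpg', (string) $thumb->stream('jpeg', 50));
```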
@@ -94,7 +94,7 @@ public static function handle($post)
             }
         }
 
         //return $news_serp_result;
         } else {
             throw new Exception('Uploading failed', 1);
         }
 
app/Jobs/Tasks/IdentifyCrawlSourcesTask.php (new file, 54 lines)
@@ -0,0 +1,54 @@
<?php

namespace App\Jobs\Tasks;

use App\Helpers\FirstParty\OpenAI\OpenAI;
use App\Jobs\BrowseDFSForResearchJob;
use App\Models\SerpUrl;
use App\Models\ServiceCostUsage;

class IdentifyCrawlSourcesTask
{
    public static function handle(int $serp_url_id)
    {
        $serp_url = SerpUrl::find($serp_url_id);

        if (! is_null($serp_url)) {

            $attempt = 0;
            $maxAttempts = 3;
            $suggestion_response = null;

            while ($attempt < $maxAttempts && ($suggestion_response === null || $suggestion_response->output === null)) {
                $suggestion_response = OpenAI::titleSuggestions($serp_url->title);

                //dump($suggestion_response);

                $service_cost_usage = new ServiceCostUsage;
                $service_cost_usage->cost = $suggestion_response->cost;
                $service_cost_usage->name = 'openai-titleSuggestions';
                $service_cost_usage->reference_1 = 'serp_url';
                $service_cost_usage->reference_2 = strval($serp_url->id);
                $service_cost_usage->output = $suggestion_response;
                $service_cost_usage->save();

                $attempt++;

                // If the output is not null, break out of the loop
                if ($suggestion_response !== null && $suggestion_response->output !== null) {
                    break;
                }

                // Optional: sleep for a bit before retrying
                sleep(1); // sleep for 1 second
            }

            if (! is_null($suggestion_response->output)) {
                $serp_url->suggestion_data = $suggestion_response->output;
                if ($serp_url->save()) {
                    BrowseDFSForResearchJob::dispatch($serp_url_id)->onQueue('default')->onConnection('default');
                }
            }
        }
    }
}
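The retry loop above is a general pattern: call the API, record the cost, retry while the output is null, back off briefly. The same logic as a reusable helper (an illustrative sketch, not part of the commit; the function name is hypothetical):

```php
<?php

// Hypothetical helper: retry a callable until it yields a non-null ->output.
function retry_for_output(callable $call, int $maxAttempts = 3, int $sleepSeconds = 1): ?object
{
    $response = null;

    for ($attempt = 0; $attempt < $maxAttempts; $attempt++) {
        $response = $call();

        if (! is_null($response?->output)) {
            break;
        }

        sleep($sleepSeconds);
    }

    return $response;
}

// Usage, mirroring the task above:
// $suggestion_response = retry_for_output(fn () => OpenAI::titleSuggestions($serp_url->title));
```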
@@ -2,15 +2,19 @@
 
 namespace App\Jobs\Tasks;
 
+use App\Helpers\FirstParty\OpenAI\OpenAI;
 use App\Helpers\FirstParty\OSSUploader\OSSUploader;
+use App\Jobs\IdentifyCrawlSourcesJob;
 use App\Models\Category;
 use App\Models\NewsSerpResult;
+use App\Models\SerpUrl;
 use App\Models\ServiceCostUsage;
+use Carbon\Carbon;
 use Exception;
 
-class ParseNewsSerpDomainsTask
+class ParseDFSNewsTask
 {
-    public static function handle(NewsSerpResult $news_serp_result, $serp_counts = 1)
+    public static function handle(NewsSerpResult $news_serp_result, $serp_counts = 100)
     {
         //dd($news_serp_result->category->serp_at);
 
@@ -35,16 +39,47 @@ public static function handle(NewsSerpResult $news_serp_result, $serp_counts = 1
 
         foreach ($serp_results as $serp_item) {
 
             if ($serp_item->type != 'news_search') {
                 continue;
             }
 
             //dump($serp_item);
 
             if (is_empty($serp_item->url)) {
                 continue;
             }
 
             // if (!str_contains($serp_item->time_published, "hours"))
             // {
             //     continue;
             // }
+            $blacklist_keywords = config('platform.global.blacklist_keywords_serp');
+
+            $blacklist_domains = config('platform.global.blacklist_domains_serp');
+
+            $skipItem = false;
+
+            foreach ($blacklist_domains as $domain) {
+                if (str_contains($serp_item->domain, $domain)) {
+                    $skipItem = true;
+                    break;
+                }
+            }
+
+            if (! $skipItem) {
+                $title = strtolower($serp_item->title);
+                $snippet = strtolower($serp_item->snippet);
+
+                // Check if any unwanted word is in the title or snippet
+
+                foreach ($blacklist_keywords as $word) {
+                    if (strpos($title, $word) !== false || strpos($snippet, $word) !== false) {
+                        $skipItem = true;
+                        break; // Break the inner loop as we found an unwanted word
+                    }
+                }
+            }
+
+            // Skip this iteration if an unwanted word was found
+            if ($skipItem) {
+                continue;
+            }
 
             $serp_url = SerpUrl::where('url', $serp_item->url)->first();
 
@@ -69,14 +104,14 @@ public static function handle(NewsSerpResult $news_serp_result, $serp_counts = 1
 
         //dd($valid_serps);
 
         $serp_titles = [];
 
         foreach ($valid_serps as $serp_item) {
 
             //dd($serp_item);
             $serp_url = SerpUrl::where('url', self::normalizeUrl($serp_item->url))->first();
 
             if (is_null($serp_url)) {
                 $serp_url = new SerpUrl;
                 $serp_url->category_id = $news_serp_result->category_id;
                 $serp_url->category_name = $news_serp_result->category_name;
                 $serp_url->news_serp_result_id = $news_serp_result->id;
             }
 
@@ -85,20 +120,47 @@ public static function handle(NewsSerpResult $news_serp_result, $serp_counts = 1
             $serp_url->country_iso = $news_serp_result->serp_country_iso;
 
             if (! is_empty($serp_item->title)) {
-                $serp_url->title = $serp_item->title;
+                $serp_url->title = remove_newline($serp_item->title);
             }
 
             if (! is_empty($serp_item->snippet)) {
-                $serp_url->description = $serp_item->snippet;
+                $serp_url->description = remove_newline($serp_item->snippet);
             }
 
             if ($serp_url->isDirty()) {
-                $serp_url->serp_at = $news_serp_result->category->serp_at;
+                $serp_url->serp_at = now();
             }
 
+            if ((isset($serp_item->timestamp)) && (! is_empty($serp_item->timestamp))) {
+                $serp_url->url_posted_at = Carbon::parse($serp_item->timestamp);
+            } else {
+                $serp_url->url_posted_at = now();
+            }
+
             if ($serp_url->save()) {
                 $success = true;
             }
+            $serp_titles[$serp_url->id] = $serp_url->title;
 
         }
 
+        $ids_response = OpenAI::topTitlePicksById(json_encode($serp_titles));
+
+        if (isset($ids_response->output->ids)) {
+
+            $service_cost_usage = new ServiceCostUsage;
+            $service_cost_usage->cost = $ids_response->cost;
+            $service_cost_usage->name = 'openai-topTitlePicksById';
+            $service_cost_usage->reference_1 = 'news_serp_result';
+            $service_cost_usage->reference_2 = strval($news_serp_result->id);
+            $service_cost_usage->output = $ids_response;
+            $service_cost_usage->save();
+
+            $selected_serp_urls = SerpUrl::whereIn('id', $ids_response->output->ids)->update(['picked' => true]);
+
+            foreach ($ids_response->output->ids as $id) {
+                IdentifyCrawlSourcesJob::dispatch($id)->onQueue('default')->onConnection('default');
+            }
+        }
+    }
 
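The added blacklist pass boils down to two substring scans over the SERP item. Extracted as a predicate for readability (illustrative only; the diff keeps it inline and the function name is hypothetical):

```php
<?php

// Hypothetical refactor of the inline checks above: true means skip the item.
function should_skip_serp_item(object $serp_item, array $blacklist_domains, array $blacklist_keywords): bool
{
    // Domain blacklist: substring match against the item's domain.
    foreach ($blacklist_domains as $domain) {
        if (str_contains($serp_item->domain, $domain)) {
            return true;
        }
    }

    // Keyword blacklist: case-insensitive match against title and snippet.
    $haystack = strtolower($serp_item->title.' '.$serp_item->snippet);

    foreach ($blacklist_keywords as $word) {
        if (str_contains($haystack, $word)) {
            return true;
        }
    }

    return false;
}
```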
@@ -9,12 +9,19 @@
 
 class PublishIndexPostTask
 {
-    public static function handle(Post $post)
+    public static function handle(int $post_id)
     {
-        $post->published_at = now();
+        $post = Post::find($post_id);
+
+        if (is_null($post)) {
+            return;
+        }
+
         $post->status = 'publish';
 
         if ($post->save()) {
-            if (app()->environment() == 'production') {
+            if ((app()->environment() == 'production') && (config('platform.global.indexing'))) {
                 $post_url = route('front.post', ['slug' => $post->slug, 'category_slug' => $post->category->slug]);
 
                 try {
 
app/Jobs/Tasks/SchedulePublishTask.php (new file, 76 lines)
@@ -0,0 +1,76 @@
<?php

namespace App\Jobs\Tasks;

use App\Models\Post;
use App\Notifications\PostIncomplete;
use Notification;

class SchedulePublishTask
{
    public static function handle($post_id, $post_status = 'publish')
    {
        sleep(2);

        $post = Post::find($post_id);

        if (is_null($post)) {
            return;
        }

        if (! in_array($post->status, ['future', 'draft', 'publish'])) {
            return;
        }

        if ((is_empty($post->title)) || (is_empty($post->slug)) || (is_empty($post->main_keyword)) || (is_empty($post->keywords)) || (is_empty($post->bites)) || (is_empty($post->featured_image)) || (is_empty($post->body)) || (is_empty($post->metadata))) {
            Notification::route(get_notification_channel(), get_notification_user_id())->notify(new PostIncomplete($post));

            return;
        }

        /*
        TODO:

        - To determine a published_at time, first check whether any post has an existing published_at date.

        - If there are no other posts except the current post, then the current post's published_at is now().

        - If there are other posts but all of their published_at values are null, then the current post's published_at is now().

        - If there are other posts with non-null published_at values:
          -- first find the latest published post (latest published_at);
          -- if the latest published_at datetime is before now, then published_at is null;
          -- if the latest published_at datetime is after now, then the current post's published_at should be 1 hour after the latest published_at.

        - The idea is that published posts, where found, should be spread out about an hour apart.
        */

        // Check if there are any other posts with a set published_at date
        $latest_published_post = Post::where('id', '!=', $post_id)->whereNotNull('published_at')->orderBy('published_at', 'DESC')->first();

        //dd($latest_published_post);

        if (is_null($latest_published_post)) {
            $post->published_at = now();
        } else {
            if ($latest_published_post->published_at->lt(now())) {
                $new_time = now();
            } else {
                $new_time = clone $latest_published_post->published_at;
            }

            $new_time->addMinutes(rand(40, 60));
            $post->published_at = $new_time;
        }

        // Carbon dates are mutable, so this shifts published_at back a day in place.
        $post->published_at->subDays(1);

        $post->status = $post_status; // Assuming you want to update the status to future
        $post->save();
    }
}
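A worked example of the scheduling arithmetic above, with fixed values in place of rand(40, 60) (an illustrative sketch, not part of the commit):

```php
<?php

use Carbon\Carbon;

$latest = Carbon::parse('2024-01-10 10:00:00'); // latest scheduled post
$now    = Carbon::parse('2024-01-10 08:00:00'); // pretend "now"

// Pick the later of the two as the base, as the task does.
$new_time = $latest->lt($now) ? $now->copy() : $latest->copy();

$new_time->addMinutes(50); // the task uses rand(40, 60)
$new_time->subDays(1);     // mirrors $post->published_at->subDays(1)

echo $new_time; // 2024-01-09 10:50:00
```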
app/Jobs/Tasks/WriteWithAITask.php (new file, 140 lines)
@@ -0,0 +1,140 @@
<?php

namespace App\Jobs\Tasks;

use App\Helpers\FirstParty\OpenAI\OpenAI;
use App\Jobs\FillPostMetadataJob;
use App\Models\Post;
use App\Models\SerpUrl;
use App\Models\SerpUrlResearch;
use App\Models\ServiceCostUsage;
use Exception;
use Mis3085\Tiktoken\Facades\Tiktoken;

class WriteWithAITask
{
    public static function handle(int $serp_url_id)
    {
        $serp_url = SerpUrl::find($serp_url_id);

        if (is_null($serp_url)) {
            return;
        }

        $serp_url_researches = SerpUrlResearch::where('serp_url_id', $serp_url->id)->where('content', '!=', 'EMPTY CONTENT')->whereNotNull('content')->get();

        $user_prompt = '';
        $total_tokens = 0;

        foreach ($serp_url_researches as $serp_url_research) {

            $sentences = self::markdownToSentences($serp_url_research->content);

            //dump($sentences);

            foreach ($sentences as $key => $sentence) {

                if ($key == 0) {
                    $user_prompt .= "ARTICLE:\n";
                }

                $current_tokens = Tiktoken::count($sentence);

                if ($current_tokens + $total_tokens > 4096) {
                    break 2;
                } else {
                    $user_prompt .= $sentence."\n";
                    $total_tokens += $current_tokens;
                }

            }
            $user_prompt .= "\n\n";
        }

        //dd($user_prompt);

        $ai_writeup_response = OpenAI::writeArticle($user_prompt, 1536, 30);

        //dd($ai_writeup_response);

        if ((isset($ai_writeup_response->output)) && (! is_empty($ai_writeup_response->output))) {
            $output = self::extractRemoveFirstHeading($ai_writeup_response->output);

            $service_cost_usage = new ServiceCostUsage;
            $service_cost_usage->cost = $ai_writeup_response->cost;
            $service_cost_usage->name = 'openai-writeArticle';
            $service_cost_usage->reference_1 = 'serp_url';
            $service_cost_usage->reference_2 = strval($serp_url->id);
            $service_cost_usage->output = $ai_writeup_response;
            $service_cost_usage->save();

            $post = Post::where('serp_url_id', $serp_url->id)->first();

            if (is_null($post)) {
                $post = new Post;
                $post->serp_url_id = $serp_url->id;
            }

            if (! is_empty($output->title)) {
                $post->title = $output->title;
            } else {

                if (! is_null($serp_url->suggestion_data)) {
                    if (isset($serp_url->suggestion_data->article_headings)) {
                        if (count($serp_url->suggestion_data->article_headings) > 0) {
                            $post->title = $serp_url->suggestion_data?->article_headings[0];
                        }
                    }
                }
            }

            if (is_empty($post->title)) {
                $post->title = $serp_url->title;
            }

            $post->slug = str_slug($post->title);

            $post->body = $output->content;

            $post->bites = null;
            $post->metadata = null;

            if ($post->save()) {
                FillPostMetadataJob::dispatch($post->id)->onQueue('default')->onConnection('default');
            }
        } else {
            throw new Exception('OpenAI failed to write');
        }
    }

    private static function markdownToSentences($markdownContent)
    {
        // Split the content on punctuation followed by a space or end of string
        $pattern = '/(?<=[.!?])\s+|\z/';

        // Split the content into sentences
        $sentences = preg_split($pattern, $markdownContent, -1, PREG_SPLIT_NO_EMPTY);

        // Return the array of sentences
        return $sentences;
    }

    private static function extractRemoveFirstHeading($markdownContent)
    {
        // Pattern to match the first markdown heading of any level
        $pattern = '/^(#+)\s*(.+)$/m';

        // Try to find the first heading
        if (preg_match($pattern, $markdownContent, $matches)) {
            $title = $matches[2]; // The first heading becomes the title

            // Remove the first heading from the content
            $updatedContent = preg_replace($pattern, '', $markdownContent, 1);

            return (object) ['title' => $title, 'content' => trim($updatedContent)];
        }

        // Return original content if no heading found
        return (object) ['title' => '', 'content' => $markdownContent];
    }
}
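The prompt assembly above greedily packs sentences under a 4096-token budget, counted with the Tiktoken facade the file already imports. The core of that loop in isolation (an illustrative sketch with toy input):

```php
<?php

use Mis3085\Tiktoken\Facades\Tiktoken;

$sentences = ['First source sentence.', 'Second source sentence.'];

$budget = 4096;
$used = 0;
$prompt = "ARTICLE:\n";

foreach ($sentences as $sentence) {
    $tokens = Tiktoken::count($sentence);

    // Stop once the next sentence would exceed the budget, as the task's `break 2` does.
    if ($used + $tokens > $budget) {
        break;
    }

    $prompt .= $sentence."\n";
    $used += $tokens;
}
```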