Add (initial): futurewalker code

2023-11-20 00:15:18 +08:00
parent f8602cb456
commit 9ce3e5c82a
166 changed files with 15941 additions and 1072 deletions
--- a/app/Jobs/Tasks/ParseDFSNewsTask.php
+++ b/app/Jobs/Tasks/ParseDFSNewsTask.php
@@ -0,0 +1,207 @@
+<?php
+
+namespace App\Jobs\Tasks;
+
+use App\Helpers\FirstParty\OpenAI\OpenAI;
+use App\Helpers\FirstParty\OSSUploader\OSSUploader;
+use App\Jobs\IdentifyCrawlSourcesJob;
+use App\Models\Category;
+use App\Models\NewsSerpResult;
+use App\Models\SerpUrl;
+use App\Models\ServiceCostUsage;
+use Carbon\Carbon;
+use Exception;
+
+/**
+ * Turns a stored news SERP result into SerpUrl rows and queues the picked ones for crawling.
+ */
+class ParseDFSNewsTask
+{
+    /**
+     * Parse one news SERP result file, persist its usable items and dispatch the picked URLs.
+     *
+     * Flow:
+     *  1. Load the SERP items through OSSUploader::readJson (first task, first result).
+     *  2. Filter the items (type, blacklists, blocked URLs, titles containing ':').
+     *  3. Create or update one SerpUrl row per remaining item, keyed by the normalized URL.
+     *  4. Send the id => title map to OpenAI::topTitlePicksById, record the cost in
+     *     ServiceCostUsage, flag the returned ids as picked and dispatch
+     *     IdentifyCrawlSourcesJob for each of them.
+     *
+     * @param  NewsSerpResult  $news_serp_result  SERP result whose file is parsed.
+     * @param  int  $serp_counts  Filtering stops once this many items have been accepted.
+     * @return bool True when at least one SerpUrl row was saved, false otherwise.
+     */
+    public static function handle(NewsSerpResult $news_serp_result, $serp_counts = 100)
+    {
+        try {
+            $serp_results = OSSUploader::readJson(
+                config('platform.dataset.news.news_serp.driver'),
+                config('platform.dataset.news.news_serp.path'),
+                $news_serp_result->filename)?->tasks[0]?->result[0]?->items;
+        } catch (Exception $e) {
+            // An unreadable file is treated the same as a file without items.
+            $serp_results = null;
+        }
+
+        if (is_null($serp_results)) {
+            return false;
+        }
+
+        $valid_serps = self::filterSerpItems($serp_results, $serp_counts);
+
+        $success = false;
+
+        // SerpUrl id => title; this is the candidate list handed to OpenAI.
+        $serp_titles = [];
+
+        foreach ($valid_serps as $serp_item) {
+            $normalized_url = self::normalizeUrl($serp_item->url);
+
+            $serp_url = SerpUrl::where('url', $normalized_url)->first();
+
+            if (is_null($serp_url)) {
+                $serp_url = new SerpUrl;
+                $serp_url->news_serp_result_id = $news_serp_result->id;
+            }
+
+            $serp_url->source = 'serp';
+            $serp_url->url = $normalized_url;
+            $serp_url->country_iso = $news_serp_result->serp_country_iso;
+
+            if (! is_empty($serp_item->title ?? null)) {
+                $serp_url->title = remove_newline($serp_item->title);
+            }
+
+            if (! is_empty($serp_item->snippet ?? null)) {
+                $serp_url->description = remove_newline($serp_item->snippet);
+            }
+
+            // Only bump serp_at when the SERP data differs from what is stored;
+            // this check deliberately runs before url_posted_at is touched.
+            if ($serp_url->isDirty()) {
+                $serp_url->serp_at = now();
+            }
+
+            $serp_url->url_posted_at = self::parsePostedAt($serp_item->timestamp ?? null);
+
+            if ($serp_url->save()) {
+                $success = true;
+            }
+
+            // A row without an id cannot be picked or dispatched later, so it is
+            // kept out of the candidate list.
+            if (! is_null($serp_url->id)) {
+                $serp_titles[$serp_url->id] = $serp_url->title;
+            }
+        }
+
+        // Nothing to choose from: skip the OpenAI call entirely.
+        if (empty($serp_titles)) {
+            return $success;
+        }
+
+        $ids_response = OpenAI::topTitlePicksById(json_encode($serp_titles));
+
+        if (isset($ids_response->output->ids)) {
+
+            $service_cost_usage = new ServiceCostUsage;
+            $service_cost_usage->cost = $ids_response->cost;
+            $service_cost_usage->name = 'openai-topTitlePicksById';
+            $service_cost_usage->reference_1 = 'news_serp_result';
+            $service_cost_usage->reference_2 = strval($news_serp_result->id);
+            $service_cost_usage->output = $ids_response;
+            $service_cost_usage->save();
+
+            // Only trust ids that were actually part of the request, and act on each once.
+            $picked_ids = array_values(array_unique(array_intersect(
+                (array) $ids_response->output->ids,
+                array_keys($serp_titles)
+            )));
+
+            if (! empty($picked_ids)) {
+                SerpUrl::whereIn('id', $picked_ids)->update(['picked' => true]);
+
+                foreach ($picked_ids as $id) {
+                    IdentifyCrawlSourcesJob::dispatch($id)->onQueue('default')->onConnection('default');
+                }
+            }
+        }
+
+        return $success;
+    }
+
+    /**
+     * Select the SERP items that are worth storing.
+     *
+     * An item is dropped when it is not a 'news_search' item, has no URL, hits a
+     * blacklist, has a ':' in its title, or its URL is already marked as blocked.
+     *
+     * @param  iterable  $serp_results  Raw SERP items (objects).
+     * @param  int  $serp_counts  Stop after this many items have been accepted.
+     * @return array Accepted SERP items in their original order.
+     */
+    private static function filterSerpItems($serp_results, $serp_counts)
+    {
+        // Loop-invariant configuration, read once; a missing entry behaves like an empty list.
+        $blacklist_keywords = (array) config('platform.global.blacklist_keywords_serp');
+        $blacklist_domains = (array) config('platform.global.blacklist_domains_serp');
+
+        $valid_serps = [];
+
+        foreach ($serp_results as $serp_item) {
+
+            if (($serp_item->type ?? null) != 'news_search') {
+                continue;
+            }
+
+            if (is_empty($serp_item->url ?? null)) {
+                continue;
+            }
+
+            if (self::isBlacklisted($serp_item, $blacklist_domains, $blacklist_keywords)) {
+                continue;
+            }
+
+            // Cheap string check first so rejected titles never cost a database query.
+            if (str_contains((string) ($serp_item->title ?? ''), ':')) {
+                continue;
+            }
+
+            if (self::isBlockedUrl($serp_item->url)) {
+                continue;
+            }
+
+            $valid_serps[] = $serp_item;
+
+            if (count($valid_serps) >= $serp_counts) {
+                break;
+            }
+        }
+
+        return $valid_serps;
+    }
+
+    /**
+     * Check a SERP item against the domain and keyword blacklists.
+     *
+     * Domains are matched as substrings of the item's domain; keywords are matched
+     * case-insensitively (ASCII) as substrings of the title or snippet.
+     *
+     * @param  object  $serp_item
+     * @param  array  $blacklist_domains
+     * @param  array  $blacklist_keywords
+     * @return bool True when the item must be skipped.
+     */
+    private static function isBlacklisted($serp_item, array $blacklist_domains, array $blacklist_keywords)
+    {
+        $domain = (string) ($serp_item->domain ?? '');
+
+        foreach ($blacklist_domains as $blocked_domain) {
+            if (str_contains($domain, (string) $blocked_domain)) {
+                return true;
+            }
+        }
+
+        $title = strtolower((string) ($serp_item->title ?? ''));
+        $snippet = strtolower((string) ($serp_item->snippet ?? ''));
+
+        foreach ($blacklist_keywords as $word) {
+            // The haystacks are lowercased, so the needle has to be as well.
+            $word = strtolower((string) $word);
+
+            if (str_contains($title, $word) || str_contains($snippet, $word)) {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    /**
+     * Tell whether a SerpUrl row for this URL is marked as blocked.
+     *
+     * Rows are saved under the normalized URL, so both the raw and the normalized
+     * form are checked.
+     *
+     * @param  string  $url  URL as delivered by the SERP item.
+     * @return bool
+     */
+    private static function isBlockedUrl($url)
+    {
+        $candidates = array_values(array_unique([$url, self::normalizeUrl($url)]));
+
+        return SerpUrl::whereIn('url', $candidates)->where('status', 'blocked')->exists();
+    }
+
+    /**
+     * Resolve the publication time of a SERP item.
+     *
+     * @param  mixed  $timestamp  Timestamp from the SERP item, or null when absent.
+     * @return \Carbon\Carbon Parsed timestamp; the current time when it is missing, empty or unparsable.
+     */
+    private static function parsePostedAt($timestamp)
+    {
+        if (is_null($timestamp) || is_empty($timestamp)) {
+            return now();
+        }
+
+        try {
+            return Carbon::parse($timestamp);
+        } catch (Exception $e) {
+            // One malformed timestamp must not abort the whole batch.
+            return now();
+        }
+    }
+
+    /**
+     * Reduce a URL to a canonical "https://host/path" form used as the SerpUrl key.
+     *
+     * The scheme is forced to https, trailing slashes are stripped from the path,
+     * and only host and path are carried over (port, query string and fragment are
+     * dropped).
+     *
+     * @param  mixed  $url
+     * @return mixed The normalized URL, or the input unchanged when it is not a
+     *               non-empty string or no host can be extracted from it.
+     */
+    private static function normalizeUrl($url)
+    {
+        if (! is_string($url) || $url === '') {
+            return $url;
+        }
+
+        $parsedUrl = parse_url($url);
+
+        // parse_url() yields false for seriously malformed URLs and omits 'host'
+        // for relative or scheme-less input; neither can be normalized.
+        if ($parsedUrl === false || ! isset($parsedUrl['host'])) {
+            return $url;
+        }
+
+        $path = rtrim($parsedUrl['path'] ?? '', '/');
+
+        return sprintf('%s://%s%s', 'https', $parsedUrl['host'], $path);
+    }
+}