category->serp_at); $serp_results = null; $success = false; try { $serp_results = OSSUploader::readJson( config('platform.dataset.news.news_serp.driver'), config('platform.dataset.news.news_serp.path'), $news_serp_result->filename)?->tasks[0]?->result[0]?->items; } catch (Exception $e) { $serp_results = null; } if (! is_null($serp_results)) { $valid_serps = []; foreach ($serp_results as $serp_item) { if ($serp_item->type != 'news_search') { continue; } //dump($serp_item); if (is_empty($serp_item->url)) { continue; } $blacklist_keywords = config('platform.global.blacklist_keywords_serp'); $blacklist_domains = config('platform.global.blacklist_domains_serp'); $skipItem = false; foreach ($blacklist_domains as $domain) { if (str_contains($serp_item->domain, $domain)) { $skipItem = true; break; } } if (! $skipItem) { $title = strtolower($serp_item->title); $snippet = strtolower($serp_item->snippet); // Check if any unwanted word is in the title or snippet foreach ($blacklist_keywords as $word) { if (strpos($title, $word) !== false || strpos($snippet, $word) !== false) { $skipItem = true; break; // Break the inner loop as we found an unwanted word } } } // Skip this iteration if an unwanted word was found if ($skipItem) { continue; } $serp_url = SerpUrl::where('url', $serp_item->url)->first(); if (! is_null($serp_url)) { if ($serp_url->status == 'blocked') { continue; } } if (str_contains($serp_item->title, ':')) { continue; } $valid_serps[] = $serp_item; if (count($valid_serps) >= $serp_counts) { break; } } //dd($valid_serps); $serp_titles = []; foreach ($valid_serps as $serp_item) { $serp_url = SerpUrl::where('url', self::normalizeUrl($serp_item->url))->first(); if (is_null($serp_url)) { $serp_url = new SerpUrl; $serp_url->news_serp_result_id = $news_serp_result->id; } $serp_url->source = 'serp'; $serp_url->url = self::normalizeUrl($serp_item->url); $serp_url->country_iso = $news_serp_result->serp_country_iso; if (! 
is_empty($serp_item->title)) {
                    $serp_url->title = remove_newline($serp_item->title);
                }

                if (! is_empty($serp_item->snippet)) {
                    $serp_url->description = remove_newline($serp_item->snippet);
                }

                if ($serp_url->isDirty()) {
                    $serp_url->serp_at = now();
                }

                if ((isset($serp_item->timestamp)) && (! is_empty($serp_item->timestamp))) {
                    $serp_url->url_posted_at = Carbon::parse($serp_item->timestamp);
                } else {
                    $serp_url->url_posted_at = now();
                }

                if ($serp_url->save()) {
                    $success = true;
                }

                $serp_titles[$serp_url->id] = $serp_url->title;
            }

            $ids_response = OpenAI::topTitlePicksById(json_encode($serp_titles));

            if (isset($ids_response->output->ids)) {
                $service_cost_usage = new ServiceCostUsage;
                $service_cost_usage->cost = $ids_response->cost;
                $service_cost_usage->name = 'openai-topTitlePicksById';
                $service_cost_usage->reference_1 = 'news_serp_result';
                $service_cost_usage->reference_2 = strval($news_serp_result->id);
                $service_cost_usage->output = $ids_response;
                $service_cost_usage->save();

                $selected_serp_urls = SerpUrl::whereIn('id', $ids_response->output->ids)->update(['picked' => true]);

                foreach ($ids_response->output->ids as $id) {
                    IdentifyCrawlSourcesJob::dispatch($id)->onQueue('default')->onConnection('default');
                }
            }
        }

        return $success;
    }

    /**
     * Reduce a URL to a canonical `https://<host><path>` form.
     *
     * The scheme is always forced to https (so http/https variants of the same
     * page compare equal), trailing slashes are trimmed from the path, and the
     * query string and fragment are dropped. Only the host and path components
     * are used to rebuild the URL, so a port or user-info part in the input is
     * not carried over either.
     *
     * @param  string  $url  The URL to normalize.
     * @return string  The normalized URL. When $url is not a string, cannot be
     *                 parsed, or has no host component (e.g. a relative path),
     *                 the input is returned unchanged.
     */
    private static function normalizeUrl($url)
    {
        // parse_url() only accepts strings; hand anything else back untouched
        // instead of triggering a deprecation/TypeError inside parse_url().
        if (! is_string($url)) {
            return $url;
        }

        $parts = parse_url($url);

        // parse_url() returns false for seriously malformed URLs. Check that
        // explicitly rather than writing array keys into `false`, which is
        // deprecated since PHP 8.1. A missing host means there is nothing to
        // rebuild, so fall back to the original value in both cases.
        if ($parts === false || ! isset($parts['host'])) {
            return $url;
        }

        // rtrim() is a no-op when the path has no trailing slash, so no
        // separate "ends with /" check is needed. A path of "/" becomes "".
        $path = rtrim($parts['path'] ?? '', '/');

        // Query and fragment are separate parse_url() components and are
        // simply not included here, so no further stripping is required.
        return sprintf('%s://%s%s', 'https', $parts['host'], $path);
    }
}