Files
futurewalker/app/Jobs/Tasks/ParseNewsSerpDomainsTask.php

147 lines
4.1 KiB
PHP

<?php
namespace App\Jobs\Tasks;
use App\Helpers\FirstParty\OSSUploader\OSSUploader;
use App\Models\Category;
use App\Models\NewsSerpResult;
use App\Models\SerpUrl;
use Carbon\Carbon;
use Exception;
/**
 * Reads a stored news SERP payload and upserts its result items as SerpUrl rows.
 */
class ParseNewsSerpDomainsTask
{
    /**
     * Parse the SERP payload referenced by $news_serp_result and persist up to
     * $serp_counts accepted items as SerpUrl records.
     *
     * The payload is read through OSSUploader::readJson() using the configured
     * driver and path; the items are taken from tasks[0]->result[0]->items.
     *
     * An item is skipped when its URL is empty, when a matching SerpUrl row has
     * status "blocked", or when its title contains a colon.
     *
     * @param NewsSerpResult $news_serp_result SERP result whose file is parsed.
     * @param int            $serp_counts      Number of accepted items after which
     *                                         processing stops (at least one accepted
     *                                         item is always processed).
     *
     * @return bool True when at least one SerpUrl row was saved successfully.
     */
    public static function handle(NewsSerpResult $news_serp_result, $serp_counts = 1): bool
    {
        try {
            $serp_results = OSSUploader::readJson(
                config('platform.dataset.news.news_serp.driver'),
                config('platform.dataset.news.news_serp.path'),
                $news_serp_result->filename
            )?->tasks[0]?->result[0]?->items;
        } catch (Exception $e) {
            // Best effort: an unreadable payload is reported as "nothing saved".
            return false;
        }

        if (is_null($serp_results)) {
            return false;
        }

        $success = false;
        $accepted = 0;

        foreach ($serp_results as $serp_item) {
            // NOTE(review): is_empty() is not a PHP built-in; presumably a global
            // project helper — confirm its semantics.
            if (is_empty($serp_item->url)) {
                continue;
            }

            $raw_url = (string) $serp_item->url;
            $normalized_url = self::normalizeUrl($raw_url);

            // Rows are stored under the normalized URL, so look that up first.
            // The raw URL is a fallback for rows that were stored un-normalized.
            $serp_url = SerpUrl::where('url', $normalized_url)->first();
            if (is_null($serp_url) && $raw_url !== $normalized_url) {
                $serp_url = SerpUrl::where('url', $raw_url)->first();
            }

            if ($serp_url?->status === 'blocked') {
                continue;
            }

            // Cast guards against a missing/null title (null is deprecated for
            // str_contains since PHP 8.1).
            // NOTE(review): the reason colon-containing titles are rejected is not
            // visible here — confirm with the author.
            if (str_contains((string) ($serp_item->title ?? ''), ':')) {
                continue;
            }

            // The model is resolved per item: reusing one instance across items
            // would write every accepted item onto the same row.
            if (is_null($serp_url)) {
                $serp_url = new SerpUrl;
                $serp_url->category_id = $news_serp_result->category_id;
                $serp_url->category_name = $news_serp_result->category_name;
                $serp_url->news_serp_result_id = $news_serp_result->id;
            }

            $serp_url->source = 'serp';
            $serp_url->url = $normalized_url;
            $serp_url->country_iso = $news_serp_result->serp_country_iso;

            // Only overwrite title/description when the payload provides a value.
            $title = $serp_item->title ?? null;
            if (! is_empty($title)) {
                $serp_url->title = $title;
            }

            $snippet = $serp_item->snippet ?? null;
            if (! is_empty($snippet)) {
                $serp_url->description = $snippet;
            }

            // Stamp serp_at only when something actually changed on the row.
            if ($serp_url->isDirty()) {
                $serp_url->serp_at = $news_serp_result->category?->serp_at;
            }

            if ($serp_url->save()) {
                $success = true;
            }

            // Accepted items count towards the limit whether or not save() succeeded.
            $accepted++;
            if ($accepted >= $serp_counts) {
                break;
            }
        }

        return $success;
    }

    /**
     * Normalize a URL to "https://host/path" form.
     *
     * The scheme is forced to https, trailing slashes on the path are removed,
     * and the query string and fragment are dropped (as are port and user info,
     * since only host and path are re-assembled).
     *
     * @param string $url URL to normalize.
     *
     * @return string The normalized URL, or $url unchanged when it cannot be
     *                parsed or has no host component.
     */
    private static function normalizeUrl(string $url): string
    {
        $parts = parse_url($url);

        // parse_url() returns false for seriously malformed URLs; relative URLs
        // parse fine but carry no host. Either way there is nothing to normalize.
        if (! is_array($parts) || ! isset($parts['host'])) {
            return $url;
        }

        $path = rtrim($parts['path'] ?? '', '/');

        return sprintf('https://%s%s', $parts['host'], $path);
    }
}