Add (initial): futurewalker code
This commit is contained in:
207
app/Jobs/Tasks/ParseDFSNewsTask.php
Normal file
207
app/Jobs/Tasks/ParseDFSNewsTask.php
Normal file
@@ -0,0 +1,207 @@
|
||||
<?php
|
||||
|
||||
namespace App\Jobs\Tasks;
|
||||
|
||||
use App\Helpers\FirstParty\OpenAI\OpenAI;
|
||||
use App\Helpers\FirstParty\OSSUploader\OSSUploader;
|
||||
use App\Jobs\IdentifyCrawlSourcesJob;
|
||||
use App\Models\Category;
|
||||
use App\Models\NewsSerpResult;
|
||||
use App\Models\SerpUrl;
|
||||
use App\Models\ServiceCostUsage;
|
||||
use Carbon\Carbon;
|
||||
use Exception;
|
||||
|
||||
/**
 * Turns a stored news SERP result into SerpUrl rows and asks OpenAI which of
 * the collected titles should be crawled.
 */
class ParseDFSNewsTask
{
    /**
     * Parse the SERP JSON referenced by the given result, persist every
     * acceptable item as a SerpUrl and dispatch crawl-source jobs for the
     * titles picked by OpenAI.
     *
     * Flow:
     *  1. Read the JSON file named by $news_serp_result->filename and take
     *     tasks[0]->result[0]->items from it.
     *  2. Keep "news_search" items that have a URL, are not blacklisted by
     *     domain or keyword, are not stored as "blocked", and whose title has
     *     no colon.
     *  3. Create or update one SerpUrl per kept item, keyed by normalized URL.
     *  4. Send the saved id => title map to OpenAI, record the cost, flag the
     *     picked rows and dispatch IdentifyCrawlSourcesJob for each of them.
     *
     * @param  NewsSerpResult  $news_serp_result  SERP result whose file is parsed.
     * @param  int  $serp_counts  Maximum number of SERP items to keep. The limit
     *                            is checked after an item is appended, so one
     *                            item is still kept when the limit is <= 1.
     * @return bool True when at least one SerpUrl row was saved.
     */
    public static function handle(NewsSerpResult $news_serp_result, $serp_counts = 100): bool
    {
        $serp_results = self::loadSerpItems($news_serp_result);

        if (is_null($serp_results)) {
            return false;
        }

        $valid_serps = self::filterSerpItems($serp_results, $serp_counts);

        $success = false;
        $serp_titles = [];

        foreach ($valid_serps as $serp_item) {
            $serp_url = self::upsertSerpUrl($serp_item, $news_serp_result);

            // Only rows that were actually persisted have a usable id, so only
            // those are offered to the title picker.
            if (! is_null($serp_url)) {
                $success = true;
                $serp_titles[$serp_url->id] = $serp_url->title;
            }
        }

        // Nothing to rank: skip the (billed) OpenAI request entirely.
        if ($serp_titles === []) {
            return $success;
        }

        self::pickTopTitles($serp_titles, $news_serp_result);

        return $success;
    }

    /**
     * Read the SERP item list from storage.
     *
     * @return mixed The decoded "items" value, or null when the file cannot be
     *               read or does not contain the expected structure.
     */
    private static function loadSerpItems(NewsSerpResult $news_serp_result)
    {
        try {
            return OSSUploader::readJson(
                config('platform.dataset.news.news_serp.driver'),
                config('platform.dataset.news.news_serp.path'),
                $news_serp_result->filename
            )?->tasks[0]?->result[0]?->items;
        } catch (Exception $e) {
            // Best effort: an unreadable file is treated as "no results".
            return null;
        }
    }

    /**
     * Select the SERP items that are eligible to be stored.
     *
     * @param  iterable  $serp_results  Decoded SERP items.
     * @param  int  $serp_counts  Stop once this many items have been collected.
     * @return array List of accepted SERP items, in their original order.
     */
    private static function filterSerpItems($serp_results, $serp_counts): array
    {
        // Loop-invariant configuration; a missing entry becomes an empty list.
        $blacklist_keywords = (array) config('platform.global.blacklist_keywords_serp');
        $blacklist_domains = (array) config('platform.global.blacklist_domains_serp');

        $valid_serps = [];

        foreach ($serp_results as $serp_item) {
            if (($serp_item->type ?? null) !== 'news_search') {
                continue;
            }

            if (is_empty($serp_item->url)) {
                continue;
            }

            if (self::isBlacklisted($serp_item, $blacklist_domains, $blacklist_keywords)) {
                continue;
            }

            // Titles containing a colon are rejected. Checked before the
            // database lookup because it is the cheaper test.
            if (str_contains((string) ($serp_item->title ?? ''), ':')) {
                continue;
            }

            if (self::isBlockedUrl($serp_item->url)) {
                continue;
            }

            $valid_serps[] = $serp_item;

            if (count($valid_serps) >= $serp_counts) {
                break;
            }
        }

        return $valid_serps;
    }

    /**
     * Tell whether a SERP item matches a blacklisted domain or keyword.
     *
     * Domains are matched as substrings of the item's domain. Keywords are
     * matched as substrings of the lower-cased title and snippet, so the
     * configured keywords are expected to be lower-case.
     *
     * @param  object  $serp_item
     * @param  array  $blacklist_domains
     * @param  array  $blacklist_keywords
     */
    private static function isBlacklisted($serp_item, array $blacklist_domains, array $blacklist_keywords): bool
    {
        $item_domain = (string) ($serp_item->domain ?? '');

        foreach ($blacklist_domains as $domain) {
            if (str_contains($item_domain, $domain)) {
                return true;
            }
        }

        // Multibyte-aware lower-casing so non-ASCII titles are compared correctly.
        $title = mb_strtolower((string) ($serp_item->title ?? ''));
        $snippet = mb_strtolower((string) ($serp_item->snippet ?? ''));

        foreach ($blacklist_keywords as $word) {
            if (str_contains($title, $word) || str_contains($snippet, $word)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Tell whether the URL is already stored with status "blocked".
     *
     * Rows written by this class are stored under the normalized URL, so both
     * the raw and the normalized form are checked.
     *
     * @param  string  $url  Raw URL as found in the SERP item.
     */
    private static function isBlockedUrl($url): bool
    {
        $candidate_urls = array_values(array_unique([$url, self::normalizeUrl($url)]));

        return SerpUrl::whereIn('url', $candidate_urls)
            ->where('status', 'blocked')
            ->exists();
    }

    /**
     * Create or update the SerpUrl row for a SERP item.
     *
     * @param  object  $serp_item
     * @return SerpUrl|null The saved model, or null when saving failed.
     */
    private static function upsertSerpUrl($serp_item, NewsSerpResult $news_serp_result)
    {
        $normalized_url = self::normalizeUrl($serp_item->url);

        $serp_url = SerpUrl::where('url', $normalized_url)->first();

        if (is_null($serp_url)) {
            $serp_url = new SerpUrl;
            $serp_url->news_serp_result_id = $news_serp_result->id;
        }

        $serp_url->source = 'serp';
        $serp_url->url = $normalized_url;
        $serp_url->country_iso = $news_serp_result->serp_country_iso;

        if (! is_empty($serp_item->title)) {
            $serp_url->title = remove_newline($serp_item->title);
        }

        if (! is_empty($serp_item->snippet)) {
            $serp_url->description = remove_newline($serp_item->snippet);
        }

        // Refresh the SERP timestamp only when the SERP data above changed
        // something; must run before url_posted_at is touched below.
        if ($serp_url->isDirty()) {
            $serp_url->serp_at = now();
        }

        if ((isset($serp_item->timestamp)) && (! is_empty($serp_item->timestamp))) {
            $serp_url->url_posted_at = Carbon::parse($serp_item->timestamp);
        } else {
            $serp_url->url_posted_at = now();
        }

        return $serp_url->save() ? $serp_url : null;
    }

    /**
     * Ask OpenAI for the best titles, record the request cost, mark the picked
     * rows and queue a crawl-source job for each of them.
     *
     * @param  array  $serp_titles  Map of SerpUrl id => title sent to OpenAI.
     */
    private static function pickTopTitles(array $serp_titles, NewsSerpResult $news_serp_result): void
    {
        $ids_response = OpenAI::topTitlePicksById(json_encode($serp_titles));

        if (! isset($ids_response->output->ids)) {
            return;
        }

        $service_cost_usage = new ServiceCostUsage;
        $service_cost_usage->cost = $ids_response->cost;
        $service_cost_usage->name = 'openai-topTitlePicksById';
        $service_cost_usage->reference_1 = 'news_serp_result';
        $service_cost_usage->reference_2 = strval($news_serp_result->id);
        $service_cost_usage->output = $ids_response;
        $service_cost_usage->save();

        // The ids come from model output: accept only ids that were part of
        // the submitted batch so unrelated rows are never flagged or crawled.
        $picked_ids = array_values(array_filter(
            (array) $ids_response->output->ids,
            static fn ($id): bool => (is_int($id) || is_string($id)) && array_key_exists($id, $serp_titles),
        ));

        if ($picked_ids === []) {
            return;
        }

        SerpUrl::whereIn('id', $picked_ids)->update(['picked' => true]);

        foreach ($picked_ids as $id) {
            IdentifyCrawlSourcesJob::dispatch($id)->onQueue('default')->onConnection('default');
        }
    }

    /**
     * Normalize a URL so equivalent links map to a single stored value.
     *
     * The result is "https://<host><path>": the scheme is forced to https,
     * trailing slashes are stripped from the path, and the query string,
     * fragment, port and credentials are dropped because they are not part of
     * the rebuilt URL.
     *
     * @param  string  $url
     * @return string The normalized URL, or the original value when it cannot
     *                be parsed or has no host.
     */
    private static function normalizeUrl($url)
    {
        $parsedUrl = parse_url($url);

        // parse_url() returns false on seriously malformed input; without a
        // host there is nothing sensible to rebuild either.
        if (! is_array($parsedUrl) || ! isset($parsedUrl['host'])) {
            return $url;
        }

        $path = rtrim($parsedUrl['path'] ?? '', '/');

        return sprintf('https://%s%s', $parsedUrl['host'], $path);
    }
}
|
||||
Reference in New Issue
Block a user