Add (initial): futurewalker code

This commit is contained in:
2023-11-20 00:15:18 +08:00
parent f8602cb456
commit 9ce3e5c82a
166 changed files with 15941 additions and 1072 deletions

View File

@@ -0,0 +1,132 @@
<?php
namespace App\Jobs\Tasks;
use App\Helpers\FirstParty\DFS\DFSSerp;
use App\Helpers\FirstParty\OpenAI\OpenAI;
use App\Helpers\ThirdParty\DFS\SettingSerpLiveAdvanced;
use App\Jobs\CrawlUrlResearchJob;
use App\Models\SerpUrl;
use App\Models\SerpUrlResearch;
use App\Models\ServiceCostUsage;
use Exception;
class BrowseDFSForResearchTask
{
    /** Upper bound on newly stored research URLs per invocation. */
    private const MAX_NEW_RESEARCH_URLS = 10;

    /**
     * Run a Google News search (through DFSSerp) for the first proposed search query of a
     * SerpUrl, store the returned news URLs as SerpUrlResearch rows and queue a crawl job
     * for the oldest row of that SerpUrl that has no content yet.
     *
     * Returns without doing anything when the SerpUrl cannot be found, has no usable
     * proposed search query, or the search response does not carry status code 20000.
     *
     * @param int $serp_url_id Primary key of the SerpUrl to research.
     *
     * @throws \JsonException When the search response is not valid JSON.
     */
    public static function handle(int $serp_url_id)
    {
        $serp_url = SerpUrl::find($serp_url_id);
        if (is_null($serp_url) || is_null($serp_url->suggestion_data)) {
            return;
        }

        if (! isset($serp_url->suggestion_data->proposed_search_queries)) {
            return;
        }

        $search_queries = $serp_url->suggestion_data->proposed_search_queries;
        if (! is_array($search_queries) || count($search_queries) === 0) {
            return;
        }

        // Only the first proposed query is researched.
        $search_query = $search_queries[0] ?? null;
        if (! is_string($search_query) || trim($search_query) === '') {
            return;
        }

        $country_name = get_country_name_by_iso($serp_url->country_iso);
        $serp_res = DFSSerp::liveAdvanced('google', 'news', $search_query, $country_name, 'en', 100);

        // JSON_THROW_ON_ERROR lets malformed JSON bubble up to the caller as an exception.
        $serp_obj = json_decode($serp_res, false, 512, JSON_THROW_ON_ERROR);
        if ($serp_obj?->status_code != 20000) {
            return;
        }

        // NOTE(review): the request above uses the "news" search type, yet this usage row is
        // labelled "organic". The labels are persisted data, so they are left untouched here;
        // confirm whether reporting expects "news" instead.
        $service_cost_usage = new ServiceCostUsage;
        $service_cost_usage->cost = $serp_obj->cost;
        $service_cost_usage->name = 'dataforseo-GoogleSerpApiAdvancedLiveOrganic';
        $service_cost_usage->reference_1 = 'google';
        $service_cost_usage->reference_2 = 'organic';
        $service_cost_usage->output = $serp_obj;
        $service_cost_usage->input_1 = $country_name;
        $service_cost_usage->input_2 = $search_query;
        $service_cost_usage->save();

        // A successful response may still contain no result items; treat that as an empty list.
        $results = $serp_obj->tasks[0]->result[0]->items ?? [];
        if (! is_iterable($results)) {
            $results = [];
        }

        $saved_count = 0;
        foreach ($results as $result) {
            if ($saved_count >= self::MAX_NEW_RESEARCH_URLS) {
                break;
            }

            if (($result->type ?? null) != 'news_search') {
                continue;
            }

            // A result without a URL cannot be crawled later on.
            $result_url = $result->url ?? null;
            if (! is_string($result_url) || $result_url === '') {
                continue;
            }

            $already_known = SerpUrlResearch::where('url', $result_url)
                ->where('serp_url_id', $serp_url_id)
                ->exists();
            if ($already_known) {
                continue;
            }

            $serp_url_research = new SerpUrlResearch;
            $serp_url_research->serp_url_id = $serp_url_id;
            $serp_url_research->url = $result_url;
            $serp_url_research->query = $search_query;
            $serp_url_research->content = null;
            if ($serp_url_research->save()) {
                $saved_count++;
            }
        }

        // Start the crawl chain with the oldest row that has not been crawled yet.
        $first_serp_url_research = SerpUrlResearch::where('serp_url_id', $serp_url_id)
            ->whereNull('content')
            ->orderBy('created_at', 'ASC')
            ->first();

        // Nothing to crawl, e.g. the search returned no news items.
        if (is_null($first_serp_url_research)) {
            return;
        }

        CrawlUrlResearchJob::dispatch($first_serp_url_research->id)->onQueue('default')->onConnection('default');
    }
}
// {
// "identified_keywords":[
// "Humane AI Pin",
// "costs",
// "OpenAI",
// "T-Mobile integration"
// ],
// "related_keywords":[
// "artificial intelligence device",
// "monthly subscription",
// "OpenAI partnership",
// "T-Mobile collaboration"
// ],
// "proposed_search_queries":[
// "Humane AI Pin features",
// "Cost of Humane AI Pin",
// "Humane AI Pin integration with OpenAI and T-Mobile",
// "Reviews of Humane AI Pin"
// ],
// "writing_tone":[
// "engaging",
// "informative"
// ],
// "article_headings":[
// "Introduction to Humane AI Pin",
// "Features of Humane AI Pin",
// "Cost and Subscription Details",
// "OpenAI and T-Mobile Integration"
// ]
// }

View File

@@ -2,46 +2,59 @@
namespace App\Jobs\Tasks;
use App\Helpers\FirstParty\DFS\DFSSerp;
use App\Helpers\FirstParty\OSSUploader\OSSUploader;
use App\Helpers\ThirdParty\DFS\SettingSerpLiveAdvanced;
use App\Models\Category;
use App\Models\NewsSerpResult;
use App\Models\ServiceCostUsage;
use DFSClientV3\DFSClient;
use Exception;
use Illuminate\Support\Facades\Log;
class GetNewsSerpTask
class BrowseDFSLatestNewsTask
{
public static function handle(Category $category, $country_iso)
public static function handle(string $keyword, $country_iso)
{
$country_name = get_country_name_by_iso($country_iso);
$keyword = strtolower("{$category->name}");
// $client = new DFSClient(
// config('dataforseo.login'),
// config('dataforseo.password'),
// config('dataforseo.timeout'),
// config('dataforseo.api_version'),
// config('dataforseo.url'),
// );
$client = new DFSClient(
config('dataforseo.login'),
config('dataforseo.password'),
config('dataforseo.timeout'),
config('dataforseo.api_version'),
config('dataforseo.url'),
);
// // You will receive SERP data specific to the indicated keyword, search engine, and location parameters
// $serp_model = new SettingSerpLiveAdvanced();
// You will receive SERP data specific to the indicated keyword, search engine, and location parameters
$serp_model = new SettingSerpLiveAdvanced();
// $serp_model->setSe('google');
// $serp_model->setSeType('news');
// $serp_model->setSearchParam('&tbs=qdr:d');
// $serp_model->setKeyword($keyword);
// $serp_model->setLocationName($country_name);
// $serp_model->setDepth(100);
// $serp_model->setLanguageCode('en');
// $serp_res = $serp_model->getAsJson();
$serp_model->setSe('google');
$serp_model->setSeType('news');
$serp_model->setKeyword($keyword);
$serp_model->setLocationName($country_name);
$serp_model->setDepth(100);
$serp_model->setLanguageCode('en');
$serp_res = $serp_model->getAsJson();
$serp_res = DFSSerp::liveAdvanced('google', 'news', $keyword, $country_name, 'en', 100, '&tbs=qdr:d');
try {
$serp_obj = json_decode($serp_res, false, 512, JSON_THROW_ON_ERROR);
if ($serp_obj?->status_code == 20000) {
$json_file_name = config('platform.dataset.news.news_serp.file_prefix').str_slug($category->name).'-'.epoch_now_timestamp().'.json';
$service_cost_usage = new ServiceCostUsage;
$service_cost_usage->cost = $serp_obj->cost;
$service_cost_usage->name = 'dataforseo-GoogleSerpApiAdvancedLiveNews';
$service_cost_usage->reference_1 = 'google';
$service_cost_usage->reference_2 = 'news';
$service_cost_usage->output = $serp_obj;
$service_cost_usage->input_1 = $country_name;
$service_cost_usage->input_2 = $keyword;
$service_cost_usage->save();
$json_file_name = config('platform.dataset.news.news_serp.file_prefix').str_slug($keyword).'-'.epoch_now_timestamp().'.json';
$upload_status = OSSUploader::uploadJson(
config('platform.dataset.news.news_serp.driver'),
@@ -50,9 +63,8 @@ public static function handle(Category $category, $country_iso)
$serp_obj);
if ($upload_status) {
$news_serp_result = new NewsSerpResult;
$news_serp_result->category_id = $category->id;
$news_serp_result->category_name = $category->name;
$news_serp_result->serp_provider = 'dfs';
$news_serp_result->serp_se = 'google';
$news_serp_result->serp_se_type = 'news';
@@ -62,10 +74,7 @@ public static function handle(Category $category, $country_iso)
$news_serp_result->result_count = $serp_obj?->tasks[0]?->result[0]?->items_count;
$news_serp_result->filename = $json_file_name;
$news_serp_result->status = 'initial';
if ($news_serp_result->save()) {
$category->serp_at = now();
$category->save();
}
$news_serp_result->save();
return $news_serp_result;
} else {

View File

@@ -0,0 +1,207 @@
<?php
namespace App\Jobs\Tasks;
use App\Jobs\CrawlUrlResearchJob;
use App\Jobs\WriteWithAIJob;
use App\Models\SerpUrl;
use App\Models\SerpUrlResearch;
use Exception;
use fivefilters\Readability\Configuration as ReadabilityConfiguration;
use fivefilters\Readability\ParseException as ReadabilityParseException;
use fivefilters\Readability\Readability;
use Illuminate\Support\Facades\Http;
use League\HTMLToMarkdown\HtmlConverter;
use Symfony\Component\DomCrawler\Crawler;
class CrawlUrlResearchTask
{
    /** Number of successfully crawled research pages after which the article is written. */
    private const REQUIRED_RESEARCH_COUNT = 3;

    /** Marker stored as content when a page could not be fetched. */
    private const EMPTY_CONTENT_MARKER = 'EMPTY CONTENT';

    /**
     * Fetch one research URL, store its plain-text content and main image, then either queue
     * the AI write-up (once enough pages of the same SerpUrl have content) or queue the crawl
     * of the next research row that has no content yet.
     *
     * @param int $serp_url_research_id Primary key of the SerpUrlResearch row to crawl.
     *
     * @return null
     */
    public static function handle(int $serp_url_research_id)
    {
        $serp_url_research = SerpUrlResearch::find($serp_url_research_id);
        if (is_null($serp_url_research)) {
            return null;
        }

        $raw_html = self::fetchHtml($serp_url_research->url);

        if (! is_empty($raw_html)) {
            $serp_url_research->content = self::getMarkdownFromHtml($raw_html);
            $serp_url_research->main_image = self::getMainImageFromHtml($raw_html);
        } else {
            // A non-null marker keeps this row from being picked up again by the whereNull() query below.
            $serp_url_research->content = self::EMPTY_CONTENT_MARKER;
        }
        $serp_url_research->save();

        $completed_count = SerpUrlResearch::where('serp_url_id', $serp_url_research->serp_url_id)
            ->where('content', '!=', self::EMPTY_CONTENT_MARKER)
            ->whereNotNull('content')
            ->count();

        if ($completed_count >= self::REQUIRED_RESEARCH_COUNT) {
            $serp_url = SerpUrl::find($serp_url_research->serp_url_id);
            if (! is_null($serp_url)) {
                $serp_url->crawled = true;
                $serp_url->save();
                WriteWithAIJob::dispatch($serp_url->id)->onQueue('default')->onConnection('default');
            }

            return null;
        }

        $next_serp_url_research = SerpUrlResearch::where('serp_url_id', $serp_url_research->serp_url_id)
            ->whereNull('content')
            ->first();
        if (! is_null($next_serp_url_research)) {
            CrawlUrlResearchJob::dispatch($next_serp_url_research->id)->onQueue('default')->onConnection('default');
        }

        return null;
    }

    /**
     * Download a page through the rotating proxy.
     *
     * Failures are deliberately non-fatal so the crawl chain can move on to the next URL;
     * they are logged instead of being silently discarded.
     *
     * @param mixed $url Address of the page to download.
     *
     * @return string|null Response body, or null when the request failed or was not successful.
     */
    private static function fetchHtml($url)
    {
        try {
            $response = Http::withHeaders([
                'User-Agent' => config('platform.proxy.user_agent'),
            ])
                ->withOptions([
                    'proxy' => get_smartproxy_rotating_server(),
                    'timeout' => 10,
                    // NOTE(review): TLS verification is switched off for these requests — confirm this is intentional.
                    'verify' => false,
                ])
                ->get($url);

            if (! $response->successful()) {
                logger()->warning('CrawlUrlResearchTask: unsuccessful response', [
                    'url' => $url,
                    'status' => $response->status(),
                ]);

                return null;
            }

            return $response->body();
        } catch (Exception $e) {
            logger()->warning('CrawlUrlResearchTask: request failed', [
                'url' => $url,
                'error' => $e->getMessage(),
            ]);

            return null;
        }
    }

    /**
     * Ask Readability for the lead image of a page.
     *
     * @return mixed Value of Readability::getImage(), or null when the page cannot be parsed.
     */
    private static function getMainImageFromHtml($html)
    {
        $r_configuration = new ReadabilityConfiguration();
        $r_configuration->setCharThreshold(20);
        $readability = new Readability($r_configuration);

        try {
            $readability->parse($html);

            return $readability->getImage();
        } catch (ReadabilityParseException $e) {
            // An unparsable page simply has no main image.
            return null;
        }
    }

    /**
     * Turn raw HTML into de-duplicated plain text: clean the DOM, convert it to markdown,
     * tidy the markdown and finally strip the markdown syntax.
     */
    private static function getMarkdownFromHtml($html)
    {
        $converter = new HtmlConverter([
            'strip_tags' => true,
            'strip_placeholder_links' => true,
        ]);

        $markdown = $converter->convert(self::cleanHtml($html));
        $markdown = self::reverseLTGT($markdown);
        $markdown = self::normalizeNewLines($markdown);
        $markdown = self::removeDuplicateLines($markdown);

        return html_entity_decode(markdown_to_plaintext($markdown));
    }

    /**
     * Decode the `&lt;` and `&gt;` entities back into angle brackets.
     */
    private static function reverseLTGT(string $input): string
    {
        return str_replace(['&lt;', '&gt;'], ['<', '>'], $input);
    }

    /**
     * Keep only the first occurrence of every line, preserving the original order.
     */
    private static function removeDuplicateLines(string $string): string
    {
        return implode("\n", array_unique(explode("\n", $string)));
    }

    /**
     * Trim every line, drop blank lines, merge an image line with the caption line that
     * follows it, and replace runs of two or more spaces with " - ".
     */
    private static function normalizeNewLines(string $content): string
    {
        $lines = explode("\n", $content);
        $line_count = count($lines);
        $processed_lines = [];

        for ($i = 0; $i < $line_count; $i++) {
            $line = trim($lines[$i]);

            // An image line absorbs the next line unless that line is blank or a markdown rule/underline.
            if (preg_match("/^!\[.*\]\(.*\)$/", $line) === 1 && isset($lines[$i + 1])) {
                $next_line = trim($lines[$i + 1]);
                if ($next_line !== '' && preg_match('/^[-=#*&_]+$/', $next_line) !== 1) {
                    $line .= ' '.$next_line;
                    $i++; // The merged line must not be processed again.
                }
            }

            // Compare against '' explicitly: empty() would also discard a line that is just "0".
            if ($line !== '') {
                $processed_lines[] = $line;
            }
        }

        // Collapse excessive newlines
        $result = preg_replace("/\n{3,}/", "\n\n", implode("\n", $processed_lines));
        // Join an image with text that still sits on the following line
        $result = preg_replace('/^(!\[.*?\]\(.*?\))\s*\n\s*([^\n!]+)/m', '$1 $2', $result);
        // Replace multiple spaces with a dash separator
        $result = preg_replace('/ {2,}/', ' - ', $result);

        return $result;
    }

    /**
     * Remove non-content elements from the document and unwrap every <span> into plain text.
     */
    private static function cleanHtml($htmlContent)
    {
        $crawler = new Crawler($htmlContent);

        // Tags that are removed together with everything inside them.
        $tags_to_remove = ['script', 'style', 'svg', 'picture', 'form', 'footer', 'nav', 'aside'];
        foreach ($tags_to_remove as $tag) {
            $crawler->filter($tag)->each(function ($node) {
                foreach ($node as $dom_node) {
                    $dom_node->parentNode?->removeChild($dom_node);
                }
            });
        }

        // Replace <span> tags with their text content.
        $crawler->filter('span')->each(function ($node) {
            $replacement = new \DOMText($node->text());
            foreach ($node as $dom_node) {
                $dom_node->parentNode?->replaceChild($replacement, $dom_node);
            }
        });

        return $crawler->outerHtml();
    }
}

View File

@@ -0,0 +1,267 @@
<?php
namespace App\Jobs\Tasks;
use App\Helpers\FirstParty\OpenAI\OpenAI;
use App\Helpers\FirstParty\OSSUploader\OSSUploader;
use App\Jobs\SchedulePublishPost;
use App\Models\Entity;
use App\Models\PostEntity;
use App\Models\Category;
use App\Models\Post;
use App\Models\PostCategory;
use App\Models\SerpUrlResearch;
use App\Models\ServiceCostUsage;
use Exception;
use Illuminate\Support\Facades\Http;
use Image;
class FillPostMetadataTask
{
    /**
     * Complete a post with AI-generated metadata (keywords, title, summary, society impact),
     * a slug, a featured image, a category and entities, then queue its publish scheduling.
     *
     * Existing metadata on the post is reused; otherwise it is requested from OpenAI and the
     * cost of that call is recorded.
     *
     * @param int $post_id Primary key of the post to complete.
     */
    public static function handle(int $post_id)
    {
        $post = Post::find($post_id);
        if (is_null($post)) {
            return;
        }

        $post_meta_response = self::resolveMetadata($post);
        $meta = $post_meta_response->output ?? null;

        if (! is_null($meta)) {
            $post->metadata = $post_meta_response;
            self::applyMetadata($post, $meta);
        }

        if (is_empty($post->slug)) {
            $post->slug = str_slug($post->title);
        }

        if (is_empty($post->featured_image)) {
            $post = self::setPostImage($post);
        }

        if (! $post->save()) {
            return;
        }

        self::syncCategory($post, $meta);
        self::syncEntities($post, $meta);

        // Set Schedule Publish
        SchedulePublishPost::dispatch($post->id, 'future')->onQueue('default')->onConnection('default');
    }

    /**
     * Return the metadata already stored on the post, or fetch it from OpenAI and record
     * the cost when the call produced an output.
     *
     * @return mixed Stored metadata or the OpenAI response.
     */
    private static function resolveMetadata($post)
    {
        if (! is_null($post->metadata)) {
            return $post->metadata;
        }

        $response = OpenAI::getArticleMeta($post->body, 1536, 30);

        if (isset($response->output)) {
            $service_cost_usage = new ServiceCostUsage;
            $service_cost_usage->cost = $response->cost;
            $service_cost_usage->name = 'openai-getArticleMeta';
            $service_cost_usage->reference_1 = 'post';
            $service_cost_usage->reference_2 = strval($post->id);
            $service_cost_usage->output = $response;
            $service_cost_usage->save();
        }

        return $response;
    }

    /**
     * Copy the metadata fields that are present and non-empty onto the post.
     * The post title is only filled in when the post does not have one yet.
     */
    private static function applyMetadata($post, $meta)
    {
        if (isset($meta->keywords) && is_array($meta->keywords) && count($meta->keywords) > 0) {
            $post->keywords = $meta->keywords;
            $post->main_keyword = $meta->keywords[0];
        }

        if (isset($meta->title) && is_empty($post->title) && ! is_empty($meta->title)) {
            $post->title = $meta->title;
        }

        if (isset($meta->summary) && ! is_empty($meta->summary)) {
            $post->bites = $meta->summary;
        }

        if (isset($meta->society_impact) && ! is_empty($meta->society_impact)) {
            $post->society_impact = $meta->society_impact;
        }

        // The level goes into its own attribute; writing it to society_impact would
        // overwrite the impact text assigned just above.
        if (isset($meta->society_impact_level) && ! is_empty($meta->society_impact_level)) {
            $post->society_impact_level = $meta->society_impact_level;
        }
    }

    /**
     * Attach the post to the category named in the metadata, falling back to "Updates".
     */
    private static function syncCategory($post, $meta)
    {
        $category_name = 'Updates';
        if (isset($meta->category) && ! is_empty($meta->category)) {
            $category_name = $meta->category;
        }

        $category = Category::where('name', $category_name)->first()
            ?? Category::where('name', 'Updates')->first();

        // Neither the suggested nor the fallback category exists: leave the post uncategorised.
        if (is_null($category)) {
            logger()->warning('FillPostMetadataTask: no category found', [
                'post_id' => $post->id,
                'category' => $category_name,
            ]);

            return;
        }

        $post_category = PostCategory::where('post_id', $post->id)->first();
        if (is_null($post_category)) {
            $post_category = new PostCategory;
            $post_category->post_id = $post->id;
        }
        $post_category->category_id = $category->id;
        $post_category->save();
    }

    /**
     * Replace the post's entity links with the entities listed in the metadata, creating
     * Entity rows that do not exist yet. Existing links are kept when the metadata lists
     * no usable entity name.
     */
    private static function syncEntities($post, $meta)
    {
        if (! isset($meta->entities) || ! is_array($meta->entities)) {
            return;
        }

        // Keep only non-blank string names so trim() and str_slug() get valid input.
        $entity_names = [];
        foreach ($meta->entities as $entity_name) {
            if (is_string($entity_name) && trim($entity_name) !== '') {
                $entity_names[] = trim($entity_name);
            }
        }

        if (count($entity_names) === 0) {
            return;
        }

        PostEntity::where('post_id', $post->id)->delete();

        foreach ($entity_names as $entity_name) {
            $entity = Entity::where('name', $entity_name)->first();
            if (is_null($entity)) {
                $entity = new Entity;
                $entity->name = $entity_name;
                $entity->slug = str_slug($entity_name);
                $entity->save();
            }

            // Two listed names can resolve to the same entity; link it only once.
            $already_linked = PostEntity::where('post_id', $post->id)
                ->where('entity_id', $entity->id)
                ->exists();
            if ($already_linked) {
                continue;
            }

            $post_entity = new PostEntity;
            $post_entity->post_id = $post->id;
            $post_entity->entity_id = $entity->id;
            $post_entity->save();
        }
    }

    /**
     * Pick the first usable main image among the post's research pages, resize it to a
     * 1080x608 JPEG plus a thumbnail, upload both and store the image path on the post.
     *
     * A candidate that cannot be downloaded or decoded, or that is larger than 1 MB, is
     * skipped and the next research page is tried.
     *
     * @return mixed The same post, with featured_image set when an image was uploaded.
     */
    private static function setPostImage($post)
    {
        $canvas_width = 1080;
        $canvas_height = 608;
        $thumb_width = 540;
        // NOTE(review): 78 does not match the 1080x608 canvas ratio (540x304 would) — confirm intended size.
        $thumb_height = 78;

        $serp_url_researches = SerpUrlResearch::where('serp_url_id', $post->serp_url_id)->get();

        foreach ($serp_url_researches as $serp_url_research) {
            $main_image_url = $serp_url_research->main_image;
            if (is_empty($main_image_url) || ! is_valid_url($main_image_url)) {
                continue;
            }

            try {
                $image_response = Http::timeout(300)->withHeaders([
                    'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
                ])->get($main_image_url);

                // An error page is not an image; try the next candidate.
                if (! $image_response->successful()) {
                    continue;
                }

                $image_content = $image_response->body();

                // Skip images larger than 1024 KB.
                if (strlen($image_content) / 1024 > 1024) {
                    continue;
                }

                $image = Image::make($image_content);

                if ($image->width() > $image->height()) {
                    // Landscape: scale to the canvas width, keeping the aspect ratio.
                    $image->resize($canvas_width, null, function ($constraint) {
                        $constraint->aspectRatio();
                    });
                } else {
                    // Portrait or square: scale to the canvas height, keeping the aspect ratio.
                    $image->resize(null, $canvas_height, function ($constraint) {
                        $constraint->aspectRatio();
                    });
                }

                // Fit the image to the canvas size, without gaps
                $image->fit($canvas_width, $canvas_height, function ($constraint) {
                    $constraint->upsize();
                });

                // The thumbnail is derived from the already fitted image.
                $thumb = clone $image;
                $thumb->resize($thumb_width, $thumb_height, function ($constraint) {
                    $constraint->aspectRatio();
                    $constraint->upsize();
                });

                $epoch_now_timestamp = epoch_now_timestamp();
                $filename = $post->slug.'-'.$epoch_now_timestamp.'.jpg';
                $thumb_filename = $post->slug.'-'.$epoch_now_timestamp.'_thumb.jpg';

                OSSUploader::uploadFile('r2', 'post_images_2/', $filename, (string) $image->stream('jpeg', 75));
                OSSUploader::uploadFile('r2', 'post_images_2/', $thumb_filename, (string) $thumb->stream('jpeg', 50));

                $post->featured_image = 'post_images_2/'.$filename;

                $thumb->destroy();
                $image->destroy();

                // One image is enough.
                break;
            } catch (Exception $e) {
                // Download or decoding failed for this candidate: log it and try the next one.
                logger()->warning('FillPostMetadataTask: image candidate failed', [
                    'post_id' => $post->id,
                    'url' => $main_image_url,
                    'error' => $e->getMessage(),
                ]);

                continue;
            }
        }

        return $post;
    }
}

View File

@@ -94,7 +94,7 @@ public static function handle($post)
}
}
//return $news_serp_result;
//return $news_serp_result;
} else {
throw new Exception('Uploading failed', 1);
}

View File

@@ -0,0 +1,54 @@
<?php
namespace App\Jobs\Tasks;
use App\Helpers\FirstParty\OpenAI\OpenAI;
use App\Jobs\BrowseDFSForResearchJob;
use App\Models\SerpUrl;
use App\Models\ServiceCostUsage;
class IdentifyCrawlSourcesTask
{
    /** How many times OpenAI is asked before giving up. */
    private const MAX_ATTEMPTS = 3;

    /**
     * Ask OpenAI for suggestions based on the SerpUrl title, store them on the SerpUrl as
     * suggestion_data and queue the research search for it.
     *
     * The request is retried up to three times, one second apart, until a response with a
     * non-null output arrives. Every response received is recorded as a service cost.
     * Nothing is stored or queued when no attempt produced an output.
     *
     * @param int $serp_url_id Primary key of the SerpUrl to process.
     */
    public static function handle(int $serp_url_id)
    {
        $serp_url = SerpUrl::find($serp_url_id);
        if (is_null($serp_url)) {
            return;
        }

        $suggestions = null;

        for ($attempt = 1; $attempt <= self::MAX_ATTEMPTS; $attempt++) {
            $suggestion_response = OpenAI::titleSuggestions($serp_url->title);

            // A null response carries no cost information to record.
            if (! is_null($suggestion_response)) {
                $service_cost_usage = new ServiceCostUsage;
                $service_cost_usage->cost = $suggestion_response->cost;
                $service_cost_usage->name = 'openai-titleSuggestions';
                $service_cost_usage->reference_1 = 'serp_url';
                $service_cost_usage->reference_2 = strval($serp_url->id);
                $service_cost_usage->output = $suggestion_response;
                $service_cost_usage->save();
            }

            $suggestions = $suggestion_response->output ?? null;
            if (! is_null($suggestions)) {
                break;
            }

            // Pause before retrying, but not after the final attempt.
            if ($attempt < self::MAX_ATTEMPTS) {
                sleep(1);
            }
        }

        if (is_null($suggestions)) {
            return;
        }

        $serp_url->suggestion_data = $suggestions;
        if ($serp_url->save()) {
            BrowseDFSForResearchJob::dispatch($serp_url_id)->onQueue('default')->onConnection('default');
        }
    }
}

View File

@@ -2,15 +2,19 @@
namespace App\Jobs\Tasks;
use App\Helpers\FirstParty\OpenAI\OpenAI;
use App\Helpers\FirstParty\OSSUploader\OSSUploader;
use App\Jobs\IdentifyCrawlSourcesJob;
use App\Models\Category;
use App\Models\NewsSerpResult;
use App\Models\SerpUrl;
use App\Models\ServiceCostUsage;
use Carbon\Carbon;
use Exception;
class ParseNewsSerpDomainsTask
class ParseDFSNewsTask
{
public static function handle(NewsSerpResult $news_serp_result, $serp_counts = 1)
public static function handle(NewsSerpResult $news_serp_result, $serp_counts = 100)
{
//dd($news_serp_result->category->serp_at);
@@ -35,16 +39,47 @@ public static function handle(NewsSerpResult $news_serp_result, $serp_counts = 1
foreach ($serp_results as $serp_item) {
if ($serp_item->type != 'news_search') {
continue;
}
//dump($serp_item);
if (is_empty($serp_item->url)) {
continue;
}
// if (!str_contains($serp_item->time_published, "hours"))
// {
// continue;
// }
$blacklist_keywords = config('platform.global.blacklist_keywords_serp');
$blacklist_domains = config('platform.global.blacklist_domains_serp');
$skipItem = false;
foreach ($blacklist_domains as $domain) {
if (str_contains($serp_item->domain, $domain)) {
$skipItem = true;
break;
}
}
if (! $skipItem) {
$title = strtolower($serp_item->title);
$snippet = strtolower($serp_item->snippet);
// Check if any unwanted word is in the title or snippet
foreach ($blacklist_keywords as $word) {
if (strpos($title, $word) !== false || strpos($snippet, $word) !== false) {
$skipItem = true;
break; // Break the inner loop as we found an unwanted word
}
}
}
// Skip this iteration if an unwanted word was found
if ($skipItem) {
continue;
}
$serp_url = SerpUrl::where('url', $serp_item->url)->first();
@@ -69,14 +104,14 @@ public static function handle(NewsSerpResult $news_serp_result, $serp_counts = 1
//dd($valid_serps);
$serp_titles = [];
foreach ($valid_serps as $serp_item) {
//dd($serp_item);
$serp_url = SerpUrl::where('url', self::normalizeUrl($serp_item->url))->first();
if (is_null($serp_url)) {
$serp_url = new SerpUrl;
$serp_url->category_id = $news_serp_result->category_id;
$serp_url->category_name = $news_serp_result->category_name;
$serp_url->news_serp_result_id = $news_serp_result->id;
}
@@ -85,20 +120,47 @@ public static function handle(NewsSerpResult $news_serp_result, $serp_counts = 1
$serp_url->country_iso = $news_serp_result->serp_country_iso;
if (! is_empty($serp_item->title)) {
$serp_url->title = $serp_item->title;
$serp_url->title = remove_newline($serp_item->title);
}
if (! is_empty($serp_item->snippet)) {
$serp_url->description = $serp_item->snippet;
$serp_url->description = remove_newline($serp_item->snippet);
}
if ($serp_url->isDirty()) {
$serp_url->serp_at = $news_serp_result->category->serp_at;
$serp_url->serp_at = now();
}
if ((isset($serp_item->timestamp)) && (! is_empty($serp_item->timestamp))) {
$serp_url->url_posted_at = Carbon::parse($serp_item->timestamp);
} else {
$serp_url->url_posted_at = now();
}
if ($serp_url->save()) {
$success = true;
}
$serp_titles[$serp_url->id] = $serp_url->title;
}
$ids_response = OpenAI::topTitlePicksById(json_encode($serp_titles));
if (isset($ids_response->output->ids)) {
$service_cost_usage = new ServiceCostUsage;
$service_cost_usage->cost = $ids_response->cost;
$service_cost_usage->name = 'openai-topTitlePicksById';
$service_cost_usage->reference_1 = 'news_serp_result';
$service_cost_usage->reference_2 = strval($news_serp_result->id);
$service_cost_usage->output = $ids_response;
$service_cost_usage->save();
$selected_serp_urls = SerpUrl::whereIn('id', $ids_response->output->ids)->update(['picked' => true]);
foreach ($ids_response->output->ids as $id) {
IdentifyCrawlSourcesJob::dispatch($id)->onQueue('default')->onConnection('default');
}
}
}

View File

@@ -9,12 +9,19 @@
class PublishIndexPostTask
{
public static function handle(Post $post)
public static function handle(int $post_id)
{
$post->published_at = now();
$post = Post::find($post_id);
if (is_null($post))
{
return ;
}
$post->status = 'publish';
if ($post->save()) {
if (app()->environment() == 'production') {
if ((app()->environment() == 'production') && (config('platform.global.indexing'))) {
$post_url = route('front.post', ['slug' => $post->slug, 'category_slug' => $post->category->slug]);
try {

View File

@@ -0,0 +1,76 @@
<?php
namespace App\Jobs\Tasks;
use App\Models\Post;
use App\Notifications\PostIncomplete;
use Notification;
class SchedulePublishTask
{
    /** Attributes that must be filled in before a post may be scheduled. */
    private const REQUIRED_FIELDS = [
        'title',
        'slug',
        'main_keyword',
        'keywords',
        'bites',
        'featured_image',
        'body',
        'metadata',
    ];

    /**
     * Assign a publish time and a status to a post.
     *
     * Scheduling rules:
     * - no other post has a published_at value: publish time is now;
     * - the latest published_at among other posts is in the past: now plus 40-60 minutes;
     * - the latest published_at among other posts is in the future: that time plus 40-60 minutes.
     *
     * Posts whose status is not future/draft/publish are ignored. Posts with a missing
     * required attribute are not scheduled; a PostIncomplete notification is sent instead.
     *
     * @param mixed  $post_id     Primary key of the post to schedule.
     * @param string $post_status Status stored on the post together with the publish time.
     */
    public static function handle($post_id, $post_status = 'publish')
    {
        sleep(2);

        $post = Post::find($post_id);
        if (is_null($post)) {
            return;
        }

        if (! in_array($post->status, ['future', 'draft', 'publish'], true)) {
            return;
        }

        foreach (self::REQUIRED_FIELDS as $field) {
            if (is_empty($post->{$field})) {
                Notification::route(get_notification_channel(), get_notification_user_id())->notify(new PostIncomplete($post));

                return;
            }
        }

        // Latest publish time among all other posts that have one.
        $latest_published_post = Post::where('id', '!=', $post_id)
            ->whereNotNull('published_at')
            ->orderBy('published_at', 'DESC')
            ->first();

        if (is_null($latest_published_post)) {
            $post->published_at = now();
        } else {
            // Start from now when the latest slot has already passed, otherwise queue up behind it.
            $new_time = $latest_published_post->published_at->lt(now())
                ? now()
                : clone $latest_published_post->published_at;

            // Space posts 40 to 60 minutes apart.
            $new_time->addMinutes(random_int(40, 60));
            $post->published_at = $new_time;
        }

        // A former `$post->published_at->subDays(1)` call was dropped here: it changed a temporary
        // date object handed out by the model and never altered the value that gets saved.
        $post->status = $post_status;
        $post->save();
    }
}

View File

@@ -0,0 +1,140 @@
<?php
namespace App\Jobs\Tasks;
use App\Helpers\FirstParty\OpenAI\OpenAI;
use App\Jobs\FillPostMetadataJob;
use App\Models\Post;
use App\Models\SerpUrl;
use App\Models\SerpUrlResearch;
use App\Models\ServiceCostUsage;
use Exception;
use Mis3085\Tiktoken\Facades\Tiktoken;
class WriteWithAITask
{
    /** Token budget for the research text sent to OpenAI. */
    private const MAX_PROMPT_TOKENS = 4096;

    /**
     * Build a prompt from the crawled research of a SerpUrl, let OpenAI write an article
     * from it, store the result as a post and queue the metadata step.
     *
     * @param int $serp_url_id Primary key of the SerpUrl to write about.
     *
     * @throws Exception When there is no research text to write from, or OpenAI returns no output.
     */
    public static function handle(int $serp_url_id)
    {
        $serp_url = SerpUrl::find($serp_url_id);
        if (is_null($serp_url)) {
            return;
        }

        $serp_url_researches = SerpUrlResearch::where('serp_url_id', $serp_url->id)
            ->where('content', '!=', 'EMPTY CONTENT')
            ->whereNotNull('content')
            ->get();

        $user_prompt = '';
        $total_tokens = 0;

        foreach ($serp_url_researches as $serp_url_research) {
            $sentences = self::markdownToSentences($serp_url_research->content);

            foreach ($sentences as $key => $sentence) {
                if ($key == 0) {
                    $user_prompt .= "ARTICLE:\n";
                }

                $current_tokens = Tiktoken::count($sentence);

                // Stop collecting text altogether once the budget would be exceeded.
                if ($current_tokens + $total_tokens > self::MAX_PROMPT_TOKENS) {
                    break 2;
                }

                $user_prompt .= $sentence."\n";
                $total_tokens += $current_tokens;
            }

            $user_prompt .= "\n\n";
        }

        // Without any research text the model would have nothing to base the article on.
        if ($total_tokens === 0) {
            throw new Exception('No research content available to write from');
        }

        $ai_writeup_response = OpenAI::writeArticle($user_prompt, 1536, 30);

        if (! isset($ai_writeup_response->output) || is_empty($ai_writeup_response->output)) {
            throw new Exception('OpenAI failed to write');
        }

        $output = self::extractRemoveFirstHeading($ai_writeup_response->output);

        $service_cost_usage = new ServiceCostUsage;
        $service_cost_usage->cost = $ai_writeup_response->cost;
        $service_cost_usage->name = 'openai-writeArticle';
        $service_cost_usage->reference_1 = 'serp_url';
        $service_cost_usage->reference_2 = strval($serp_url->id);
        $service_cost_usage->output = $ai_writeup_response;
        $service_cost_usage->save();

        $post = Post::where('serp_url_id', $serp_url->id)->first();
        if (is_null($post)) {
            $post = new Post;
            $post->serp_url_id = $serp_url->id;
        }

        // Title preference: article heading, then first suggested heading, then the SERP title.
        if (! is_empty($output->title)) {
            $post->title = $output->title;
        } elseif (
            ! is_null($serp_url->suggestion_data)
            && isset($serp_url->suggestion_data->article_headings)
            && count($serp_url->suggestion_data->article_headings) > 0
        ) {
            $post->title = $serp_url->suggestion_data->article_headings[0];
        }

        if (is_empty($post->title)) {
            $post->title = $serp_url->title;
        }

        $post->slug = str_slug($post->title);
        $post->body = $output->content;
        // Summary and metadata are cleared so the metadata step regenerates them for the new body.
        $post->bites = null;
        $post->metadata = null;

        if ($post->save()) {
            FillPostMetadataJob::dispatch($post->id)->onQueue('default')->onConnection('default');
        }
    }

    /**
     * Split text into sentences at whitespace that follows '.', '!' or '?'.
     *
     * @return array<int, string> Sentences in order; empty when the input is empty.
     */
    private static function markdownToSentences(string $markdownContent): array
    {
        $sentences = preg_split('/(?<=[.!?])\s+|\z/', $markdownContent, -1, PREG_SPLIT_NO_EMPTY);

        // preg_split() returns false on a PCRE error; treat that as "no sentences".
        return $sentences === false ? [] : $sentences;
    }

    /**
     * Take the first markdown heading out of the text and return it as the title.
     *
     * @return object Object with `title` ('' when no heading exists) and `content`.
     */
    private static function extractRemoveFirstHeading(string $markdownContent): object
    {
        // Heading marker and text must share one line: [ \t]* cannot run across a line break,
        // so a bare "#" line never takes the following paragraph as its title.
        $pattern = '/^#+[ \t]*(.+)$/m';

        if (preg_match($pattern, $markdownContent, $matches) !== 1) {
            // No heading found: the content is returned untouched.
            return (object) ['title' => '', 'content' => $markdownContent];
        }

        $updatedContent = preg_replace($pattern, '', $markdownContent, 1);

        return (object) [
            // trim() also removes the "\r" left behind by CRLF line endings.
            'title' => trim($matches[1]),
            'content' => trim((string) $updatedContent),
        ];
    }
}