Add (initial): futurewalker code
app/Jobs/Tasks/BrowseDFSForResearchTask.php (new file, 132 lines)
@@ -0,0 +1,132 @@
<?php

namespace App\Jobs\Tasks;

use App\Helpers\FirstParty\DFS\DFSSerp;
use App\Helpers\FirstParty\OpenAI\OpenAI;
use App\Helpers\ThirdParty\DFS\SettingSerpLiveAdvanced;
use App\Jobs\CrawlUrlResearchJob;
use App\Models\SerpUrl;
use App\Models\SerpUrlResearch;
use App\Models\ServiceCostUsage;
use Exception;

class BrowseDFSForResearchTask
{
    public static function handle(int $serp_url_id)
    {
        $serp_url = SerpUrl::find($serp_url_id);

        if ((! is_null($serp_url)) && (! is_null($serp_url->suggestion_data))) {
            if (isset($serp_url->suggestion_data->proposed_search_queries)) {
                if (count($serp_url->suggestion_data->proposed_search_queries) > 0) {
                    $search_query = $serp_url->suggestion_data->proposed_search_queries[0];

                    // $serp_model = new SettingSerpLiveAdvanced;

                    // $serp_model->setSe('google');
                    // $serp_model->setSeType('organic');
                    // $serp_model->setKeyword(strtolower($search_query));
                    // $serp_model->setLocationName('United States');
                    // //$serp_model->setDepth(100);
                    // $serp_model->setLanguageCode('en');
                    // $serp_res = $serp_model->getAsJson();

                    // print_r($serp_res);
                    // die();

                    $country_name = get_country_name_by_iso($serp_url->country_iso);

                    $serp_res = DFSSerp::liveAdvanced('google', 'news', $search_query, $country_name, 'en', 100);

                    try {
                        $serp_obj = json_decode($serp_res, false, 512, JSON_THROW_ON_ERROR);

                        if ($serp_obj?->status_code == 20000) {

                            $service_cost_usage = new ServiceCostUsage;
                            $service_cost_usage->cost = $serp_obj->cost;
                            $service_cost_usage->name = 'dataforseo-GoogleSerpApiAdvancedLiveNews';
                            $service_cost_usage->reference_1 = 'google';
                            $service_cost_usage->reference_2 = 'news';
                            $service_cost_usage->output = $serp_obj;
                            $service_cost_usage->input_1 = $country_name;
                            $service_cost_usage->input_2 = $search_query;
                            $service_cost_usage->save();

                            $results = $serp_obj?->tasks[0]?->result[0]?->items ?? [];

                            //$results = $serp_obj?->result[0]?->items;

                            // dump($serp_obj);
                            // exit();

                            $saved_count = 0;

                            $first_serp_url_research = null;

                            foreach ($results as $key => $result) {
                                if ($result->type == 'news_search') {
                                    $serp_url_research = SerpUrlResearch::where('url', $result->url)->where('serp_url_id', $serp_url_id)->first();

                                    if (is_null($serp_url_research)) {
                                        //dump($result->url);

                                        $serp_url_research = new SerpUrlResearch;
                                        $serp_url_research->serp_url_id = $serp_url_id;
                                        $serp_url_research->url = $result->url;
                                        $serp_url_research->query = $search_query;
                                        $serp_url_research->content = null;
                                        if ($serp_url_research->save()) {
                                            $saved_count++;
                                        }
                                    }
                                }
                                if ($saved_count >= 10) {
                                    break;
                                }
                            }

                            $first_serp_url_research = SerpUrlResearch::where('serp_url_id', $serp_url_id)->orderBy('created_at', 'ASC')->whereNull('content')->first();

                            // Only dispatch when there is actually a pending research row.
                            if (! is_null($first_serp_url_research)) {
                                CrawlUrlResearchJob::dispatch($first_serp_url_research->id)->onQueue('default')->onConnection('default');
                            }
                        }
                    } catch (Exception $e) {
                        throw $e;
                    }

                }
            }
        }
    }
}

// Example suggestion_data payload consumed above:
// {
//     "identified_keywords":[
//         "Humane AI Pin",
//         "costs",
//         "OpenAI",
//         "T-Mobile integration"
//     ],
//     "related_keywords":[
//         "artificial intelligence device",
//         "monthly subscription",
//         "OpenAI partnership",
//         "T-Mobile collaboration"
//     ],
//     "proposed_search_queries":[
//         "Humane AI Pin features",
//         "Cost of Humane AI Pin",
//         "Humane AI Pin integration with OpenAI and T-Mobile",
//         "Reviews of Humane AI Pin"
//     ],
//     "writing_tone":[
//         "engaging",
//         "informative"
//     ],
//     "article_headings":[
//         "Introduction to Humane AI Pin",
//         "Features of Humane AI Pin",
//         "Cost and Subscription Details",
//         "OpenAI and T-Mobile Integration"
//     ]
// }
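For context, this task is reached through a queued job: IdentifyCrawlSourcesTask later in this diff dispatches BrowseDFSForResearchJob, whose implementation is not part of the commit. A minimal sketch of what such a wrapper presumably looks like, assuming standard Laravel queued-job boilerplate (the class body below is hypothetical):

```php
<?php

namespace App\Jobs;

use App\Jobs\Tasks\BrowseDFSForResearchTask;
use Illuminate\Bus\Queueable;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Bus\Dispatchable;

// Hypothetical sketch; the real BrowseDFSForResearchJob is not shown in this commit.
class BrowseDFSForResearchJob implements ShouldQueue
{
    use Dispatchable, Queueable;

    public function __construct(public int $serp_url_id)
    {
    }

    public function handle(): void
    {
        // Delegate straight to the task, mirroring the task/job split used throughout this diff.
        BrowseDFSForResearchTask::handle($this->serp_url_id);
    }
}
```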
@@ -2,46 +2,59 @@
 
 namespace App\Jobs\Tasks;
 
+use App\Helpers\FirstParty\DFS\DFSSerp;
 use App\Helpers\FirstParty\OSSUploader\OSSUploader;
 use App\Helpers\ThirdParty\DFS\SettingSerpLiveAdvanced;
 use App\Models\Category;
 use App\Models\NewsSerpResult;
 use App\Models\ServiceCostUsage;
 use DFSClientV3\DFSClient;
 use Exception;
 use Illuminate\Support\Facades\Log;
 
-class GetNewsSerpTask
+class BrowseDFSLatestNewsTask
 {
-    public static function handle(Category $category, $country_iso)
+    public static function handle(string $keyword, $country_iso)
     {
         $country_name = get_country_name_by_iso($country_iso);
 
-        $keyword = strtolower("{$category->name}");
-        $client = new DFSClient(
-            config('dataforseo.login'),
-            config('dataforseo.password'),
-            config('dataforseo.timeout'),
-            config('dataforseo.api_version'),
-            config('dataforseo.url'),
-        );
+        // $client = new DFSClient(
+        //     config('dataforseo.login'),
+        //     config('dataforseo.password'),
+        //     config('dataforseo.timeout'),
+        //     config('dataforseo.api_version'),
+        //     config('dataforseo.url'),
+        // );
 
-        // You will receive SERP data specific to the indicated keyword, search engine, and location parameters
-        $serp_model = new SettingSerpLiveAdvanced();
-        $serp_model->setSe('google');
-        $serp_model->setSeType('news');
-        $serp_model->setKeyword($keyword);
-        $serp_model->setLocationName($country_name);
-        $serp_model->setDepth(100);
-        $serp_model->setLanguageCode('en');
-        $serp_res = $serp_model->getAsJson();
+        // // You will receive SERP data specific to the indicated keyword, search engine, and location parameters
+        // $serp_model = new SettingSerpLiveAdvanced();
+        // $serp_model->setSe('google');
+        // $serp_model->setSeType('news');
+        // $serp_model->setSearchParam('&tbs=qdr:d');
+        // $serp_model->setKeyword($keyword);
+        // $serp_model->setLocationName($country_name);
+        // $serp_model->setDepth(100);
+        // $serp_model->setLanguageCode('en');
+        // $serp_res = $serp_model->getAsJson();
+
+        $serp_res = DFSSerp::liveAdvanced('google', 'news', $keyword, $country_name, 'en', 100, '&tbs=qdr:d');
 
         try {
             $serp_obj = json_decode($serp_res, false, 512, JSON_THROW_ON_ERROR);
 
             if ($serp_obj?->status_code == 20000) {
-                $json_file_name = config('platform.dataset.news.news_serp.file_prefix').str_slug($category->name).'-'.epoch_now_timestamp().'.json';
-
                 $service_cost_usage = new ServiceCostUsage;
                 $service_cost_usage->cost = $serp_obj->cost;
                 $service_cost_usage->name = 'dataforseo-GoogleSerpApiAdvancedLiveNews';
                 $service_cost_usage->reference_1 = 'google';
                 $service_cost_usage->reference_2 = 'news';
                 $service_cost_usage->output = $serp_obj;
                 $service_cost_usage->input_1 = $country_name;
                 $service_cost_usage->input_2 = $keyword;
                 $service_cost_usage->save();
 
+                $json_file_name = config('platform.dataset.news.news_serp.file_prefix').str_slug($keyword).'-'.epoch_now_timestamp().'.json';
+
                 $upload_status = OSSUploader::uploadJson(
                     config('platform.dataset.news.news_serp.driver'),
@@ -50,9 +63,8 @@ public static function handle(Category $category, $country_iso)
                     $serp_obj);
 
         if ($upload_status) {
-
             $news_serp_result = new NewsSerpResult;
             $news_serp_result->category_id = $category->id;
             $news_serp_result->category_name = $category->name;
             $news_serp_result->serp_provider = 'dfs';
             $news_serp_result->serp_se = 'google';
             $news_serp_result->serp_se_type = 'news';
@@ -62,10 +74,7 @@ public static function handle(Category $category, $country_iso)
             $news_serp_result->result_count = $serp_obj?->tasks[0]?->result[0]?->items_count;
             $news_serp_result->filename = $json_file_name;
             $news_serp_result->status = 'initial';
-            if ($news_serp_result->save()) {
-                $category->serp_at = now();
-                $category->save();
-            }
+            $news_serp_result->save();
 
             return $news_serp_result;
         } else {
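Both call sites in this diff replace the hand-built SettingSerpLiveAdvanced flow with a single DFSSerp::liveAdvanced() call. The helper itself is not included in the commit; judging from the two call sites and the commented-out setter sequences, it plausibly wraps the builder like this (a sketch under that assumption, not the actual implementation):

```php
<?php

namespace App\Helpers\FirstParty\DFS;

use App\Helpers\ThirdParty\DFS\SettingSerpLiveAdvanced;

// Hypothetical reconstruction of the first-party wrapper used above.
class DFSSerp
{
    public static function liveAdvanced(
        string $se,
        string $se_type,
        string $keyword,
        string $location_name,
        string $language_code,
        int $depth,
        ?string $search_param = null
    ): string {
        $serp_model = new SettingSerpLiveAdvanced();
        $serp_model->setSe($se);
        $serp_model->setSeType($se_type);
        $serp_model->setKeyword(strtolower($keyword));
        $serp_model->setLocationName($location_name);
        $serp_model->setLanguageCode($language_code);
        $serp_model->setDepth($depth);

        // e.g. '&tbs=qdr:d' to restrict Google News results to the past day.
        if (! is_null($search_param)) {
            $serp_model->setSearchParam($search_param);
        }

        return $serp_model->getAsJson();
    }
}
```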
app/Jobs/Tasks/CrawlUrlResearchTask.php (new file, 207 lines)
@@ -0,0 +1,207 @@
<?php

namespace App\Jobs\Tasks;

use App\Jobs\CrawlUrlResearchJob;
use App\Jobs\WriteWithAIJob;
use App\Models\SerpUrl;
use App\Models\SerpUrlResearch;
use Exception;
use fivefilters\Readability\Configuration as ReadabilityConfiguration;
use fivefilters\Readability\ParseException as ReadabilityParseException;
use fivefilters\Readability\Readability;
use Illuminate\Support\Facades\Http;
use League\HTMLToMarkdown\HtmlConverter;
use Symfony\Component\DomCrawler\Crawler;

class CrawlUrlResearchTask
{
    public static function handle(int $serp_url_research_id)
    {
        $serp_url_research = SerpUrlResearch::find($serp_url_research_id);

        if (is_null($serp_url_research)) {
            return null;
        }

        try {
            $user_agent = config('platform.proxy.user_agent');

            $response = Http::withHeaders([
                'User-Agent' => $user_agent,
            ])
                ->withOptions([
                    'proxy' => get_smartproxy_rotating_server(),
                    'timeout' => 10,
                    'verify' => false,
                ])
                ->get($serp_url_research->url);

            if ($response->successful()) {
                $raw_html = $response->body();
                $costs['unblocker'] = calculate_smartproxy_cost(round(strlen($raw_html) / 1024, 2), 'rotating_global');
            } else {
                $raw_html = null;
                $response->throw();
            }

        } catch (Exception $e) {
            $raw_html = null;
            //throw $e;
        }

        if (! is_empty($raw_html)) {
            //dump(self::getMarkdownFromHtml($raw_html));

            $serp_url_research->content = self::getMarkdownFromHtml($raw_html);
            $serp_url_research->main_image = self::getMainImageFromHtml($raw_html);

            //dump($serp_url_research->content);
        } else {
            $serp_url_research->content = 'EMPTY CONTENT';
        }

        $serp_url_research->save();

        $completed_serp_url_researches_counts = SerpUrlResearch::where('serp_url_id', $serp_url_research->serp_url_id)->where('content', '!=', 'EMPTY CONTENT')->whereNotNull('content')->count();

        if ($completed_serp_url_researches_counts >= 3) {
            $serp_url = SerpUrl::find($serp_url_research->serp_url_id);

            if (! is_null($serp_url)) {
                $serp_url->crawled = true;
                $serp_url->save();

                WriteWithAIJob::dispatch($serp_url->id)->onQueue('default')->onConnection('default');
            }
        } else {
            $next_serp_url_research = SerpUrlResearch::where('serp_url_id', $serp_url_research->serp_url_id)->whereNull('content')->first();

            if (! is_null($next_serp_url_research)) {
                CrawlUrlResearchJob::dispatch($next_serp_url_research->id)->onQueue('default')->onConnection('default');
            }

        }
    }

    private static function getMainImageFromHtml($html)
    {
        $r_configuration = new ReadabilityConfiguration();
        $r_configuration->setCharThreshold(20);

        $readability = new Readability($r_configuration);

        try {
            $readability->parse($html);

            return $readability->getImage();
            //dd($readability);
        } catch (ReadabilityParseException $e) {
        }

        return null;
    }

    private static function getMarkdownFromHtml($html)
    {
        $converter = new HtmlConverter([
            'strip_tags' => true,
            'strip_placeholder_links' => true,
        ]);

        $html = self::cleanHtml($html);

        $markdown = $converter->convert($html);

        //dd($markdown);

        $markdown = self::reverseLTGT($markdown);

        $markdown = self::normalizeNewLines($markdown);

        $markdown = self::removeDuplicateLines($markdown);

        return html_entity_decode(markdown_to_plaintext($markdown));
    }

    private static function reverseLTGT($input)
    {
        // Turn escaped angle brackets back into literal characters.
        $output = str_replace('&lt;', '<', $input);
        $output = str_replace('&gt;', '>', $output);

        return $output;
    }

    private static function removeDuplicateLines($string)
    {
        $lines = explode("\n", $string);
        $uniqueLines = array_unique($lines);

        return implode("\n", $uniqueLines);
    }

    private static function normalizeNewLines($content)
    {
        // Split the content by lines
        $lines = explode("\n", $content);

        $processedLines = [];

        for ($i = 0; $i < count($lines); $i++) {
            $line = trim($lines[$i]);

            // If the line is an image markdown
            if (preg_match("/^!\[.*\]\(.*\)$/", $line)) {
                // And if the next line is not empty and not another markdown structure
                if (isset($lines[$i + 1]) && ! empty(trim($lines[$i + 1])) && ! preg_match('/^[-=#*&_]+$/', trim($lines[$i + 1]))) {
                    $line .= ' '.trim($lines[$i + 1]);
                    $i++; // Skip the next line as we're merging it
                }
            }

            // Add line to processedLines if it's not empty
            if (! empty($line)) {
                $processedLines[] = $line;
            }
        }

        // Collapse excessive newlines
        $result = preg_replace("/\n{3,}/", "\n\n", implode("\n", $processedLines));

        // Detect and replace the pattern
        $result = preg_replace('/^(!\[.*?\]\(.*?\))\s*\n\s*([^\n!]+)/m', '$1 $2', $result);

        // Replace multiple spaces with a dash separator
        $result = preg_replace('/ {2,}/', ' - ', $result);

        return $result;
    }

    private static function cleanHtml($htmlContent)
    {
        $crawler = new Crawler($htmlContent);

        // Define tags to remove completely
        $tagsToRemove = ['script', 'style', 'svg', 'picture', 'form', 'footer', 'nav', 'aside'];

        foreach ($tagsToRemove as $tag) {
            $crawler->filter($tag)->each(function ($node) {
                foreach ($node as $child) {
                    $child->parentNode->removeChild($child);
                }
            });
        }

        // Replace <span> tags with their inner content
        $crawler->filter('span')->each(function ($node) {
            $replacement = new \DOMText($node->text());

            foreach ($node as $child) {
                $child->parentNode->replaceChild($replacement, $child);
            }
        });

        return $crawler->outerHtml();
    }
}
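The extraction pipeline above is the standard fivefilters/readability.php plus league/html-to-markdown combination. An illustrative stand-alone run of the same flow on a small HTML string (not part of the commit):

```php
<?php

use fivefilters\Readability\Configuration;
use fivefilters\Readability\Readability;
use League\HTMLToMarkdown\HtmlConverter;

$html = '<html><body><article><h1>Title</h1><p>First paragraph.</p></article></body></html>';

// Extract the main article node, the same parser getMainImageFromHtml() uses for images.
$readability = new Readability(new Configuration());
$readability->parse($html);

// Convert the extracted HTML to markdown, as getMarkdownFromHtml() does.
$converter = new HtmlConverter(['strip_tags' => true]);

echo $converter->convert($readability->getContent());
```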
app/Jobs/Tasks/FillPostMetadataTask.php (new file, 267 lines)
@@ -0,0 +1,267 @@
<?php

namespace App\Jobs\Tasks;

use App\Helpers\FirstParty\OpenAI\OpenAI;
use App\Helpers\FirstParty\OSSUploader\OSSUploader;
use App\Jobs\SchedulePublishPost;
use App\Models\Category;
use App\Models\Entity;
use App\Models\Post;
use App\Models\PostCategory;
use App\Models\PostEntity;
use App\Models\SerpUrlResearch;
use App\Models\ServiceCostUsage;
use Exception;
use Illuminate\Support\Facades\Http;
use Image;

class FillPostMetadataTask
{
    public static function handle(int $post_id)
    {
        $post = Post::find($post_id);

        if (is_null($post)) {
            return;
        }

        if (! is_null($post->metadata)) {
            $post_meta_response = $post->metadata;
        } else {
            $post_meta_response = OpenAI::getArticleMeta($post->body, 1536, 30);

            if ((isset($post_meta_response->output)) && (! is_null($post_meta_response->output))) {
                $service_cost_usage = new ServiceCostUsage;
                $service_cost_usage->cost = $post_meta_response->cost;
                $service_cost_usage->name = 'openai-getArticleMeta';
                $service_cost_usage->reference_1 = 'post';
                $service_cost_usage->reference_2 = strval($post->id);
                $service_cost_usage->output = $post_meta_response;
                $service_cost_usage->save();
            }
        }

        //dump($post_meta_response);

        if ((isset($post_meta_response->output)) && (! is_null($post_meta_response->output))) {

            $post->metadata = $post_meta_response;

            if (isset($post_meta_response->output->keywords)) {
                if (count($post_meta_response->output->keywords) > 0) {
                    $post->keywords = $post_meta_response->output->keywords;

                    $post->main_keyword = $post_meta_response->output->keywords[0];
                }
            }

            if (isset($post_meta_response->output->title)) {
                if ((is_empty($post->title)) && (! is_empty($post_meta_response->output->title))) {
                    $post->title = $post_meta_response->output->title;
                }
            }

            if (isset($post_meta_response->output->summary)) {
                if (! is_empty($post_meta_response->output->summary)) {
                    $post->bites = $post_meta_response->output->summary;
                }
            }

        }

        if (is_empty($post->slug)) {
            $post->slug = str_slug($post->title);
        }

        if (is_empty($post->featured_image)) {
            $post = self::setPostImage($post);
        }

        if (isset($post_meta_response->output->society_impact)) {
            if (! is_empty($post_meta_response->output->society_impact)) {
                $post->society_impact = $post_meta_response->output->society_impact;
            }
        }

        if (isset($post_meta_response->output->society_impact_level)) {
            if (! is_empty($post_meta_response->output->society_impact_level)) {
                $post->society_impact_level = $post_meta_response->output->society_impact_level;
            }
        }

        if ($post->save()) {

            // Set Category
            $category_name = 'Updates';

            if ((isset($post_meta_response->output->category)) && (! is_empty($post_meta_response->output->category))) {
                $category_name = $post_meta_response?->output?->category;
            }

            $category = Category::where('name', $category_name)->first();

            if (is_null($category)) {
                $category = Category::where('name', 'Updates')->first();
            }

            // Set Post Category
            $post_category = PostCategory::where('post_id', $post->id)->first();

            if (is_null($post_category)) {
                $post_category = new PostCategory;
                $post_category->post_id = $post->id;
            }
            $post_category->category_id = $category->id;

            $post_category->save();

            // Set Post Entities
            if (isset($post_meta_response->output->entities)) {
                $entity_names = [];

                if (is_array($post_meta_response->output->entities)) {
                    $entity_names = $post_meta_response->output->entities;
                }

                if (count($entity_names) > 0) {
                    $previous_post_entities = PostEntity::where('post_id', $post->id)->delete();

                    foreach ($entity_names as $entity_name) {
                        $entity_name = trim($entity_name);

                        $entity = Entity::where('name', $entity_name)->first();

                        if (is_null($entity)) {
                            $entity = new Entity;
                            $entity->name = $entity_name;
                            $entity->slug = str_slug($entity_name);
                            $entity->save();
                        }

                        $post_entity = PostEntity::where('post_id', $post->id)
                            ->where('entity_id', $entity->id)
                            ->first();

                        if (is_null($post_entity)) {
                            $post_entity = new PostEntity;
                            $post_entity->post_id = $post->id;
                            $post_entity->entity_id = $entity->id;
                            $post_entity->save();
                        }
                    }
                }
            }

            // Set Schedule Publish
            SchedulePublishPost::dispatch($post->id, 'future')->onQueue('default')->onConnection('default');
        }

    }

    private static function setPostImage($post)
    {
        $serp_url_researches = SerpUrlResearch::where('serp_url_id', $post->serp_url_id)->get();

        $main_image_url = null;

        foreach ($serp_url_researches as $serp_url_research) {
            if (! is_empty($serp_url_research->main_image)) {
                if (is_valid_url($serp_url_research->main_image)) {

                    $main_image_url = $serp_url_research->main_image;

                    $image_response = Http::timeout(300)->withHeaders([
                        'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
                    ])->get($main_image_url);

                    $image_content = $image_response->body();

                    // Get the size of the image content in KB
                    $imageSizeInKb = strlen($image_response->body()) / 1024;

                    // Skip this iteration if the image exceeds the maximum size
                    if ($imageSizeInKb > 1024) {
                        continue;
                    }

                    $canvas_width = 1080;
                    $canvas_height = 608;

                    $thumb_width = 540;
                    $thumb_height = 78;

                    // Create an image from the fetched content
                    $image = Image::make($image_content);

                    // Check if the image is wider than it is tall (landscape orientation)
                    if ($image->width() > $image->height()) {
                        // Resize the image to fill the canvas width while maintaining aspect ratio
                        $image->resize($canvas_width, null, function ($constraint) {
                            $constraint->aspectRatio();
                        });
                    } else {
                        // Resize the image to fill the canvas height while maintaining aspect ratio
                        $image->resize(null, $canvas_height, function ($constraint) {
                            $constraint->aspectRatio();
                        });
                    }

                    // Fit the image to the canvas size, without gaps
                    $image->fit($canvas_width, $canvas_height, function ($constraint) {
                        $constraint->upsize();
                    });

                    // Create a thumbnail by cloning the original image and resizing it
                    $thumb = clone $image;
                    $thumb->resize($thumb_width, $thumb_height, function ($constraint) {
                        $constraint->aspectRatio();
                        $constraint->upsize();
                    });

                    // Create an image filename
                    $epoch_now_timestamp = epoch_now_timestamp();
                    $filename = $post->slug.'-'.$epoch_now_timestamp.'.jpg';
                    $thumb_filename = $post->slug.'-'.$epoch_now_timestamp.'_thumb.jpg';

                    OSSUploader::uploadFile('r2', 'post_images_2/', $filename, (string) $image->stream('jpeg', 75));
                    OSSUploader::uploadFile('r2', 'post_images_2/', $thumb_filename, (string) $thumb->stream('jpeg', 50));

                    $post->featured_image = 'post_images_2/'.$filename;

                    $image->destroy();

                    break;
                }
            }
        }

        return $post;
    }
}
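The image handling above matches the Intervention Image v2 API (an `Image` facade with make()/fit()/resize()/stream()). A condensed sketch of the same cover treatment, assuming that library and a local `example.jpg` (filenames here are illustrative):

```php
<?php

use Intervention\Image\Facades\Image;

// 1080x608 cover plus a thumbnail, mirroring the constants used above.
$image = Image::make(file_get_contents('example.jpg'));

$image->fit(1080, 608, function ($constraint) {
    $constraint->upsize(); // never scale the source up
});

$thumb = clone $image;
$thumb->resize(540, null, function ($constraint) {
    $constraint->aspectRatio();
});

file_put_contents('cover.jpg', (string) $image->stream('jpeg', 75));
file_put_contents('cover_thumb.jpg', (string) $thumb->stream('jpeg', 50));
```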
@@ -94,7 +94,7 @@ public static function handle($post)
             }
         }
 
         //return $news_serp_result;
         } else {
             throw new Exception('Uploading failed', 1);
         }
 
app/Jobs/Tasks/IdentifyCrawlSourcesTask.php (new file, 54 lines)
@@ -0,0 +1,54 @@
<?php

namespace App\Jobs\Tasks;

use App\Helpers\FirstParty\OpenAI\OpenAI;
use App\Jobs\BrowseDFSForResearchJob;
use App\Models\SerpUrl;
use App\Models\ServiceCostUsage;

class IdentifyCrawlSourcesTask
{
    public static function handle(int $serp_url_id)
    {
        $serp_url = SerpUrl::find($serp_url_id);

        if (! is_null($serp_url)) {

            $attempt = 0;
            $maxAttempts = 3;
            $suggestion_response = null;

            while ($attempt < $maxAttempts && ($suggestion_response === null || $suggestion_response->output === null)) {
                $suggestion_response = OpenAI::titleSuggestions($serp_url->title);

                //dump($suggestion_response);

                $service_cost_usage = new ServiceCostUsage;
                $service_cost_usage->cost = $suggestion_response->cost;
                $service_cost_usage->name = 'openai-titleSuggestions';
                $service_cost_usage->reference_1 = 'serp_url';
                $service_cost_usage->reference_2 = strval($serp_url->id);
                $service_cost_usage->output = $suggestion_response;
                $service_cost_usage->save();

                $attempt++;

                // If the output is not null, break out of the loop
                if ($suggestion_response !== null && $suggestion_response->output !== null) {
                    break;
                }

                // Optional: sleep for a bit before retrying
                sleep(1); // sleep for 1 second
            }

            if (! is_null($suggestion_response->output)) {
                $serp_url->suggestion_data = $suggestion_response->output;
                if ($serp_url->save()) {
                    BrowseDFSForResearchJob::dispatch($serp_url_id)->onQueue('default')->onConnection('default');
                }
            }
        }
    }
}
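The retry loop above is a general pattern: call the API, record the cost, retry while the output is null, back off briefly. The same logic as a reusable helper (an illustrative sketch, not part of the commit; the function name is hypothetical):

```php
<?php

// Hypothetical helper: retry a callable until it yields a non-null ->output.
function retry_for_output(callable $call, int $maxAttempts = 3, int $sleepSeconds = 1): ?object
{
    $response = null;

    for ($attempt = 0; $attempt < $maxAttempts; $attempt++) {
        $response = $call();

        if (! is_null($response?->output)) {
            break;
        }

        sleep($sleepSeconds);
    }

    return $response;
}

// Usage, mirroring the task above:
// $suggestion_response = retry_for_output(fn () => OpenAI::titleSuggestions($serp_url->title));
```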
@@ -2,15 +2,19 @@
 
 namespace App\Jobs\Tasks;
 
+use App\Helpers\FirstParty\OpenAI\OpenAI;
 use App\Helpers\FirstParty\OSSUploader\OSSUploader;
+use App\Jobs\IdentifyCrawlSourcesJob;
 use App\Models\Category;
 use App\Models\NewsSerpResult;
+use App\Models\SerpUrl;
 use App\Models\ServiceCostUsage;
+use Carbon\Carbon;
 use Exception;
 
-class ParseNewsSerpDomainsTask
+class ParseDFSNewsTask
 {
-    public static function handle(NewsSerpResult $news_serp_result, $serp_counts = 1)
+    public static function handle(NewsSerpResult $news_serp_result, $serp_counts = 100)
     {
         //dd($news_serp_result->category->serp_at);
 
@@ -35,16 +39,47 @@ public static function handle(NewsSerpResult $news_serp_result, $serp_counts = 1
 
         foreach ($serp_results as $serp_item) {
 
             if ($serp_item->type != 'news_search') {
                 continue;
             }
 
             //dump($serp_item);
 
             if (is_empty($serp_item->url)) {
                 continue;
             }
 
             // if (!str_contains($serp_item->time_published, "hours"))
             // {
             //     continue;
             // }
+            $blacklist_keywords = config('platform.global.blacklist_keywords_serp');
+
+            $blacklist_domains = config('platform.global.blacklist_domains_serp');
+
+            $skipItem = false;
+
+            foreach ($blacklist_domains as $domain) {
+                if (str_contains($serp_item->domain, $domain)) {
+                    $skipItem = true;
+                    break;
+                }
+            }
+
+            if (! $skipItem) {
+                $title = strtolower($serp_item->title);
+                $snippet = strtolower($serp_item->snippet);
+
+                // Check if any unwanted word is in the title or snippet
+
+                foreach ($blacklist_keywords as $word) {
+                    if (strpos($title, $word) !== false || strpos($snippet, $word) !== false) {
+                        $skipItem = true;
+                        break; // Break the inner loop as we found an unwanted word
+                    }
+                }
+            }
+
+            // Skip this iteration if an unwanted word was found
+            if ($skipItem) {
+                continue;
+            }
 
             $serp_url = SerpUrl::where('url', $serp_item->url)->first();
 
@@ -69,14 +104,14 @@ public static function handle(NewsSerpResult $news_serp_result, $serp_counts = 1
 
         //dd($valid_serps);
 
         $serp_titles = [];
 
         foreach ($valid_serps as $serp_item) {
 
             //dd($serp_item);
             $serp_url = SerpUrl::where('url', self::normalizeUrl($serp_item->url))->first();
 
             if (is_null($serp_url)) {
                 $serp_url = new SerpUrl;
                 $serp_url->category_id = $news_serp_result->category_id;
                 $serp_url->category_name = $news_serp_result->category_name;
                 $serp_url->news_serp_result_id = $news_serp_result->id;
             }
 
@@ -85,20 +120,47 @@ public static function handle(NewsSerpResult $news_serp_result, $serp_counts = 1
             $serp_url->country_iso = $news_serp_result->serp_country_iso;
 
             if (! is_empty($serp_item->title)) {
-                $serp_url->title = $serp_item->title;
+                $serp_url->title = remove_newline($serp_item->title);
             }
 
             if (! is_empty($serp_item->snippet)) {
-                $serp_url->description = $serp_item->snippet;
+                $serp_url->description = remove_newline($serp_item->snippet);
             }
 
             if ($serp_url->isDirty()) {
-                $serp_url->serp_at = $news_serp_result->category->serp_at;
+                $serp_url->serp_at = now();
             }
 
+            if ((isset($serp_item->timestamp)) && (! is_empty($serp_item->timestamp))) {
+                $serp_url->url_posted_at = Carbon::parse($serp_item->timestamp);
+            } else {
+                $serp_url->url_posted_at = now();
+            }
+
             if ($serp_url->save()) {
                 $success = true;
             }
+            $serp_titles[$serp_url->id] = $serp_url->title;
 
         }
 
+        $ids_response = OpenAI::topTitlePicksById(json_encode($serp_titles));
+
+        if (isset($ids_response->output->ids)) {
+
+            $service_cost_usage = new ServiceCostUsage;
+            $service_cost_usage->cost = $ids_response->cost;
+            $service_cost_usage->name = 'openai-topTitlePicksById';
+            $service_cost_usage->reference_1 = 'news_serp_result';
+            $service_cost_usage->reference_2 = strval($news_serp_result->id);
+            $service_cost_usage->output = $ids_response;
+            $service_cost_usage->save();
+
+            $selected_serp_urls = SerpUrl::whereIn('id', $ids_response->output->ids)->update(['picked' => true]);
+
+            foreach ($ids_response->output->ids as $id) {
+                IdentifyCrawlSourcesJob::dispatch($id)->onQueue('default')->onConnection('default');
+            }
+        }
+    }
 
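The added blacklist pass boils down to two substring scans over the SERP item. Extracted as a predicate for readability (illustrative only; the diff keeps it inline and the function name is hypothetical):

```php
<?php

// Hypothetical refactor of the inline checks above: true means skip the item.
function should_skip_serp_item(object $serp_item, array $blacklist_domains, array $blacklist_keywords): bool
{
    // Domain blacklist: substring match against the item's domain.
    foreach ($blacklist_domains as $domain) {
        if (str_contains($serp_item->domain, $domain)) {
            return true;
        }
    }

    // Keyword blacklist: case-insensitive match against title and snippet.
    $haystack = strtolower($serp_item->title.' '.$serp_item->snippet);

    foreach ($blacklist_keywords as $word) {
        if (str_contains($haystack, $word)) {
            return true;
        }
    }

    return false;
}
```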
@@ -9,12 +9,19 @@
 
 class PublishIndexPostTask
 {
-    public static function handle(Post $post)
+    public static function handle(int $post_id)
     {
-        $post->published_at = now();
+        $post = Post::find($post_id);
+
+        if (is_null($post)) {
+            return;
+        }
+
         $post->status = 'publish';
 
         if ($post->save()) {
-            if (app()->environment() == 'production') {
+            if ((app()->environment() == 'production') && (config('platform.global.indexing'))) {
                 $post_url = route('front.post', ['slug' => $post->slug, 'category_slug' => $post->category->slug]);
 
                 try {
 
app/Jobs/Tasks/SchedulePublishTask.php (new file, 76 lines)
@@ -0,0 +1,76 @@
<?php

namespace App\Jobs\Tasks;

use App\Models\Post;
use App\Notifications\PostIncomplete;
use Notification;

class SchedulePublishTask
{
    public static function handle($post_id, $post_status = 'publish')
    {
        sleep(2);

        $post = Post::find($post_id);

        if (is_null($post)) {
            return;
        }

        if (! in_array($post->status, ['future', 'draft', 'publish'])) {
            return;
        }

        if ((is_empty($post->title)) || (is_empty($post->slug)) || (is_empty($post->main_keyword)) || (is_empty($post->keywords)) || (is_empty($post->bites)) || (is_empty($post->featured_image)) || (is_empty($post->body)) || (is_empty($post->metadata))) {
            Notification::route(get_notification_channel(), get_notification_user_id())->notify(new PostIncomplete($post));

            return;
        }

        /*
        TODO:

        - To determine a published_at time, first check whether any post has an existing published_at date.

        - If there are no other posts except the current post, then the current post's published_at is now().

        - If there are other posts but all of their published_at values are null, then the current post's published_at is now().

        - If there are other posts with non-null published_at values:
          -- first find the latest published post (latest published_at);
          -- if the latest published_at datetime is before now, then published_at is null;
          -- if the latest published_at datetime is after now, then the current post's published_at should be 1 hour after the latest published_at.

        - The idea is that published posts, where found, should be spread out about an hour apart.
        */

        // Check if there are any other posts with a set published_at date
        $latest_published_post = Post::where('id', '!=', $post_id)->whereNotNull('published_at')->orderBy('published_at', 'DESC')->first();

        //dd($latest_published_post);

        if (is_null($latest_published_post)) {
            $post->published_at = now();
        } else {
            if ($latest_published_post->published_at->lt(now())) {
                $new_time = now();
            } else {
                $new_time = clone $latest_published_post->published_at;
            }

            $new_time->addMinutes(rand(40, 60));
            $post->published_at = $new_time;
        }

        // Carbon dates are mutable, so this shifts published_at back a day in place.
        $post->published_at->subDays(1);

        $post->status = $post_status; // Assuming you want to update the status to future
        $post->save();
    }
}
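A worked example of the scheduling arithmetic above, with fixed values in place of rand(40, 60) (an illustrative sketch, not part of the commit):

```php
<?php

use Carbon\Carbon;

$latest = Carbon::parse('2024-01-10 10:00:00'); // latest scheduled post
$now    = Carbon::parse('2024-01-10 08:00:00'); // pretend "now"

// Pick the later of the two as the base, as the task does.
$new_time = $latest->lt($now) ? $now->copy() : $latest->copy();

$new_time->addMinutes(50); // the task uses rand(40, 60)
$new_time->subDays(1);     // mirrors $post->published_at->subDays(1)

echo $new_time; // 2024-01-09 10:50:00
```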
app/Jobs/Tasks/WriteWithAITask.php (new file, 140 lines)
@@ -0,0 +1,140 @@
<?php

namespace App\Jobs\Tasks;

use App\Helpers\FirstParty\OpenAI\OpenAI;
use App\Jobs\FillPostMetadataJob;
use App\Models\Post;
use App\Models\SerpUrl;
use App\Models\SerpUrlResearch;
use App\Models\ServiceCostUsage;
use Exception;
use Mis3085\Tiktoken\Facades\Tiktoken;

class WriteWithAITask
{
    public static function handle(int $serp_url_id)
    {
        $serp_url = SerpUrl::find($serp_url_id);

        if (is_null($serp_url)) {
            return;
        }

        $serp_url_researches = SerpUrlResearch::where('serp_url_id', $serp_url->id)->where('content', '!=', 'EMPTY CONTENT')->whereNotNull('content')->get();

        $user_prompt = '';
        $total_tokens = 0;

        foreach ($serp_url_researches as $serp_url_research) {

            $sentences = self::markdownToSentences($serp_url_research->content);

            //dump($sentences);

            foreach ($sentences as $key => $sentence) {

                if ($key == 0) {
                    $user_prompt .= "ARTICLE:\n";
                }

                $current_tokens = Tiktoken::count($sentence);

                if ($current_tokens + $total_tokens > 4096) {
                    break 2;
                } else {
                    $user_prompt .= $sentence."\n";
                    $total_tokens += $current_tokens;
                }

            }
            $user_prompt .= "\n\n";
        }

        //dd($user_prompt);

        $ai_writeup_response = OpenAI::writeArticle($user_prompt, 1536, 30);

        //dd($ai_writeup_response);

        if ((isset($ai_writeup_response->output)) && (! is_empty($ai_writeup_response->output))) {
            $output = self::extractRemoveFirstHeading($ai_writeup_response->output);

            $service_cost_usage = new ServiceCostUsage;
            $service_cost_usage->cost = $ai_writeup_response->cost;
            $service_cost_usage->name = 'openai-writeArticle';
            $service_cost_usage->reference_1 = 'serp_url';
            $service_cost_usage->reference_2 = strval($serp_url->id);
            $service_cost_usage->output = $ai_writeup_response;
            $service_cost_usage->save();

            $post = Post::where('serp_url_id', $serp_url->id)->first();

            if (is_null($post)) {
                $post = new Post;
                $post->serp_url_id = $serp_url->id;
            }

            if (! is_empty($output->title)) {
                $post->title = $output->title;
            } else {

                if (! is_null($serp_url->suggestion_data)) {
                    if (isset($serp_url->suggestion_data->article_headings)) {
                        if (count($serp_url->suggestion_data->article_headings) > 0) {
                            $post->title = $serp_url->suggestion_data?->article_headings[0];
                        }
                    }
                }
            }

            if (is_empty($post->title)) {
                $post->title = $serp_url->title;
            }

            $post->slug = str_slug($post->title);

            $post->body = $output->content;

            $post->bites = null;
            $post->metadata = null;

            if ($post->save()) {
                FillPostMetadataJob::dispatch($post->id)->onQueue('default')->onConnection('default');
            }
        } else {
            throw new Exception('OpenAI failed to write');
        }
    }

    private static function markdownToSentences($markdownContent)
    {
        // Split the content on punctuation followed by a space or end of string
        $pattern = '/(?<=[.!?])\s+|\z/';

        // Split the content into sentences
        $sentences = preg_split($pattern, $markdownContent, -1, PREG_SPLIT_NO_EMPTY);

        // Return the array of sentences
        return $sentences;
    }

    private static function extractRemoveFirstHeading($markdownContent)
    {
        // Pattern to match the first markdown heading of any level
        $pattern = '/^(#+)\s*(.+)$/m';

        // Try to find the first heading
        if (preg_match($pattern, $markdownContent, $matches)) {
            $title = $matches[2]; // The first heading becomes the title

            // Remove the first heading from the content
            $updatedContent = preg_replace($pattern, '', $markdownContent, 1);

            return (object) ['title' => $title, 'content' => trim($updatedContent)];
        }

        // Return original content if no heading found
        return (object) ['title' => '', 'content' => $markdownContent];
    }
}
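The prompt assembly above greedily packs sentences under a 4096-token budget, counted with the Tiktoken facade the file already imports. The core of that loop in isolation (an illustrative sketch with toy input):

```php
<?php

use Mis3085\Tiktoken\Facades\Tiktoken;

$sentences = ['First source sentence.', 'Second source sentence.'];

$budget = 4096;
$used = 0;
$prompt = "ARTICLE:\n";

foreach ($sentences as $sentence) {
    $tokens = Tiktoken::count($sentence);

    // Stop once the next sentence would exceed the budget, as the task's `break 2` does.
    if ($used + $tokens > $budget) {
        break;
    }

    $prompt .= $sentence."\n";
    $used += $tokens;
}
```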