Add (scraper)
Update (ai): integrate scraper
This commit is contained in:
@@ -25,6 +25,21 @@ public static function readJson($storage_driver, $relative_directory, $filename)
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
 * Read the raw contents of a file from the given storage disk.
 *
 * @param string $storage_driver     Name of the configured filesystem disk (e.g. 'r2').
 * @param string $relative_directory Directory path relative to the disk root.
 * @param string $filename           File name within that directory.
 *
 * @return string|null The file contents, or null when the file is missing
 *                     or the disk read fails for any reason (best-effort read).
 */
public static function readFile($storage_driver, $relative_directory, $filename)
{
    // Normalise the path so a trailing slash on the directory never doubles up.
    $filepath = rtrim($relative_directory, '/').'/'.$filename;

    try {
        return Storage::disk($storage_driver)->get($filepath);
    } catch (\Exception $e) {
        // Treat any storage failure (missing file, bad disk, network) as "not found".
        return null;
    }
    // BUG FIX: removed the trailing `return null;` that followed the
    // try/catch — both branches already return, so it was unreachable.
}
|
||||
|
||||
|
||||
public static function uploadJson($storage_driver, $relative_directory, $filename, $jsonData)
|
||||
{
|
||||
$jsonString = json_encode($jsonData, JSON_PRETTY_PRINT);
|
||||
|
||||
@@ -45,6 +45,30 @@ public static function writeArticle($title, $description, $article_type, $min, $
|
||||
|
||||
}
|
||||
|
||||
/**
 * Ask the model to derive a main keyword from an existing article title and
 * generate a fresh title plus SEO metadata as a JSON object.
 *
 * @param string $current_title   Title of the source article.
 * @param string $supporting_data Description/context fed to the model.
 *
 * @return object|null Decoded reply with main_keyword, title, short_title,
 *                     article_type, description and photo_keywords, or null
 *                     when the model reply is empty or not valid JSON.
 */
public static function createNewArticleTitle($current_title, $supporting_data)
{
    $system_prompt = "Based on provided article title, identify the main keyword in 1-2 words. Once identified, use the main keyword only to generate an easy-to-read unique, helpful article title.\n\n
Requirements:\n
2 descriptive photos keywords to represent article title when put together side-by-side\n
No realtime information required\n
No guides and how tos\n
No punctuation in titles especially colons :\n
90-130 characters\n\n

return in following json format {\"main_keyword\":\"(Main Keyword)\",\"title\":\"(Title in 90-130 letters)\",\"short_title\":\"(Short Title in 30-40 letters)\",\"article_type\":\"(How-tos|Guides|Interview|Review|Commentary|Feature|News|Editorial|Report|Research|Case-study|Overview|Tutorial|Update|Spotlight|Insights)\",\"description\":\"(SEO description based on main keyword)\",\"photo_keywords\":[\"photo keyword 1\",\"photo keyword 2\"]}";

    $user_prompt = "Article Title: {$current_title}\n Article Description: {$supporting_data}\n";

    $reply = self::chatCompletion($system_prompt, $user_prompt, 'gpt-3.5-turbo');

    // BUG FIX: json_decode() never throws without JSON_THROW_ON_ERROR, so the
    // original try/catch was dead code and malformed replies became null
    // silently. The flag makes the failure explicit; we still return null so
    // the caller-facing contract is unchanged. The (string) cast also avoids
    // the PHP 8.1 deprecation when the model returns null.
    try {
        return json_decode((string) $reply, false, 512, JSON_THROW_ON_ERROR);
    } catch (\JsonException $e) {
        return null;
    }
}
|
||||
|
||||
public static function suggestArticleTitles($current_title, $supporting_data, $suggestion_counts)
|
||||
{
|
||||
$system_prompt = "Based on provided article title, identify the main keyword in 1-2 words. Once identified, use the main keyword only to generate {$suggestion_counts} easy-to-read unique, helpful title articles.\n\n
|
||||
|
||||
@@ -14,43 +14,52 @@ class GenerateArticleTask
|
||||
public static function handle(SerpUrl $serp_url)
|
||||
{
|
||||
|
||||
$ai_titles = OpenAI::suggestArticleTitles($serp_url->title, $serp_url->description, 1);
|
||||
// $ai_titles = OpenAI::suggestArticleTitles($serp_url->title, $serp_url->description, 1);
|
||||
|
||||
if (is_null($ai_titles)) {
|
||||
return self::saveAndReturnSerpProcessStatus($serp_url, -2);
|
||||
// if (is_null($ai_titles)) {
|
||||
// return self::saveAndReturnSerpProcessStatus($serp_url, -2);
|
||||
// }
|
||||
|
||||
// $suggestion = null;
|
||||
|
||||
// // dump($ai_titles);
|
||||
|
||||
// try {
|
||||
// $random_key = array_rand($ai_titles?->suggestions, 1);
|
||||
|
||||
// $suggestion = $ai_titles->suggestions[$random_key];
|
||||
|
||||
// } catch (Exception $e) {
|
||||
// return self::saveAndReturnSerpProcessStatus($serp_url, -1);
|
||||
// }
|
||||
|
||||
// if (is_null($suggestion)) {
|
||||
// return self::saveAndReturnSerpProcessStatus($serp_url, -3);
|
||||
// }
|
||||
|
||||
$ai_suggestion = OpenAI::createNewArticleTitle($serp_url->title, $serp_url->description);
|
||||
|
||||
$readability_content = ScrapeUrlBodyTask::handle($serp_url->url);
|
||||
|
||||
if (is_null($readability_content))
|
||||
{
|
||||
return self::saveAndReturnSerpProcessStatus($serp_url, -7);
|
||||
}
|
||||
|
||||
$suggestion = null;
|
||||
|
||||
// dump($ai_titles);
|
||||
|
||||
try {
|
||||
$random_key = array_rand($ai_titles?->suggestions, 1);
|
||||
|
||||
$suggestion = $ai_titles->suggestions[$random_key];
|
||||
|
||||
} catch (Exception $e) {
|
||||
return self::saveAndReturnSerpProcessStatus($serp_url, -1);
|
||||
}
|
||||
|
||||
if (is_null($suggestion)) {
|
||||
return self::saveAndReturnSerpProcessStatus($serp_url, -3);
|
||||
}
|
||||
|
||||
$markdown = OpenAI::writeArticle($suggestion->title, $suggestion->description, $suggestion->article_type, 500, 800);
|
||||
$markdown = OpenAI::writeArticle($ai_suggestion->title, $readability_content, $ai_suggestion->article_type ,500, 800);
|
||||
|
||||
if (is_empty($markdown)) {
|
||||
return self::saveAndReturnSerpProcessStatus($serp_url, -4);
|
||||
}
|
||||
|
||||
$post = new Post;
|
||||
$post->title = $suggestion->title;
|
||||
$post->type = $suggestion->article_type;
|
||||
$post->short_title = $suggestion->short_title;
|
||||
$post->main_keyword = $ai_titles->main_keyword;
|
||||
$post->keywords = $suggestion->photo_keywords;
|
||||
$post->slug = str_slug($suggestion->title);
|
||||
$post->excerpt = $suggestion->description;
|
||||
$post->title = $ai_suggestion->title;
|
||||
$post->type = $ai_suggestion->article_type;
|
||||
$post->short_title = $ai_suggestion->short_title;
|
||||
$post->main_keyword = $ai_suggestion->main_keyword;
|
||||
$post->keywords = $ai_suggestion->photo_keywords;
|
||||
$post->slug = str_slug($ai_suggestion->title);
|
||||
$post->excerpt = $ai_suggestion->description;
|
||||
$post->author_id = Author::find(1)->id;
|
||||
$post->featured = false;
|
||||
$post->featured_image = null;
|
||||
|
||||
73
app/Jobs/Tasks/ScrapeUrlBodyTask.php
Normal file
73
app/Jobs/Tasks/ScrapeUrlBodyTask.php
Normal file
@@ -0,0 +1,73 @@
|
||||
<?php
|
||||
|
||||
namespace App\Jobs\Tasks;
|
||||
|
||||
use App\Helpers\FirstParty\OSSUploader\OSSUploader;
|
||||
use \Illuminate\Support\Facades\Http;
|
||||
use Carbon\Carbon;
|
||||
use Storage;
|
||||
use Exception;
|
||||
|
||||
use andreskrey\Readability\Readability;
|
||||
use andreskrey\Readability\Configuration;
|
||||
use andreskrey\Readability\ParseException;
|
||||
|
||||
class ScrapeUrlBodyTask
{
    /**
     * Fetch the readable body text of a URL, caching the raw HTML on the
     * 'r2' storage disk so repeat scrapes of the same URL skip the network.
     *
     * @param string $url Absolute URL to scrape.
     *
     * @return string|null Plain-text article body (tags stripped); the raw
     *                     HTML when Readability cannot extract an article
     *                     (original fallback behaviour); or null when the
     *                     page could not be fetched and was not cached.
     */
    public static function handle(string $url)
    {
        $slug = str_slug($url);

        $html_content = null;

        // First try the cache: a previous run may have stored the raw HTML.
        try {
            $html_content = OSSUploader::readFile('r2', '/scraped/', $slug.'.html');

            if (is_null($html_content)) {
                // BUG FIX: the original tested $disk_url (always a non-null
                // string, so the check could never fire) and wrote
                // `throw Exception(...)` without `new`, which is a fatal
                // Error, not a throwable exception.
                throw new Exception('Not stored.');
            }
        } catch (Exception $e) {
            $html_content = null;
        }

        // Cache miss: fetch through the proxy and store for next time.
        if (is_null($html_content)) {
            // SECURITY: the proxy credentials were hard-coded in source.
            // They now come from the environment, falling back to the legacy
            // values for backward compatibility — rotate these secrets and
            // remove the fallbacks.
            $proxy = env('SMARTPROXY_GATEWAY', 'gate.smartproxy.com:10000');
            $user = env('SMARTPROXY_USER', 'sp5bbkzj7e');
            $psw = env('SMARTPROXY_PASS', 'yTtk2cc5kg23kIkSSr');

            $response = Http::withOptions([
                'proxy' => "http://$user:$psw@$proxy",
            ])->get($url);

            if ($response->successful()) {
                $html_content = $response->body();

                OSSUploader::uploadFile('r2', '/scraped/', $slug.'.html', $html_content);
            }
        }

        // BUG FIX: if the fetch failed and nothing was cached, the original
        // passed null into Readability::parse(). Bail out early instead.
        if (is_null($html_content)) {
            return null;
        }

        $readability = new Readability(new Configuration());

        try {
            $readability->parse($html_content);

            $html_content = strip_tags($readability->getContent());
        } catch (ParseException $e) {
            // Extraction failed: fall through and return the raw HTML so the
            // caller still has something to work with (original behaviour).
        }

        return $html_content;
    }
}
|
||||
Reference in New Issue
Block a user