Add (scraper)
Update (ai): integrate scraper
This commit is contained in:
@@ -25,6 +25,21 @@ public static function readJson($storage_driver, $relative_directory, $filename)
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
 * Read a file from the given storage disk.
 *
 * The path is built from the directory (trailing slashes removed) and the
 * file name, joined by a single "/".
 *
 * @param  mixed  $storage_driver      Disk name handed to Storage::disk().
 * @param  mixed  $relative_directory  Directory of the file on that disk.
 * @param  mixed  $filename            Name of the file inside the directory.
 * @return mixed  Whatever the disk's get() returns for the path, or null when
 *                reading throws an exception.
 */
public static function readFile($storage_driver, $relative_directory, $filename)
{
    $filepath = rtrim($relative_directory, '/').'/'.$filename;

    try {
        return Storage::disk($storage_driver)->get($filepath);
    } catch (\Exception $e) {
        // Best effort: a failed read is reported to the caller as "no content".
        return null;
    }
}
|
||||
|
||||
|
||||
public static function uploadJson($storage_driver, $relative_directory, $filename, $jsonData)
|
||||
{
|
||||
$jsonString = json_encode($jsonData, JSON_PRETTY_PRINT);
|
||||
|
||||
@@ -45,6 +45,30 @@ public static function writeArticle($title, $description, $article_type, $min, $
|
||||
|
||||
}
|
||||
|
||||
/**
 * Ask the chat model to derive a main keyword from an existing article title
 * and produce a new title plus related metadata.
 *
 * The model is instructed to answer with a JSON object holding main_keyword,
 * title, short_title, article_type, description and photo_keywords.
 *
 * @param  mixed  $current_title    Existing title, interpolated into the user prompt.
 * @param  mixed  $supporting_data  Existing description, interpolated into the user prompt.
 * @return object|null  The decoded JSON object, or null when the reply is missing,
 *                      is not valid JSON, or does not decode to an object.
 */
public static function createNewArticleTitle($current_title, $supporting_data)
{
    // The continuation lines of this literal are deliberately not indented:
    // any leading whitespace would become part of the prompt sent to the model.
    $system_prompt = "Based on provided article title, identify the main keyword in 1-2 words. Once identified, use the main keyword only to generate an easy-to-read unique, helpful article title.\n\n
Requirements:\n
2 descriptive photos keywords to represent article title when put together side-by-side\n
No realtime information required\n
No guides and how tos\n
No punctuation in titles especially colons :\n
90-130 characters\n\n

return in following json format {\"main_keyword\":\"(Main Keyword)\",\"title\":\"(Title in 90-130 letters)\",\"short_title\":\"(Short Title in 30-40 letters)\",\"article_type\":\"(How-tos|Guides|Interview|Review|Commentary|Feature|News|Editorial|Report|Research|Case-study|Overview|Tutorial|Update|Spotlight|Insights)\",\"description\":\"(SEO description based on main keyword)\",\"photo_keywords\":[\"photo keyword 1\",\"photo keyword 2\"]}";

    $user_prompt = "Article Title: {$current_title}\n Article Description: {$supporting_data}\n";

    $reply = self::chatCompletion($system_prompt, $user_prompt, 'gpt-3.5-turbo');

    // Nothing to decode when the completion produced no text.
    if (! is_string($reply) || $reply === '') {
        return null;
    }

    try {
        // json_decode() only throws when JSON_THROW_ON_ERROR is set; without the
        // flag the surrounding try/catch can never be triggered.
        $decoded = json_decode($reply, false, 512, JSON_THROW_ON_ERROR);
    } catch (\JsonException $e) {
        return null;
    }

    // Callers read properties such as ->title, so only an object is a usable result.
    return is_object($decoded) ? $decoded : null;
}
|
||||
|
||||
public static function suggestArticleTitles($current_title, $supporting_data, $suggestion_counts)
|
||||
{
|
||||
$system_prompt = "Based on provided article title, identify the main keyword in 1-2 words. Once identified, use the main keyword only to generate {$suggestion_counts} easy-to-read unique, helpful title articles.\n\n
|
||||
|
||||
@@ -14,43 +14,52 @@ class GenerateArticleTask
|
||||
public static function handle(SerpUrl $serp_url)
|
||||
{
|
||||
|
||||
$ai_titles = OpenAI::suggestArticleTitles($serp_url->title, $serp_url->description, 1);
|
||||
// $ai_titles = OpenAI::suggestArticleTitles($serp_url->title, $serp_url->description, 1);
|
||||
|
||||
if (is_null($ai_titles)) {
|
||||
return self::saveAndReturnSerpProcessStatus($serp_url, -2);
|
||||
// if (is_null($ai_titles)) {
|
||||
// return self::saveAndReturnSerpProcessStatus($serp_url, -2);
|
||||
// }
|
||||
|
||||
// $suggestion = null;
|
||||
|
||||
// // dump($ai_titles);
|
||||
|
||||
// try {
|
||||
// $random_key = array_rand($ai_titles?->suggestions, 1);
|
||||
|
||||
// $suggestion = $ai_titles->suggestions[$random_key];
|
||||
|
||||
// } catch (Exception $e) {
|
||||
// return self::saveAndReturnSerpProcessStatus($serp_url, -1);
|
||||
// }
|
||||
|
||||
// if (is_null($suggestion)) {
|
||||
// return self::saveAndReturnSerpProcessStatus($serp_url, -3);
|
||||
// }
|
||||
|
||||
$ai_suggestion = OpenAI::createNewArticleTitle($serp_url->title, $serp_url->description);
|
||||
|
||||
$readability_content = ScrapeUrlBodyTask::handle($serp_url->url);
|
||||
|
||||
if (is_null($readability_content))
|
||||
{
|
||||
return self::saveAndReturnSerpProcessStatus($serp_url, -7);
|
||||
}
|
||||
|
||||
$suggestion = null;
|
||||
|
||||
// dump($ai_titles);
|
||||
|
||||
try {
|
||||
$random_key = array_rand($ai_titles?->suggestions, 1);
|
||||
|
||||
$suggestion = $ai_titles->suggestions[$random_key];
|
||||
|
||||
} catch (Exception $e) {
|
||||
return self::saveAndReturnSerpProcessStatus($serp_url, -1);
|
||||
}
|
||||
|
||||
if (is_null($suggestion)) {
|
||||
return self::saveAndReturnSerpProcessStatus($serp_url, -3);
|
||||
}
|
||||
|
||||
$markdown = OpenAI::writeArticle($suggestion->title, $suggestion->description, $suggestion->article_type, 500, 800);
|
||||
$markdown = OpenAI::writeArticle($ai_suggestion->title, $readability_content, $ai_suggestion->article_type ,500, 800);
|
||||
|
||||
if (is_empty($markdown)) {
|
||||
return self::saveAndReturnSerpProcessStatus($serp_url, -4);
|
||||
}
|
||||
|
||||
$post = new Post;
|
||||
$post->title = $suggestion->title;
|
||||
$post->type = $suggestion->article_type;
|
||||
$post->short_title = $suggestion->short_title;
|
||||
$post->main_keyword = $ai_titles->main_keyword;
|
||||
$post->keywords = $suggestion->photo_keywords;
|
||||
$post->slug = str_slug($suggestion->title);
|
||||
$post->excerpt = $suggestion->description;
|
||||
$post->title = $ai_suggestion->title;
|
||||
$post->type = $ai_suggestion->article_type;
|
||||
$post->short_title = $ai_suggestion->short_title;
|
||||
$post->main_keyword = $ai_suggestion->main_keyword;
|
||||
$post->keywords = $ai_suggestion->photo_keywords;
|
||||
$post->slug = str_slug($ai_suggestion->title);
|
||||
$post->excerpt = $ai_suggestion->description;
|
||||
$post->author_id = Author::find(1)->id;
|
||||
$post->featured = false;
|
||||
$post->featured_image = null;
|
||||
|
||||
73
app/Jobs/Tasks/ScrapeUrlBodyTask.php
Normal file
73
app/Jobs/Tasks/ScrapeUrlBodyTask.php
Normal file
@@ -0,0 +1,73 @@
|
||||
<?php
|
||||
|
||||
namespace App\Jobs\Tasks;
|
||||
|
||||
use App\Helpers\FirstParty\OSSUploader\OSSUploader;
|
||||
use \Illuminate\Support\Facades\Http;
|
||||
use Carbon\Carbon;
|
||||
use Storage;
|
||||
use Exception;
|
||||
|
||||
use andreskrey\Readability\Readability;
|
||||
use andreskrey\Readability\Configuration;
|
||||
use andreskrey\Readability\ParseException;
|
||||
|
||||
/**
 * Downloads a web page (through a proxy), caches the raw HTML on the "r2" disk
 * and extracts its readable text with Readability.
 *
 * SECURITY: the proxy host and credentials are read from configuration
 * (services.smartproxy.host / .username / .password) rather than being embedded
 * in source code. Those entries must exist in the application's configuration.
 */
class ScrapeUrlBodyTask
{
    /**
     * Return the readable text of the page at $url.
     *
     * The raw HTML is first looked up on the "r2" disk under /scraped/, keyed by
     * the slugified URL. On a miss the page is downloaded through the proxy and,
     * if the response is successful, stored there. The HTML is then parsed by
     * Readability and stripped of tags.
     *
     * @param  string  $url  Page to scrape.
     * @return string|null  Tag-free article text; the unparsed HTML when Readability
     *                      raises a ParseException; null when nothing is cached and
     *                      the download did not succeed.
     *
     * @throws Exception  When the proxy settings are not configured.
     */
    public static function handle(string $url): ?string
    {
        $filename = str_slug($url).'.html';

        $html_content = self::readCachedHtml($filename);

        if (is_null($html_content)) {
            $html_content = self::downloadHtml($url);

            if (is_null($html_content)) {
                // Nothing cached and nothing downloaded: there is no markup to parse.
                return null;
            }

            OSSUploader::uploadFile('r2', '/scraped/', $filename, $html_content);
        }

        $readability = new Readability(new Configuration());

        try {
            $readability->parse($html_content);

            return strip_tags($readability->getContent());
        } catch (ParseException $e) {
            // Readability could not extract an article; fall back to the unparsed markup.
            return $html_content;
        }
    }

    /**
     * Fetch previously stored HTML for a file name, or null when it cannot be read.
     */
    private static function readCachedHtml(string $filename): ?string
    {
        try {
            return OSSUploader::readFile('r2', '/scraped/', $filename);
        } catch (Exception $e) {
            return null;
        }
    }

    /**
     * Download $url through the configured proxy.
     *
     * @return string|null  Response body, or null when the response is not successful.
     *
     * @throws Exception  When any of the proxy settings is missing.
     */
    private static function downloadHtml(string $url): ?string
    {
        $host = config('services.smartproxy.host');
        $user = config('services.smartproxy.username');
        $password = config('services.smartproxy.password');

        if (empty($host) || empty($user) || empty($password)) {
            throw new Exception('Proxy settings services.smartproxy.{host,username,password} are not configured.');
        }

        // Encode the credentials so reserved characters cannot break the proxy URL.
        $response = Http::withOptions([
            'proxy' => 'http://'.rawurlencode($user).':'.rawurlencode($password).'@'.$host,
        ])->get($url);

        return $response->successful() ? $response->body() : null;
    }
}
|
||||
@@ -11,6 +11,7 @@
|
||||
"php": "^8.1",
|
||||
"artesaos/seotools": "^1.2",
|
||||
"dipeshsukhia/laravel-html-minify": "^3.3",
|
||||
"fivefilters/readability.php": "^1.0",
|
||||
"graham-campbell/markdown": "^15.0",
|
||||
"guzzlehttp/guzzle": "^7.2",
|
||||
"intervention/image": "^2.7",
|
||||
|
||||
53
composer.lock
generated
53
composer.lock
generated
@@ -4,7 +4,7 @@
|
||||
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
|
||||
"This file is @generated automatically"
|
||||
],
|
||||
"content-hash": "6befdc23980f16c166f9c783aa9cabf5",
|
||||
"content-hash": "b6f2e806cb687a1b5a4a3872968a2157",
|
||||
"packages": [
|
||||
{
|
||||
"name": "artesaos/seotools",
|
||||
@@ -720,6 +720,57 @@
|
||||
],
|
||||
"time": "2023-01-14T14:17:03+00:00"
|
||||
},
|
||||
{
|
||||
"name": "fivefilters/readability.php",
|
||||
"version": "v1.0.0",
|
||||
"source": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/fivefilters/readability.php.git",
|
||||
"reference": "0d02e2916d1659bb79426969c5d48848ff402598"
|
||||
},
|
||||
"dist": {
|
||||
"type": "zip",
|
||||
"url": "https://api.github.com/repos/fivefilters/readability.php/zipball/0d02e2916d1659bb79426969c5d48848ff402598",
|
||||
"reference": "0d02e2916d1659bb79426969c5d48848ff402598",
|
||||
"shasum": ""
|
||||
},
|
||||
"require": {
|
||||
"ext-dom": "*",
|
||||
"ext-mbstring": "*",
|
||||
"ext-xml": "*",
|
||||
"php": ">=5.6.0"
|
||||
},
|
||||
"require-dev": {
|
||||
"phpunit/phpunit": "^5.7"
|
||||
},
|
||||
"type": "library",
|
||||
"autoload": {
|
||||
"psr-4": {
|
||||
"andreskrey\\Readability\\": "src/"
|
||||
}
|
||||
},
|
||||
"notification-url": "https://packagist.org/downloads/",
|
||||
"license": [
|
||||
"Apache-2.0"
|
||||
],
|
||||
"authors": [
|
||||
{
|
||||
"name": "Andres Rey",
|
||||
"email": "andreskrey@gmail.com",
|
||||
"role": "Lead Developer"
|
||||
}
|
||||
],
|
||||
"description": "A PHP port of Readability.js",
|
||||
"homepage": "https://github.com/andreskrey/readability",
|
||||
"keywords": [
|
||||
"html",
|
||||
"readability"
|
||||
],
|
||||
"support": {
|
||||
"source": "https://github.com/fivefilters/readability.php/tree/v1.0.0"
|
||||
},
|
||||
"time": "2017-12-03T12:24:28+00:00"
|
||||
},
|
||||
{
|
||||
"name": "fruitcake/php-cors",
|
||||
"version": "v1.2.0",
|
||||
|
||||
@@ -5,10 +5,12 @@
|
||||
use App\Jobs\GenerateArticleJob;
|
||||
use App\Jobs\Tasks\GetNewsSerpTask;
|
||||
use App\Jobs\Tasks\ParseNewsSerpDomainsTask;
|
||||
use App\Jobs\Tasks\ScrapeUrlBodyTask;
|
||||
use App\Models\Category;
|
||||
use App\Models\NewsSerpResult;
|
||||
use App\Models\Post;
|
||||
use App\Models\SerpUrl;
|
||||
use Illuminate\Support\Facades\Request;
|
||||
use Illuminate\Support\Facades\Route;
|
||||
|
||||
/*
|
||||
@@ -22,40 +24,82 @@
|
||||
|
|
||||
*/
|
||||
|
||||
Route::get('/news_serp', function () {
|
||||
$category = Category::find(2);
|
||||
Route::get('/step-1', function (Request $request) {
|
||||
$category = Category::find($request->input('id'));
|
||||
$news_serp_result = GetNewsSerpTask::handle($category, 'US');
|
||||
dd($news_serp_result);
|
||||
dd($news_serp_result->id);
|
||||
});
|
||||
|
||||
Route::get('/news_serp_parse', function () {
|
||||
$news_serp_result = NewsSerpResult::find(3);
|
||||
$serp_urls = ParseNewsSerpDomainsTask::handle($news_serp_result);
|
||||
dd($serp_urls);
|
||||
// Step 2: parse the stored news SERP result given by ?id= and dump the id of the
// most recently created SerpUrl.
// The closure type-hints the HTTP request class explicitly: the file-level
// `Request` import is the facade, whose instances do not expose input().
Route::get('/step-2', function (\Illuminate\Http\Request $request) {
    $news_serp_result = NewsSerpResult::find($request->input('id'));

    if (is_null($news_serp_result)) {
        abort(404);
    }

    if (ParseNewsSerpDomainsTask::handle($news_serp_result)) {
        // Null-safe: there may be no SerpUrl rows even after a truthy task result.
        dd(SerpUrl::latest()->first()?->id);
    }
});
|
||||
|
||||
Route::get('/write_article', function () {
|
||||
$serp_url = SerpUrl::find(2);
|
||||
// Step 3: queue article generation for the SerpUrl given by ?id= and dump the
// pending dispatch.
// The closure type-hints the HTTP request class explicitly: the file-level
// `Request` import is the facade, whose instances do not expose input().
Route::get('/step-3', function (\Illuminate\Http\Request $request) {
    $serp_url = SerpUrl::find($request->input('id'));

    if (is_null($serp_url)) {
        abort(404);
    }

    $task = GenerateArticleJob::dispatch($serp_url)->onQueue('default')->onConnection('default');

    dd($task);
});
|
||||
|
||||
Route::get('/gen_article_image', function () {
|
||||
// Step 4: queue featured-image generation for the first draft post that has no
// featured image yet, and dump the pending dispatch.
Route::get('/step-4', function () {
    $post = Post::whereNull('featured_image')->where('status', 'draft')->first();

    // first() yields null when no such draft exists; respond 404 instead of
    // dispatching a job without a post.
    if (is_null($post)) {
        abort(404);
    }

    $task = GenerateArticleFeaturedImageJob::dispatch($post)->onQueue('default')->onConnection('default');

    dd($task);
});
|
||||
|
||||
Route::get('/suggest_titles', function () {
|
||||
$results = OpenAI::suggestArticleTitles("It's 2019s Electric: How Fisker Is Reinventing The Automotive Industry And \nExpanding Its Business", "Fisker's approach to building electric vehicles is deeply intertwined with \nits overall business philosophy: use less, use better,...s", 1);
|
||||
dd($results);
|
||||
// Step 5: stamp the post given by ?id= with the current time as published_at
// and dump the result of saving it.
// The closure type-hints the HTTP request class explicitly: the file-level
// `Request` import is the facade, whose instances do not expose input().
Route::get('/step-5', function (\Illuminate\Http\Request $request) {
    $post = Post::find($request->input('id'));

    if (is_null($post)) {
        return abort(404);
    }

    $post->published_at = now();

    dd($post->save());
});
|
||||
|
||||
Route::get('/write_article_raw', function () {
|
||||
$results = OpenAI::writeArticle("Fisker's Vision for the Future of Electric Cars", "Explore Fisker's innovative vision for the future of electric cars and its impact on the automotive industry.", 'Article', 500, 800);
|
||||
dd($results);
|
||||
|
||||
// Route::get('/suggest_titles', function () {
|
||||
// $results = OpenAI::suggestArticleTitles("It's 2019s Electric: How Fisker Is Reinventing The Automotive Industry And \nExpanding Its Business", "Fisker's approach to building electric vehicles is deeply intertwined with \nits overall business philosophy: use less, use better,...s", 1);
|
||||
// dd($results);
|
||||
// });
|
||||
|
||||
// Route::get('/write_article_raw', function () {
|
||||
// $results = OpenAI::writeArticle("Fisker's Vision for the Future of Electric Cars", "Explore Fisker's innovative vision for the future of electric cars and its impact on the automotive industry.", 'Article', 500, 800);
|
||||
// dd($results);
|
||||
// });
|
||||
|
||||
|
||||
|
||||
// Manual check of the scraper: run ScrapeUrlBodyTask against a fixed article URL
// and dump whatever it returns.
Route::get('proxy_test', function () {
    $scraped = ScrapeUrlBodyTask::handle(
        'https://www.cnbc.com/2023/09/24/this-southern-city-is-the-no-1-place-to-start-your-own-business.html'
    );

    dd($scraped);
});
|
||||
|
||||
// Route::get('/image_gen', function() {
|
||||
|
||||
Reference in New Issue
Block a user