From c53918d03bee3a1556d6dceb8c33a7d5dd4e0cfe Mon Sep 17 00:00:00 2001 From: Charles Teh Date: Mon, 25 Sep 2023 19:39:13 +0800 Subject: [PATCH] Add (scraper) Update (ai): integrate scraper --- .../FirstParty/OSSUploader/OSSUploader.php | 15 ++++ app/Helpers/FirstParty/OpenAI/OpenAI.php | 24 ++++++ app/Jobs/Tasks/GenerateArticleTask.php | 65 +++++++++------- app/Jobs/Tasks/ScrapeUrlBodyTask.php | 73 ++++++++++++++++++ composer.json | 1 + composer.lock | 53 ++++++++++++- routes/tests.php | 76 +++++++++++++++---- 7 files changed, 262 insertions(+), 45 deletions(-) create mode 100644 app/Jobs/Tasks/ScrapeUrlBodyTask.php diff --git a/app/Helpers/FirstParty/OSSUploader/OSSUploader.php b/app/Helpers/FirstParty/OSSUploader/OSSUploader.php index 1e9effc..e13b576 100644 --- a/app/Helpers/FirstParty/OSSUploader/OSSUploader.php +++ b/app/Helpers/FirstParty/OSSUploader/OSSUploader.php @@ -25,6 +25,21 @@ public static function readJson($storage_driver, $relative_directory, $filename) return null; } + public static function readFile($storage_driver, $relative_directory, $filename) + { + $filepath = rtrim($relative_directory, '/').'/'.$filename; + + try { + return Storage::disk($storage_driver)->get($filepath); + + } catch (\Exception $e) { + return null; + } + + return null; + } + + public static function uploadJson($storage_driver, $relative_directory, $filename, $jsonData) { $jsonString = json_encode($jsonData, JSON_PRETTY_PRINT); diff --git a/app/Helpers/FirstParty/OpenAI/OpenAI.php b/app/Helpers/FirstParty/OpenAI/OpenAI.php index 920470b..77a2bd4 100644 --- a/app/Helpers/FirstParty/OpenAI/OpenAI.php +++ b/app/Helpers/FirstParty/OpenAI/OpenAI.php @@ -45,6 +45,30 @@ public static function writeArticle($title, $description, $article_type, $min, $ } + public static function createNewArticleTitle($current_title, $supporting_data) + { + $system_prompt = "Based on provided article title, identify the main keyword in 1-2 words. Once identified, use the main keyword only to generate an easy-to-read unique, helpful article title.\n\n + Requirements:\n + 2 descriptive photos keywords to represent article title when put together side-by-side\n + No realtime information required\n + No guides and how tos\n + No punctuation in titles especially colons :\n + 90-130 characters\n\n + + return in following json format {\"main_keyword\":\"(Main Keyword)\",\"title\":\"(Title in 90-130 letters)\",\"short_title\":\"(Short Title in 30-40 letters)\",\"article_type\":\"(How-tos|Guides|Interview|Review|Commentary|Feature|News|Editorial|Report|Research|Case-study|Overview|Tutorial|Update|Spotlight|Insights)\",\"description\":\"(SEO description based on main keyword)\",\"photo_keywords\":[\"photo keyword 1\",\"photo keyword 2\"]}"; + + + $user_prompt = "Article Title: {$current_title}\n Article Description: {$supporting_data}\n"; + + $reply = self::chatCompletion($system_prompt, $user_prompt, 'gpt-3.5-turbo'); + + try { + return json_decode($reply, false); + } catch (Exception $e) { + return null; + } + } + public static function suggestArticleTitles($current_title, $supporting_data, $suggestion_counts) { $system_prompt = "Based on provided article title, identify the main keyword in 1-2 words. Once identified, use the main keyword only to generate {$suggestion_counts} easy-to-read unique, helpful title articles.\n\n diff --git a/app/Jobs/Tasks/GenerateArticleTask.php b/app/Jobs/Tasks/GenerateArticleTask.php index f4b7489..a360cd9 100644 --- a/app/Jobs/Tasks/GenerateArticleTask.php +++ b/app/Jobs/Tasks/GenerateArticleTask.php @@ -14,43 +14,52 @@ class GenerateArticleTask public static function handle(SerpUrl $serp_url) { - $ai_titles = OpenAI::suggestArticleTitles($serp_url->title, $serp_url->description, 1); + // $ai_titles = OpenAI::suggestArticleTitles($serp_url->title, $serp_url->description, 1); - if (is_null($ai_titles)) { - return self::saveAndReturnSerpProcessStatus($serp_url, -2); + // if (is_null($ai_titles)) { + // return self::saveAndReturnSerpProcessStatus($serp_url, -2); + // } + + // $suggestion = null; + + // // dump($ai_titles); + + // try { + // $random_key = array_rand($ai_titles?->suggestions, 1); + + // $suggestion = $ai_titles->suggestions[$random_key]; + + // } catch (Exception $e) { + // return self::saveAndReturnSerpProcessStatus($serp_url, -1); + // } + + // if (is_null($suggestion)) { + // return self::saveAndReturnSerpProcessStatus($serp_url, -3); + // } + + $ai_suggestion = OpenAI::createNewArticleTitle($serp_url->title, $serp_url->description); + + $readability_content = ScrapeUrlBodyTask::handle($serp_url->url); + + if (is_null($readability_content)) + { + return self::saveAndReturnSerpProcessStatus($serp_url, -7); } - $suggestion = null; - - // dump($ai_titles); - - try { - $random_key = array_rand($ai_titles?->suggestions, 1); - - $suggestion = $ai_titles->suggestions[$random_key]; - - } catch (Exception $e) { - return self::saveAndReturnSerpProcessStatus($serp_url, -1); - } - - if (is_null($suggestion)) { - return self::saveAndReturnSerpProcessStatus($serp_url, -3); - } - - $markdown = OpenAI::writeArticle($suggestion->title, $suggestion->description, $suggestion->article_type, 500, 800); + $markdown = OpenAI::writeArticle($ai_suggestion->title, $readability_content, $ai_suggestion->article_type ,500, 800); if (is_empty($markdown)) { return self::saveAndReturnSerpProcessStatus($serp_url, -4); } $post = new Post; - $post->title = $suggestion->title; - $post->type = $suggestion->article_type; - $post->short_title = $suggestion->short_title; - $post->main_keyword = $ai_titles->main_keyword; - $post->keywords = $suggestion->photo_keywords; - $post->slug = str_slug($suggestion->title); - $post->excerpt = $suggestion->description; + $post->title = $ai_suggestion->title; + $post->type = $ai_suggestion->article_type; + $post->short_title = $ai_suggestion->short_title; + $post->main_keyword = $ai_suggestion->main_keyword; + $post->keywords = $ai_suggestion->photo_keywords; + $post->slug = str_slug($ai_suggestion->title); + $post->excerpt = $ai_suggestion->description; $post->author_id = Author::find(1)->id; $post->featured = false; $post->featured_image = null; diff --git a/app/Jobs/Tasks/ScrapeUrlBodyTask.php b/app/Jobs/Tasks/ScrapeUrlBodyTask.php new file mode 100644 index 0000000..3842ce3 --- /dev/null +++ b/app/Jobs/Tasks/ScrapeUrlBodyTask.php @@ -0,0 +1,73 @@ + "http://$user:$psw@$proxy", + ])->get($url); + + if ($response->successful()) { + $html_content = $response->body(); + + OSSUploader::uploadFile('r2','/scraped/',$slug.'.html', $html_content); + } + } + + //dump("Initial: " . strlen($html_content)); + + $readability = new Readability(new Configuration()); + + + try { + $readability->parse($html_content); + + $html_content = strip_tags($readability->getContent()); + //dd($readability); + } catch (ParseException $e) { + + } + + //dump("After: " . strlen($html_content)); + + return $html_content; + + } +} \ No newline at end of file diff --git a/composer.json b/composer.json index f15a12d..568f94d 100644 --- a/composer.json +++ b/composer.json @@ -11,6 +11,7 @@ "php": "^8.1", "artesaos/seotools": "^1.2", "dipeshsukhia/laravel-html-minify": "^3.3", + "fivefilters/readability.php": "^1.0", "graham-campbell/markdown": "^15.0", "guzzlehttp/guzzle": "^7.2", "intervention/image": "^2.7", diff --git a/composer.lock b/composer.lock index 0abdb74..a730e69 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "6befdc23980f16c166f9c783aa9cabf5", + "content-hash": "b6f2e806cb687a1b5a4a3872968a2157", "packages": [ { "name": "artesaos/seotools", @@ -720,6 +720,57 @@ ], "time": "2023-01-14T14:17:03+00:00" }, + { + "name": "fivefilters/readability.php", + "version": "v1.0.0", + "source": { + "type": "git", + "url": "https://github.com/fivefilters/readability.php.git", + "reference": "0d02e2916d1659bb79426969c5d48848ff402598" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/fivefilters/readability.php/zipball/0d02e2916d1659bb79426969c5d48848ff402598", + "reference": "0d02e2916d1659bb79426969c5d48848ff402598", + "shasum": "" + }, + "require": { + "ext-dom": "*", + "ext-mbstring": "*", + "ext-xml": "*", + "php": ">=5.6.0" + }, + "require-dev": { + "phpunit/phpunit": "^5.7" + }, + "type": "library", + "autoload": { + "psr-4": { + "andreskrey\\Readability\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "Apache-2.0" + ], + "authors": [ + { + "name": "Andres Rey", + "email": "andreskrey@gmail.com", + "role": "Lead Developer" + } + ], + "description": "A PHP port of Readability.js", + "homepage": "https://github.com/andreskrey/readability", + "keywords": [ + "html", + "readability" + ], + "support": { + "source": "https://github.com/fivefilters/readability.php/tree/v1.0.0" + }, + "time": "2017-12-03T12:24:28+00:00" + }, { "name": "fruitcake/php-cors", "version": "v1.2.0", diff --git a/routes/tests.php b/routes/tests.php index f18ec1d..b6e5428 100644 --- a/routes/tests.php +++ b/routes/tests.php @@ -5,10 +5,12 @@ use App\Jobs\GenerateArticleJob; use App\Jobs\Tasks\GetNewsSerpTask; use App\Jobs\Tasks\ParseNewsSerpDomainsTask; +use App\Jobs\Tasks\ScrapeUrlBodyTask; use App\Models\Category; use App\Models\NewsSerpResult; use App\Models\Post; use App\Models\SerpUrl; +use Illuminate\Support\Facades\Request; use Illuminate\Support\Facades\Route; /* @@ -22,40 +24,82 @@ | */ -Route::get('/news_serp', function () { - $category = Category::find(2); +Route::get('/step-1', function (Request $request) { + $category = Category::find($request->input('id')); $news_serp_result = GetNewsSerpTask::handle($category, 'US'); - dd($news_serp_result); + dd($news_serp_result->id); }); -Route::get('/news_serp_parse', function () { - $news_serp_result = NewsSerpResult::find(3); - $serp_urls = ParseNewsSerpDomainsTask::handle($news_serp_result); - dd($serp_urls); +Route::get('/step-2', function (Request $request) { + $news_serp_result = NewsSerpResult::find($request->input('id', null)); + + if (is_null($news_serp_result)) + { + abort(404); + + } + $task = ParseNewsSerpDomainsTask::handle($news_serp_result); + + if ($task) + { + $serp_url = SerpUrl::latest()->first(); + + dd($serp_url->id); + } }); -Route::get('/write_article', function () { - $serp_url = SerpUrl::find(2); +Route::get('/step-3', function (Request $request) { + $serp_url = SerpUrl::find($request->input('id', null)); + + if (is_null($serp_url)) + { + abort(404); + } + $task = GenerateArticleJob::dispatch($serp_url)->onQueue('default')->onConnection('default'); dd($task); }); -Route::get('/gen_article_image', function () { +Route::get('/step-4', function () { $post = Post::whereNull('featured_image')->where('status', 'draft')->first(); + $task = GenerateArticleFeaturedImageJob::dispatch($post)->onQueue('default')->onConnection('default'); dd($task); }); -Route::get('/suggest_titles', function () { - $results = OpenAI::suggestArticleTitles("It's 2019s Electric: How Fisker Is Reinventing The Automotive Industry And \nExpanding Its Business", "Fisker's approach to building electric vehicles is deeply intertwined with \nits overall business philosophy: use less, use better,...s", 1); - dd($results); +Route::get('/step-5', function (Request $request) { + $post = Post::find($request->input('id')); + + if (is_null($post)) + { + return abort(404); + } + + $post->published_at = now(); + dd($post->save()); }); -Route::get('/write_article_raw', function () { - $results = OpenAI::writeArticle("Fisker's Vision for the Future of Electric Cars", "Explore Fisker's innovative vision for the future of electric cars and its impact on the automotive industry.", 'Article', 500, 800); - dd($results); + +// Route::get('/suggest_titles', function () { +// $results = OpenAI::suggestArticleTitles("It's 2019s Electric: How Fisker Is Reinventing The Automotive Industry And \nExpanding Its Business", "Fisker's approach to building electric vehicles is deeply intertwined with \nits overall business philosophy: use less, use better,...s", 1); +// dd($results); +// }); + +// Route::get('/write_article_raw', function () { +// $results = OpenAI::writeArticle("Fisker's Vision for the Future of Electric Cars", "Explore Fisker's innovative vision for the future of electric cars and its impact on the automotive industry.", 'Article', 500, 800); +// dd($results); +// }); + + + +Route::get('proxy_test', function() { + $url = 'https://www.cnbc.com/2023/09/24/this-southern-city-is-the-no-1-place-to-start-your-own-business.html'; + + $task = ScrapeUrlBodyTask::handle($url); + + dd($task); }); // Route::get('/image_gen', function() {