Add (scraper)

Update (ai): integrate scraper
This commit is contained in:
2023-09-25 19:39:13 +08:00
parent b6f36e3ccd
commit c53918d03b
7 changed files with 262 additions and 45 deletions

View File

@@ -25,6 +25,21 @@ public static function readJson($storage_driver, $relative_directory, $filename)
return null;
}
/**
 * Read a file from the given storage disk.
 *
 * The path is built as "<relative_directory>/<filename>", with any trailing
 * slashes trimmed from the directory first.
 *
 * @param  mixed  $storage_driver      Disk name handed to Storage::disk().
 * @param  mixed  $relative_directory  Directory part of the path on that disk.
 * @param  mixed  $filename            File name appended to the directory.
 * @return mixed  Whatever the disk's get() call returns (presumably the raw
 *                file contents — confirm against the Storage driver), or null
 *                when the read throws an exception.
 */
public static function readFile($storage_driver, $relative_directory, $filename)
{
    $filepath = rtrim($relative_directory, '/').'/'.$filename;

    try {
        return Storage::disk($storage_driver)->get($filepath);
    } catch (\Exception $e) {
        // Best-effort read: a storage failure is reported to the caller as null.
        return null;
    }
}
public static function uploadJson($storage_driver, $relative_directory, $filename, $jsonData)
{
$jsonString = json_encode($jsonData, JSON_PRETTY_PRINT);

View File

@@ -45,6 +45,30 @@ public static function writeArticle($title, $description, $article_type, $min, $
}
/**
 * Ask the chat model for a new article title (plus related metadata) derived
 * from an existing title and description.
 *
 * The system prompt instructs the model to answer with a JSON object holding
 * main_keyword, title, short_title, article_type, description and
 * photo_keywords; the reply is decoded into an object.
 *
 * @param  mixed  $current_title    Existing article title, interpolated into the user prompt.
 * @param  mixed  $supporting_data  Existing article description, interpolated into the user prompt.
 * @return object|null  Decoded reply object, or null when the reply is missing,
 *                      is not valid JSON, or does not decode to an object.
 */
public static function createNewArticleTitle($current_title, $supporting_data)
{
    $system_prompt = "Based on provided article title, identify the main keyword in 1-2 words. Once identified, use the main keyword only to generate an easy-to-read unique, helpful article title.\n\n
Requirements:\n
2 descriptive photos keywords to represent article title when put together side-by-side\n
No realtime information required\n
No guides and how tos\n
No punctuation in titles especially colons :\n
90-130 characters\n\n
return in following json format {\"main_keyword\":\"(Main Keyword)\",\"title\":\"(Title in 90-130 letters)\",\"short_title\":\"(Short Title in 30-40 letters)\",\"article_type\":\"(How-tos|Guides|Interview|Review|Commentary|Feature|News|Editorial|Report|Research|Case-study|Overview|Tutorial|Update|Spotlight|Insights)\",\"description\":\"(SEO description based on main keyword)\",\"photo_keywords\":[\"photo keyword 1\",\"photo keyword 2\"]}";
    $user_prompt = "Article Title: {$current_title}\n Article Description: {$supporting_data}\n";

    $reply = self::chatCompletion($system_prompt, $user_prompt, 'gpt-3.5-turbo');

    // Guard against a missing or non-string reply before decoding; passing
    // null to json_decode() is deprecated and other types raise a TypeError.
    if (! is_string($reply) || $reply === '') {
        return null;
    }

    // json_decode() reports malformed JSON by returning null rather than by
    // throwing, so a try/catch around it would never fire.
    $decoded = json_decode($reply, false);

    // Callers read properties such as ->title, so anything other than a JSON
    // object (scalar, list, null) is treated as a failed generation.
    return is_object($decoded) ? $decoded : null;
}
public static function suggestArticleTitles($current_title, $supporting_data, $suggestion_counts)
{
$system_prompt = "Based on provided article title, identify the main keyword in 1-2 words. Once identified, use the main keyword only to generate {$suggestion_counts} easy-to-read unique, helpful title articles.\n\n

View File

@@ -14,43 +14,52 @@ class GenerateArticleTask
public static function handle(SerpUrl $serp_url)
{
$ai_titles = OpenAI::suggestArticleTitles($serp_url->title, $serp_url->description, 1);
// $ai_titles = OpenAI::suggestArticleTitles($serp_url->title, $serp_url->description, 1);
if (is_null($ai_titles)) {
return self::saveAndReturnSerpProcessStatus($serp_url, -2);
// if (is_null($ai_titles)) {
// return self::saveAndReturnSerpProcessStatus($serp_url, -2);
// }
// $suggestion = null;
// // dump($ai_titles);
// try {
// $random_key = array_rand($ai_titles?->suggestions, 1);
// $suggestion = $ai_titles->suggestions[$random_key];
// } catch (Exception $e) {
// return self::saveAndReturnSerpProcessStatus($serp_url, -1);
// }
// if (is_null($suggestion)) {
// return self::saveAndReturnSerpProcessStatus($serp_url, -3);
// }
$ai_suggestion = OpenAI::createNewArticleTitle($serp_url->title, $serp_url->description);
$readability_content = ScrapeUrlBodyTask::handle($serp_url->url);
if (is_null($readability_content))
{
return self::saveAndReturnSerpProcessStatus($serp_url, -7);
}
$suggestion = null;
// dump($ai_titles);
try {
$random_key = array_rand($ai_titles?->suggestions, 1);
$suggestion = $ai_titles->suggestions[$random_key];
} catch (Exception $e) {
return self::saveAndReturnSerpProcessStatus($serp_url, -1);
}
if (is_null($suggestion)) {
return self::saveAndReturnSerpProcessStatus($serp_url, -3);
}
$markdown = OpenAI::writeArticle($suggestion->title, $suggestion->description, $suggestion->article_type, 500, 800);
$markdown = OpenAI::writeArticle($ai_suggestion->title, $readability_content, $ai_suggestion->article_type ,500, 800);
if (is_empty($markdown)) {
return self::saveAndReturnSerpProcessStatus($serp_url, -4);
}
$post = new Post;
$post->title = $suggestion->title;
$post->type = $suggestion->article_type;
$post->short_title = $suggestion->short_title;
$post->main_keyword = $ai_titles->main_keyword;
$post->keywords = $suggestion->photo_keywords;
$post->slug = str_slug($suggestion->title);
$post->excerpt = $suggestion->description;
$post->title = $ai_suggestion->title;
$post->type = $ai_suggestion->article_type;
$post->short_title = $ai_suggestion->short_title;
$post->main_keyword = $ai_suggestion->main_keyword;
$post->keywords = $ai_suggestion->photo_keywords;
$post->slug = str_slug($ai_suggestion->title);
$post->excerpt = $ai_suggestion->description;
$post->author_id = Author::find(1)->id;
$post->featured = false;
$post->featured_image = null;

View File

@@ -0,0 +1,73 @@
<?php
namespace App\Jobs\Tasks;
use App\Helpers\FirstParty\OSSUploader\OSSUploader;
use \Illuminate\Support\Facades\Http;
use Carbon\Carbon;
use Storage;
use Exception;
use andreskrey\Readability\Readability;
use andreskrey\Readability\Configuration;
use andreskrey\Readability\ParseException;
/**
 * Downloads a page's HTML (reusing a copy cached on the "r2" disk when one
 * exists) and reduces it to plain text with Readability.
 */
class ScrapeUrlBodyTask
{
    /**
     * Return the readable text content of the page at $url.
     *
     * Steps:
     *  1. Look for a previously stored copy under /scraped/<slug>.html on "r2".
     *  2. Otherwise fetch the URL over HTTP and, on a successful response,
     *     store the body under that same path.
     *  3. Run Readability over the HTML and strip the tags from its content.
     *
     * @param  string  $url  Page to scrape.
     * @return mixed  Tag-stripped Readability content; null when no HTML could
     *                be obtained; the unparsed HTML when Readability raises a
     *                ParseException.
     */
    public static function handle(string $url)
    {
        $filename = str_slug($url).'.html';

        $html_content = self::readCachedHtml($filename);

        if (is_null($html_content)) {
            $html_content = self::fetchAndCacheHtml($url, $filename);
        }

        // Nothing cached and the fetch was not successful: there is nothing
        // to parse, so report the failure instead of handing null to the parser.
        if (is_null($html_content)) {
            return null;
        }

        $readability = new Readability(new Configuration());

        try {
            $readability->parse($html_content);
            $html_content = strip_tags($readability->getContent());
        } catch (ParseException $e) {
            // NOTE(review): on a parse failure the raw HTML is returned
            // unchanged (existing behaviour). Confirm callers really want
            // markup here rather than null.
        }

        return $html_content;
    }

    /**
     * Read the stored copy of a scraped page, or null when it cannot be read.
     */
    private static function readCachedHtml(string $filename)
    {
        try {
            return OSSUploader::readFile('r2', '/scraped/', $filename);
        } catch (Exception $e) {
            return null;
        }
    }

    /**
     * Fetch $url over HTTP and store a successful body on the "r2" disk.
     *
     * @return string|null  Response body, or null when the response was not successful.
     */
    private static function fetchAndCacheHtml(string $url, string $filename): ?string
    {
        $response = Http::withOptions(self::proxyOptions())->get($url);

        if (! $response->successful()) {
            return null;
        }

        $html = $response->body();
        OSSUploader::uploadFile('r2', '/scraped/', $filename, $html);

        return $html;
    }

    /**
     * Build the HTTP client options for the outbound proxy.
     *
     * Proxy credentials must not live in source control. They are read from
     * config('services.smartproxy.host' / '.username' / '.password'), which
     * should be backed by environment variables. When no host is configured
     * the request is sent without a proxy.
     *
     * @return array<string, string>
     */
    private static function proxyOptions(): array
    {
        $host = config('services.smartproxy.host');

        if (empty($host)) {
            return [];
        }

        $user = config('services.smartproxy.username');
        $password = config('services.smartproxy.password');

        // Percent-encode the credentials so reserved characters cannot break the proxy URL.
        $credentials = empty($user)
            ? ''
            : rawurlencode((string) $user).':'.rawurlencode((string) $password).'@';

        return ['proxy' => "http://{$credentials}{$host}"];
    }
}

View File

@@ -11,6 +11,7 @@
"php": "^8.1",
"artesaos/seotools": "^1.2",
"dipeshsukhia/laravel-html-minify": "^3.3",
"fivefilters/readability.php": "^1.0",
"graham-campbell/markdown": "^15.0",
"guzzlehttp/guzzle": "^7.2",
"intervention/image": "^2.7",

53
composer.lock generated
View File

@@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically"
],
"content-hash": "6befdc23980f16c166f9c783aa9cabf5",
"content-hash": "b6f2e806cb687a1b5a4a3872968a2157",
"packages": [
{
"name": "artesaos/seotools",
@@ -720,6 +720,57 @@
],
"time": "2023-01-14T14:17:03+00:00"
},
{
"name": "fivefilters/readability.php",
"version": "v1.0.0",
"source": {
"type": "git",
"url": "https://github.com/fivefilters/readability.php.git",
"reference": "0d02e2916d1659bb79426969c5d48848ff402598"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/fivefilters/readability.php/zipball/0d02e2916d1659bb79426969c5d48848ff402598",
"reference": "0d02e2916d1659bb79426969c5d48848ff402598",
"shasum": ""
},
"require": {
"ext-dom": "*",
"ext-mbstring": "*",
"ext-xml": "*",
"php": ">=5.6.0"
},
"require-dev": {
"phpunit/phpunit": "^5.7"
},
"type": "library",
"autoload": {
"psr-4": {
"andreskrey\\Readability\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"Apache-2.0"
],
"authors": [
{
"name": "Andres Rey",
"email": "andreskrey@gmail.com",
"role": "Lead Developer"
}
],
"description": "A PHP port of Readability.js",
"homepage": "https://github.com/andreskrey/readability",
"keywords": [
"html",
"readability"
],
"support": {
"source": "https://github.com/fivefilters/readability.php/tree/v1.0.0"
},
"time": "2017-12-03T12:24:28+00:00"
},
{
"name": "fruitcake/php-cors",
"version": "v1.2.0",

View File

@@ -5,10 +5,12 @@
use App\Jobs\GenerateArticleJob;
use App\Jobs\Tasks\GetNewsSerpTask;
use App\Jobs\Tasks\ParseNewsSerpDomainsTask;
use App\Jobs\Tasks\ScrapeUrlBodyTask;
use App\Models\Category;
use App\Models\NewsSerpResult;
use App\Models\Post;
use App\Models\SerpUrl;
use Illuminate\Support\Facades\Request;
use Illuminate\Support\Facades\Route;
/*
@@ -22,40 +24,82 @@
|
*/
Route::get('/news_serp', function () {
$category = Category::find(2);
Route::get('/step-1', function (Request $request) {
$category = Category::find($request->input('id'));
$news_serp_result = GetNewsSerpTask::handle($category, 'US');
dd($news_serp_result);
dd($news_serp_result->id);
});
Route::get('/news_serp_parse', function () {
$news_serp_result = NewsSerpResult::find(3);
$serp_urls = ParseNewsSerpDomainsTask::handle($news_serp_result);
dd($serp_urls);
// Step 2: parse the domains of the NewsSerpResult given by ?id= and dump the
// id of the most recent SerpUrl when the parse task reports success.
// The closure type-hints the HTTP request class by its fully-qualified name:
// the `Request` imported in this file is the facade, which has no instance
// input() method.
Route::get('/step-2', function (\Illuminate\Http\Request $request) {
    $news_serp_result = NewsSerpResult::find($request->input('id', null));

    if (is_null($news_serp_result)) {
        abort(404);
    }

    $task = ParseNewsSerpDomainsTask::handle($news_serp_result);

    if ($task) {
        $serp_url = SerpUrl::latest()->first();

        // first() yields null when the table is empty; dump null instead of crashing.
        dd($serp_url?->id);
    }
});
Route::get('/write_article', function () {
$serp_url = SerpUrl::find(2);
// Step 3: queue article generation for the SerpUrl given by ?id=.
// The closure type-hints the HTTP request class by its fully-qualified name:
// the `Request` imported in this file is the facade, which has no instance
// input() method.
Route::get('/step-3', function (\Illuminate\Http\Request $request) {
    $serp_url = SerpUrl::find($request->input('id', null));

    if (is_null($serp_url)) {
        abort(404);
    }

    $task = GenerateArticleJob::dispatch($serp_url)->onQueue('default')->onConnection('default');

    dd($task);
});
Route::get('/gen_article_image', function () {
// Step 4: queue featured-image generation for the first draft post that has
// no featured image yet.
Route::get('/step-4', function () {
    $post = Post::whereNull('featured_image')->where('status', 'draft')->first();

    // No matching draft: respond 404 like the other step routes rather than
    // dispatching the job with null.
    if (is_null($post)) {
        abort(404);
    }

    $task = GenerateArticleFeaturedImageJob::dispatch($post)->onQueue('default')->onConnection('default');

    dd($task);
});
Route::get('/suggest_titles', function () {
$results = OpenAI::suggestArticleTitles("It's 2019s Electric: How Fisker Is Reinventing The Automotive Industry And \nExpanding Its Business", "Fisker's approach to building electric vehicles is deeply intertwined with \nits overall business philosophy: use less, use better,...s", 1);
dd($results);
// Step 5: set published_at to the current time on the Post given by ?id= and
// dump the result of save().
// The closure type-hints the HTTP request class by its fully-qualified name:
// the `Request` imported in this file is the facade, which has no instance
// input() method.
Route::get('/step-5', function (\Illuminate\Http\Request $request) {
    $post = Post::find($request->input('id'));

    if (is_null($post)) {
        abort(404);
    }

    $post->published_at = now();

    dd($post->save());
});
Route::get('/write_article_raw', function () {
$results = OpenAI::writeArticle("Fisker's Vision for the Future of Electric Cars", "Explore Fisker's innovative vision for the future of electric cars and its impact on the automotive industry.", 'Article', 500, 800);
dd($results);
// Route::get('/suggest_titles', function () {
// $results = OpenAI::suggestArticleTitles("It's 2019s Electric: How Fisker Is Reinventing The Automotive Industry And \nExpanding Its Business", "Fisker's approach to building electric vehicles is deeply intertwined with \nits overall business philosophy: use less, use better,...s", 1);
// dd($results);
// });
// Route::get('/write_article_raw', function () {
// $results = OpenAI::writeArticle("Fisker's Vision for the Future of Electric Cars", "Explore Fisker's innovative vision for the future of electric cars and its impact on the automotive industry.", 'Article', 500, 800);
// dd($results);
// });
// Debug route: run the scraper task against a fixed article URL and dump
// whatever it returns.
Route::get('proxy_test', function () {
    $target = 'https://www.cnbc.com/2023/09/24/this-southern-city-is-the-no-1-place-to-start-your-own-business.html';

    dd(ScrapeUrlBodyTask::handle($target));
});
// Route::get('/image_gen', function() {