Add (initial): futurewalker code
This commit is contained in:
207
app/Jobs/Tasks/CrawlUrlResearchTask.php
Normal file
207
app/Jobs/Tasks/CrawlUrlResearchTask.php
Normal file
@@ -0,0 +1,207 @@
|
||||
<?php
|
||||
|
||||
namespace App\Jobs\Tasks;
|
||||
|
||||
use App\Jobs\CrawlUrlResearchJob;
|
||||
use App\Jobs\WriteWithAIJob;
|
||||
use App\Models\SerpUrl;
|
||||
use App\Models\SerpUrlResearch;
|
||||
use Exception;
|
||||
use fivefilters\Readability\Configuration as ReadabilityConfiguration;
|
||||
use fivefilters\Readability\ParseException as ReadabilityParseException;
|
||||
use fivefilters\Readability\Readability;
|
||||
use Illuminate\Support\Facades\Http;
|
||||
use League\HTMLToMarkdown\HtmlConverter;
|
||||
use Symfony\Component\DomCrawler\Crawler;
|
||||
|
||||
/**
 * Crawls the URL attached to a SerpUrlResearch row and stores a plain-text
 * rendition of the page together with its main image.
 *
 * After saving, the task either dispatches WriteWithAIJob (once enough
 * researches of the same SerpUrl hold usable content) or dispatches
 * CrawlUrlResearchJob for the next research row whose content is still NULL.
 */
class CrawlUrlResearchTask
{
    /** Marker stored in `content` when a page could not be fetched. */
    private const EMPTY_CONTENT = 'EMPTY CONTENT';

    /** Number of researches with usable content required before writing starts. */
    private const REQUIRED_RESEARCHES = 3;

    /**
     * Fetch, convert and persist the page behind one research row, then
     * schedule the follow-up job.
     *
     * @param  int  $serp_url_research_id  Primary key of the SerpUrlResearch row.
     * @return null Always null; also returned early when the row does not exist.
     */
    public static function handle(int $serp_url_research_id)
    {
        $serp_url_research = SerpUrlResearch::find($serp_url_research_id);

        if (is_null($serp_url_research)) {
            return null;
        }

        $raw_html = null;

        // NOTE(review): $costs is filled below but never read or persisted in
        // this method; the helper call is kept in case it has side effects.
        $costs = [];

        try {
            $user_agent = config('platform.proxy.user_agent');

            $response = Http::withHeaders([
                'User-Agent' => $user_agent,
            ])
                ->withOptions([
                    'proxy' => get_smartproxy_rotating_server(),
                    'timeout' => 10,
                    // NOTE(review): TLS certificate verification is disabled
                    // for this request; confirm this is intentional.
                    'verify' => false,
                ])
                ->get($serp_url_research->url);

            if ($response->successful()) {
                $raw_html = $response->body();
                $costs['unblocker'] = calculate_smartproxy_cost(round(strlen($raw_html) / 1024, 2), 'rotating_global');
            } else {
                // Raises for 4xx/5xx responses; any other non-2xx status
                // simply leaves $raw_html as null.
                $response->throw();
            }
        } catch (Exception $e) {
            // Best effort: any failure while fetching is treated as "no
            // content" so the crawl chain below keeps going.
            $raw_html = null;
        }

        if (! is_empty($raw_html)) {
            $serp_url_research->content = self::getMarkdownFromHtml($raw_html);
            $serp_url_research->main_image = self::getMainImageFromHtml($raw_html);
        } else {
            $serp_url_research->content = self::EMPTY_CONTENT;
        }

        $serp_url_research->save();

        $completed_serp_url_researches_counts = SerpUrlResearch::where('serp_url_id', $serp_url_research->serp_url_id)
            ->where('content', '!=', self::EMPTY_CONTENT)
            ->whereNotNull('content')
            ->count();

        if ($completed_serp_url_researches_counts >= self::REQUIRED_RESEARCHES) {
            $serp_url = SerpUrl::find($serp_url_research->serp_url_id);

            if (! is_null($serp_url)) {
                $serp_url->crawled = true;
                $serp_url->save();

                WriteWithAIJob::dispatch($serp_url->id)->onQueue('default')->onConnection('default');
            }
        } else {
            // Not enough usable pages yet: continue with the next research
            // row of the same SerpUrl that has not been crawled.
            $next_serp_url_research = SerpUrlResearch::where('serp_url_id', $serp_url_research->serp_url_id)
                ->whereNull('content')
                ->first();

            if (! is_null($next_serp_url_research)) {
                CrawlUrlResearchJob::dispatch($next_serp_url_research->id)->onQueue('default')->onConnection('default');
            }
        }
    }

    /**
     * Ask Readability for the main image of the page.
     *
     * @param  string  $html  Raw page HTML.
     * @return string|null Whatever Readability::getImage() yields, or null
     *                     when the document cannot be parsed.
     */
    private static function getMainImageFromHtml(string $html)
    {
        $r_configuration = new ReadabilityConfiguration();
        $r_configuration->setCharThreshold(20);

        $readability = new Readability($r_configuration);

        try {
            $readability->parse($html);

            return $readability->getImage();
        } catch (ReadabilityParseException $e) {
            // Unparseable documents simply have no main image.
            return null;
        }
    }

    /**
     * Convert raw HTML into de-duplicated plain text.
     *
     * Pipeline: strip unwanted tags, convert to Markdown, restore escaped
     * angle brackets, normalise lines, drop repeated lines, then pass the
     * result through markdown_to_plaintext() and decode HTML entities.
     *
     * @param  string  $html  Raw page HTML.
     */
    private static function getMarkdownFromHtml(string $html): string
    {
        $converter = new HtmlConverter([
            'strip_tags' => true,
            'strip_placeholder_links' => true,
        ]);

        $markdown = $converter->convert(self::cleanHtml($html));
        $markdown = self::reverseLTGT($markdown);
        $markdown = self::normalizeNewLines($markdown);
        $markdown = self::removeDuplicateLines($markdown);

        return html_entity_decode(markdown_to_plaintext($markdown));
    }

    /**
     * Turn the `&lt;` / `&gt;` entities back into literal angle brackets.
     *
     * @param  string  $input  Text possibly containing escaped angle brackets.
     */
    private static function reverseLTGT(string $input): string
    {
        // The search strings must be the entities; replacing '<' with '<'
        // (and '>' with '>') would leave the text untouched.
        return str_replace(['&lt;', '&gt;'], ['<', '>'], $input);
    }

    /**
     * Keep only the first occurrence of every line, preserving order.
     *
     * @param  string  $string  Newline-separated text.
     */
    private static function removeDuplicateLines(string $string): string
    {
        return implode("\n", array_unique(explode("\n", $string)));
    }

    /**
     * Trim every line, drop blank lines, glue image markdown to the line
     * that follows it and replace runs of spaces with " - ".
     *
     * @param  string  $content  Markdown text.
     */
    private static function normalizeNewLines(string $content): string
    {
        $lines = array_map('trim', explode("\n", $content));
        $total = count($lines);
        $kept = [];

        for ($i = 0; $i < $total; $i++) {
            $line = $lines[$i];

            // An image line absorbs the following line unless that line is
            // blank or consists only of markdown rule/heading characters.
            if (preg_match('/^!\[.*\]\(.*\)$/', $line) === 1) {
                $next = $lines[$i + 1] ?? '';

                if ($next !== '' && preg_match('/^[-=#*&_]+$/', $next) !== 1) {
                    $line .= ' '.$next;
                    $i++; // The next line has been merged; skip it.
                }
            }

            // Compare against '' rather than using empty(): a line that is
            // just "0" is real content and must be kept.
            if ($line !== '') {
                $kept[] = $line;
            }
        }

        $text = implode("\n", $kept);

        // Second pass: join any image line still followed by a caption line.
        // preg_replace() yields null on a PCRE error; keep the text as-is then.
        $text = preg_replace('/^(!\[.*?\]\(.*?\))\s*\n\s*([^\n!]+)/m', '$1 $2', $text) ?? $text;

        // Replace multiple spaces with a dash separator.
        return preg_replace('/ {2,}/', ' - ', $text) ?? $text;
    }

    /**
     * Remove non-content elements and flatten <span> tags to their text.
     *
     * @param  string  $htmlContent  Raw page HTML.
     * @return string Outer HTML of the cleaned document.
     */
    private static function cleanHtml(string $htmlContent): string
    {
        $crawler = new Crawler($htmlContent);

        // Tags removed together with everything inside them.
        $tagsToRemove = ['script', 'style', 'svg', 'picture', 'form', 'footer', 'nav', 'aside'];

        foreach ($tagsToRemove as $tag) {
            $crawler->filter($tag)->each(static function (Crawler $node): void {
                foreach ($node as $element) {
                    $element->parentNode->removeChild($element);
                }
            });
        }

        // Replace each <span> with a text node holding its text content.
        $crawler->filter('span')->each(static function (Crawler $node): void {
            $replacement = new \DOMText($node->text());

            foreach ($node as $element) {
                $element->parentNode->replaceChild($replacement, $element);
            }
        });

        return $crawler->outerHtml();
    }
}
|
||||
Reference in New Issue
Block a user