Files
aibuddytool/app/Jobs/Tasks/GetUrlBodyTask.php
2023-11-29 21:16:13 +08:00

216 lines
6.3 KiB
PHP

<?php
namespace App\Jobs\Tasks;
use App\Jobs\ParseUrlBodyJob;
use App\Models\ServiceCostUsage;
use App\Models\UrlToCrawl;
use Exception;
use fivefilters\Readability\Configuration as ReadabilityConfiguration;
use fivefilters\Readability\ParseException as ReadabilityParseException;
use fivefilters\Readability\Readability;
use Illuminate\Support\Facades\Http;
use League\HTMLToMarkdown\HtmlConverter;
use Symfony\Component\DomCrawler\Crawler;
/**
 * Downloads the page behind a UrlToCrawl record, reduces it to de-duplicated
 * plain text and stores the result on the record. When the record ends up with
 * usable content, ParseUrlBodyJob is dispatched for it.
 */
class GetUrlBodyTask
{
    /**
     * Statuses for which a URL is neither fetched nor handed to ParseUrlBodyJob.
     */
    private const SKIPPED_STATUSES = ['blocked', 'trashed'];

    /**
     * Tags that are removed from the page (together with their content)
     * before the HTML is converted to markdown.
     */
    private const REMOVED_TAGS = ['script', 'style', 'svg', 'picture', 'form', 'footer', 'nav', 'aside'];

    /**
     * Fetch, convert and persist the body of the given URL record.
     *
     * Does nothing when the record does not exist or is already blocked/trashed.
     * Anything raised by the HTTP client or by `$response->throw()` propagates
     * to the caller; nothing is caught here.
     *
     * @param  int  $url_to_crawl_id  Primary key of the UrlToCrawl record.
     */
    public static function handle(int $url_to_crawl_id): void
    {
        $url_to_crawl = UrlToCrawl::find($url_to_crawl_id);

        if (is_null($url_to_crawl)) {
            return;
        }

        if (in_array($url_to_crawl->status, self::SKIPPED_STATUSES, true)) {
            return;
        }

        // Proxying is hard-wired off; the proxy option and the cost
        // bookkeeping below only take effect when this flag is enabled.
        $enable_proxy = false;

        // NOTE(review): is_crawling is raised here and never lowered anywhere
        // in this class (including when the request throws) — confirm it is
        // reset elsewhere.
        $url_to_crawl->is_crawling = true;
        $url_to_crawl->save();
        $url_to_crawl->refresh();

        $response = Http::withHeaders([
            'User-Agent' => config('platform.proxy.user_agent'),
        ])
            ->withOptions([
                'proxy' => $enable_proxy ? get_smartproxy_rotating_server() : null,
                'timeout' => 10,
                // NOTE(review): TLS certificate verification is disabled for every crawled host.
                'verify' => false,
            ])
            ->get($url_to_crawl->url);

        if ($response->successful()) {
            $raw_html = $response->body();
        } else {
            $raw_html = null;
            $response->throw();
        }

        // Converted once and reused for both the cost record and the URL record.
        $markdown_output = self::getMarkdownFromHtml($raw_html);

        if ($enable_proxy && $raw_html !== null) {
            $service_cost_usage = new ServiceCostUsage;
            $service_cost_usage->cost = calculate_smartproxy_cost(
                round(strlen($raw_html) / 1024, 2),
                'rotating_global'
            );
            $service_cost_usage->name = 'smartproxy-GetUrlBodyTask';
            $service_cost_usage->reference_1 = 'url_to_crawl';
            $service_cost_usage->reference_2 = strval($url_to_crawl_id);
            $service_cost_usage->output = $markdown_output;
            $service_cost_usage->save();
        }

        // is_empty() is a helper defined outside this file.
        if (! is_empty($markdown_output)) {
            $url_to_crawl->output_type = 'markdown';
            $url_to_crawl->output = $markdown_output;
        } else {
            // Pages without usable text are marked blocked so they are skipped next time.
            $url_to_crawl->output = 'EMPTY CONTENT';
            $url_to_crawl->status = 'blocked';
        }

        $url_to_crawl->is_crawled = true;
        $url_to_crawl->crawl_counts = $url_to_crawl->crawl_counts + 1;

        if (! $url_to_crawl->save()) {
            return;
        }

        if (in_array($url_to_crawl->status, self::SKIPPED_STATUSES, true)) {
            return;
        }

        ParseUrlBodyJob::dispatch($url_to_crawl->id)->onQueue('default')->onConnection('default');
    }

    /**
     * Ask Readability for the main image of an HTML document.
     *
     * @param  mixed  $html  HTML handed to Readability::parse().
     * @return mixed  Whatever Readability::getImage() returns, or null when parsing fails.
     */
    private static function getMainImageFromHtml($html)
    {
        $configuration = new ReadabilityConfiguration();
        $configuration->setCharThreshold(20);

        $readability = new Readability($configuration);

        try {
            $readability->parse($html);

            return $readability->getImage();
        } catch (ReadabilityParseException $e) {
            // Best effort: an unparsable document simply has no main image.
            return null;
        }
    }

    /**
     * Turn raw HTML into de-duplicated plain text.
     *
     * The HTML is cleaned, converted to markdown, normalised, stripped of
     * repeated lines and finally passed through markdown_to_plaintext()
     * (a helper defined outside this file) and html_entity_decode().
     *
     * @param  string|null  $html  Raw page HTML; null or blank input yields ''.
     */
    private static function getMarkdownFromHtml(?string $html): string
    {
        // A blank document produces an empty Crawler whose outerHtml() throws,
        // so report "no content" up front and let the caller handle it.
        if ($html === null || trim($html) === '') {
            return '';
        }

        $clean_html = self::cleanHtml($html);

        if (trim($clean_html) === '') {
            return '';
        }

        $converter = new HtmlConverter([
            'strip_tags' => true,
            'strip_placeholder_links' => true,
        ]);

        $markdown = $converter->convert($clean_html);
        $markdown = self::reverseLTGT($markdown);
        $markdown = self::normalizeNewLines($markdown);
        $markdown = self::removeDuplicateLines($markdown);

        return html_entity_decode(markdown_to_plaintext($markdown));
    }

    /**
     * Replace the `&lt;` and `&gt;` entities with literal angle brackets.
     *
     * @param  string  $input
     */
    private static function reverseLTGT(string $input): string
    {
        return str_replace(['&lt;', '&gt;'], ['<', '>'], $input);
    }

    /**
     * Keep only the first occurrence of every line.
     *
     * De-duplication is global, not just for adjacent lines, so repeated
     * blank lines collapse into a single one as well.
     *
     * @param  string  $string
     */
    private static function removeDuplicateLines(string $string): string
    {
        return implode("\n", array_unique(explode("\n", $string)));
    }

    /**
     * Tidy up converted markdown line by line.
     *
     * - trims every line and drops the ones that end up empty;
     * - glues an image line (`![alt](src)`) to the line that follows it,
     *   unless that line is empty or made only of `-=#*&_` characters;
     * - turns runs of two or more spaces into ' - '.
     *
     * @param  string  $content
     */
    private static function normalizeNewLines(string $content): string
    {
        $lines = explode("\n", $content);
        $line_count = count($lines);
        $kept = [];

        for ($i = 0; $i < $line_count; $i++) {
            $line = trim($lines[$i]);

            $is_image = preg_match("/^!\[.*\]\(.*\)$/", $line) === 1;

            if ($is_image && $i + 1 < $line_count) {
                $next = trim($lines[$i + 1]);

                // Compared against '' rather than empty(): a line that is
                // just "0" is real content and must not be discarded.
                if ($next !== '' && preg_match('/^[-=#*&_]+$/', $next) !== 1) {
                    $line .= ' '.$next;
                    $i++; // the following line has been merged into this one
                }
            }

            if ($line !== '') {
                $kept[] = $line;
            }
        }

        $result = implode("\n", $kept);

        // preg_replace() returns null on a PCRE error; keep the previous text then.
        $result = preg_replace("/\n{3,}/", "\n\n", $result) ?? $result;

        // Join an image with the text line directly below it.
        $result = preg_replace('/^(!\[.*?\]\(.*?\))\s*\n\s*([^\n!]+)/m', '$1 $2', $result) ?? $result;

        // Runs of spaces become a dash separator.
        $result = preg_replace('/ {2,}/', ' - ', $result) ?? $result;

        return $result;
    }

    /**
     * Strip non-content markup from an HTML document.
     *
     * Removes the tags listed in REMOVED_TAGS entirely and replaces every
     * <span> with a text node holding the span's text.
     *
     * @param  string  $htmlContent
     * @return string  Outer HTML of the cleaned document, or '' when nothing could be parsed.
     */
    private static function cleanHtml(string $htmlContent): string
    {
        $crawler = new Crawler($htmlContent);

        // outerHtml() throws on an empty node list, so bail out early.
        if ($crawler->count() === 0) {
            return '';
        }

        foreach (self::REMOVED_TAGS as $tag) {
            $crawler->filter($tag)->each(static function ($matched) {
                foreach ($matched as $element) {
                    if ($element->parentNode !== null) {
                        $element->parentNode->removeChild($element);
                    }
                }
            });
        }

        $crawler->filter('span')->each(static function ($span) {
            $text_node = new \DOMText($span->text());

            foreach ($span as $element) {
                if ($element->parentNode !== null) {
                    $element->parentNode->replaceChild($text_node, $element);
                }
            }
        });

        return $crawler->outerHtml();
    }
}