<?php

namespace App\Jobs\Tasks;

use App\Jobs\ParseUrlBodyJob;
use App\Models\ServiceCostUsage;
use App\Models\UrlToCrawl;
use Exception;
use fivefilters\Readability\Configuration as ReadabilityConfiguration;
use fivefilters\Readability\ParseException as ReadabilityParseException;
use fivefilters\Readability\Readability;
use Illuminate\Support\Facades\Http;
use League\HTMLToMarkdown\HtmlConverter;
use Symfony\Component\DomCrawler\Crawler;
class GetUrlBodyTask
{
    /**
     * Fetch the body of a UrlToCrawl row, convert it to plain-text
     * markdown, persist the result and, on success, dispatch
     * ParseUrlBodyJob for downstream parsing.
     *
     * Rows that are missing, 'blocked' or 'trashed' are skipped. When the
     * fetch fails (HTTP error, timeout, DNS, …) the row is still finalized:
     * output becomes 'EMPTY CONTENT', status becomes 'blocked', and no
     * follow-up job is dispatched.
     *
     * @param int $url_to_crawl_id primary key of the UrlToCrawl row
     * @return void
     */
    public static function handle(int $url_to_crawl_id)
    {
        $url_to_crawl = UrlToCrawl::find($url_to_crawl_id);

        if (is_null($url_to_crawl)) {
            return;
        }

        // Never re-crawl rows a previous run (or an operator) ruled out.
        if (in_array($url_to_crawl->status, ['blocked', 'trashed'], true)) {
            return;
        }

        // Proxy routing is currently disabled; flip to true to fetch via
        // Smartproxy and record the bandwidth cost below.
        $enable_proxy = false;

        // Mark the row busy before the (slow) HTTP request starts.
        $url_to_crawl->is_crawling = true;
        $url_to_crawl->save();
        $url_to_crawl->refresh();

        // NOTE(review): is_crawling is never reset to false in this task —
        // presumably a model observer or the parse job clears it; confirm.
        try {
            $user_agent = config('platform.proxy.user_agent');

            $response = Http::withHeaders([
                'User-Agent' => $user_agent,
            ])
                ->withOptions([
                    'proxy' => $enable_proxy ? get_smartproxy_rotating_server() : null,
                    'timeout' => 10,
                    // Some crawl targets ship broken certificate chains.
                    'verify' => false,
                ])
                ->get($url_to_crawl->url);

            if ($response->successful()) {
                $raw_html = $response->body();

                if ($enable_proxy) {
                    // Smartproxy bills per KiB of traffic; log this fetch's cost.
                    $cost = calculate_smartproxy_cost(round(strlen($raw_html) / 1024, 2), 'rotating_global');

                    $service_cost_usage = new ServiceCostUsage;
                    $service_cost_usage->cost = $cost;
                    $service_cost_usage->name = 'smartproxy-GetUrlBodyTask';
                    $service_cost_usage->reference_1 = 'url_to_crawl';
                    $service_cost_usage->reference_2 = strval($url_to_crawl_id);
                    $service_cost_usage->output = self::getMarkdownFromHtml($raw_html);
                    $service_cost_usage->save();
                }
            } else {
                // Non-2xx response: fall through with no body; the row is
                // blocked below instead of throwing out of the task.
                $raw_html = null;
            }
        } catch (Exception $e) {
            // A failed fetch must not leave the row stuck with
            // is_crawling = true and crawl_counts unchanged; swallow the
            // error and finalize (and block) the row below.
            $raw_html = null;
        }

        $markdown_output = self::getMarkdownFromHtml($raw_html);

        if (! is_empty($markdown_output)) {
            $url_to_crawl->output_type = 'markdown';
            $url_to_crawl->output = $markdown_output;
        } else {
            // Nothing usable came back: park the row so it is not retried.
            $url_to_crawl->output = 'EMPTY CONTENT';
            $url_to_crawl->status = 'blocked';
        }

        $url_to_crawl->is_crawled = true;
        $url_to_crawl->crawl_counts = $url_to_crawl->crawl_counts + 1;

        if ($url_to_crawl->save()) {
            if (! in_array($url_to_crawl->status, ['blocked', 'trashed'], true)) {
                ParseUrlBodyJob::dispatch($url_to_crawl->id)->onQueue('default')->onConnection('default');
            }
        }
    }

    /**
     * Extract the lead image URL of a page via Readability.
     *
     * Currently unreferenced within this class; kept as-is.
     *
     * @param string $html raw HTML document
     * @return string|null image URL, or null when parsing fails
     */
    private static function getMainImageFromHtml($html)
    {
        $r_configuration = new ReadabilityConfiguration();
        $r_configuration->setCharThreshold(20);

        $readability = new Readability($r_configuration);

        try {
            $readability->parse($html);

            return $readability->getImage();
        } catch (ReadabilityParseException $e) {
            // Unparseable documents simply yield no image.
        }

        return null;
    }

    /**
     * Convert raw HTML into cleaned, de-duplicated plain-text markdown.
     *
     * @param string|null $html raw HTML, or null when the fetch failed
     * @return string plain text; empty string for empty/null input
     */
    private static function getMarkdownFromHtml($html)
    {
        // Guard: an empty/failed fetch must not reach the DOM crawler —
        // Crawler::outerHtml() throws on an empty node list.
        if (is_null($html) || trim($html) === '') {
            return '';
        }

        $converter = new HtmlConverter([
            'strip_tags' => true,
            'strip_placeholder_links' => true,
        ]);

        $html = self::cleanHtml($html);

        $markdown = $converter->convert($html);

        $markdown = self::reverseLTGT($markdown);
        $markdown = self::normalizeNewLines($markdown);
        $markdown = self::removeDuplicateLines($markdown);

        return html_entity_decode(markdown_to_plaintext($markdown));
    }

    /**
     * Turn HTML-escaped angle brackets back into literal characters.
     *
     * NOTE(review): the previous revision replaced '<' with '<' (a no-op);
     * the entity names had evidently been decoded somewhere along the way.
     * Restored per the function's stated purpose.
     *
     * @param string $input
     * @return string
     */
    private static function reverseLTGT($input)
    {
        return str_replace(['&lt;', '&gt;'], ['<', '>'], $input);
    }

    /**
     * Drop every repeated line, keeping only its first occurrence.
     * Crude de-boilerplating: repeated nav/footer lines collapse to one.
     *
     * @param string $string newline-separated text
     * @return string
     */
    private static function removeDuplicateLines($string)
    {
        $lines = explode("\n", $string);
        $uniqueLines = array_unique($lines);

        return implode("\n", $uniqueLines);
    }

    /**
     * Tidy markdown line structure: drop blank lines, glue an image line
     * to its caption on the following line, collapse excess newlines, and
     * turn runs of spaces into ' - ' separators.
     *
     * @param string $content markdown text
     * @return string
     */
    private static function normalizeNewLines($content)
    {
        // Split the content by lines
        $lines = explode("\n", $content);

        $processedLines = [];

        for ($i = 0; $i < count($lines); $i++) {
            $line = trim($lines[$i]);

            // If the line is an image markdown
            if (preg_match("/^!\[.*\]\(.*\)$/", $line)) {
                // And if the next line is not empty and not another markdown structure
                if (isset($lines[$i + 1]) && ! empty(trim($lines[$i + 1])) && ! preg_match('/^[-=#*&_]+$/', trim($lines[$i + 1]))) {
                    $line .= ' '.trim($lines[$i + 1]);
                    $i++; // Skip the next line as we're merging it
                }
            }

            // Add line to processedLines if it's not empty
            if (! empty($line)) {
                $processedLines[] = $line;
            }
        }

        // Collapse excessive newlines
        $result = preg_replace("/\n{3,}/", "\n\n", implode("\n", $processedLines));

        // Detect and replace the image-then-caption pattern that survived
        // the loop above (multiline mode, per paragraph).
        $result = preg_replace('/^(!\[.*?\]\(.*?\))\s*\n\s*([^\n!]+)/m', '$1 $2', $result);

        // Replace multiple spaces with a dash separator
        $result = preg_replace('/ {2,}/', ' - ', $result);

        return $result;
    }

    /**
     * Strip non-content markup before markdown conversion: removes
     * script/style/svg/picture/form/footer/nav/aside subtrees and flattens
     * <span> elements to their text content.
     *
     * @param string $htmlContent raw HTML document
     * @return string the cleaned document as HTML
     */
    private static function cleanHtml($htmlContent)
    {
        $crawler = new Crawler($htmlContent);

        // Subtrees that never contain article text.
        $tagsToRemove = ['script', 'style', 'svg', 'picture', 'form', 'footer', 'nav', 'aside'];

        foreach ($tagsToRemove as $tag) {
            $crawler->filter($tag)->each(function ($node) {
                foreach ($node as $child) {
                    $child->parentNode->removeChild($child);
                }
            });
        }

        // Replace <span> tags with their inner text so inline styling
        // wrappers do not survive into the markdown.
        $crawler->filter('span')->each(function ($node) {
            $replacement = new \DOMText($node->text());

            foreach ($node as $child) {
                $child->parentNode->replaceChild($replacement, $child);
            }
        });

        return $crawler->outerHtml();
    }
}
|