Sync

2023-11-26 18:56:40 +08:00
parent be14f5fdb1
commit 64431e7a73
144 changed files with 497072 additions and 3730 deletions
--- a/app/Jobs/Tasks/GetUrlBodyTask.php
+++ b/app/Jobs/Tasks/GetUrlBodyTask.php
@@ -0,0 +1,204 @@
+<?php
+
+namespace App\Jobs\Tasks;
+
+use App\Jobs\ParseUrlBodyJob;
+use App\Models\ServiceCostUsage;
+use App\Models\UrlToCrawl;
+use Exception;
+use fivefilters\Readability\Configuration as ReadabilityConfiguration;
+use fivefilters\Readability\ParseException as ReadabilityParseException;
+use fivefilters\Readability\Readability;
+use Illuminate\Support\Facades\Http;
+use League\HTMLToMarkdown\HtmlConverter;
+use Symfony\Component\DomCrawler\Crawler;
+
+class GetUrlBodyTask
+{
+    /**
+     * Download the page behind a UrlToCrawl record through the rotating proxy,
+     * store its converted text on the record and queue the parsing job.
+     *
+     * Any Exception raised while fetching the page, recording the proxy cost
+     * or converting the HTML is logged; the record is then flagged as
+     * "blocked" with the placeholder output "EMPTY CONTENT".
+     *
+     * @param  int  $url_to_crawl_id  Primary key of the UrlToCrawl row to process.
+     * @return null  Nothing meaningful is returned; null when the record does not exist.
+     */
+    public static function handle(int $url_to_crawl_id)
+    {
+        $url_to_crawl = UrlToCrawl::find($url_to_crawl_id);
+
+        if (is_null($url_to_crawl)) {
+            return null;
+        }
+
+        // NOTE(review): is_crawling is raised here but never lowered in this
+        // task - confirm whether another job is expected to reset it.
+        $url_to_crawl->is_crawling = true;
+        $url_to_crawl->save();
+        $url_to_crawl->refresh();
+
+        // Both stay null unless the whole fetch/convert/record sequence succeeds.
+        $raw_html = null;
+        $markdown = null;
+
+        try {
+            $user_agent = config('platform.proxy.user_agent');
+
+            $response = Http::withHeaders([
+                'User-Agent' => $user_agent,
+            ])
+                ->withOptions([
+                    'proxy' => get_smartproxy_rotating_server(),
+                    'timeout' => 10,
+                    // NOTE(review): TLS certificate verification is switched off
+                    // for this request - confirm this is intentional.
+                    'verify' => false,
+                ])
+                ->get($url_to_crawl->url);
+
+            if ($response->successful()) {
+                $body = $response->body();
+
+                // Proxy traffic is billed on the body size in kilobytes (bytes, hence strlen).
+                $cost = calculate_smartproxy_cost(round(strlen($body) / 1024, 2), 'rotating_global');
+
+                // Convert once and reuse the result for both the cost log and the record.
+                $converted = self::getMarkdownFromHtml($body);
+
+                $service_cost_usage = new ServiceCostUsage;
+                $service_cost_usage->cost = $cost;
+                $service_cost_usage->name = 'smartproxy-GetUrlBodyTask';
+                $service_cost_usage->reference_1 = 'url_to_crawl';
+                $service_cost_usage->reference_2 = strval($url_to_crawl_id);
+                $service_cost_usage->output = $converted;
+                $service_cost_usage->save();
+
+                $raw_html = $body;
+                $markdown = $converted;
+            } else {
+                // Turns client/server error responses into an exception handled below.
+                $response->throw();
+            }
+        } catch (Exception $e) {
+            // Best effort: a failed crawl must not crash the worker, but it
+            // should leave a trace instead of disappearing silently.
+            logger()->warning('GetUrlBodyTask: unable to fetch URL body', [
+                'url_to_crawl_id' => $url_to_crawl_id,
+                'exception' => $e->getMessage(),
+            ]);
+        }
+
+        // NOTE(review): is_empty() is not a PHP built-in; presumably a project
+        // helper - verify its semantics.
+        if (! is_empty($raw_html)) {
+            $url_to_crawl->output_type = 'markdown';
+            $url_to_crawl->output = $markdown;
+        } else {
+            $url_to_crawl->output = 'EMPTY CONTENT';
+            $url_to_crawl->status = 'blocked';
+        }
+
+        $url_to_crawl->is_crawled = true;
+
+        // Only records that were saved and are neither blocked nor trashed get parsed.
+        if ($url_to_crawl->save() && ! in_array($url_to_crawl->status, ['blocked', 'trashed'], true)) {
+            ParseUrlBodyJob::dispatch($url_to_crawl->id)->onQueue('default')->onConnection('default');
+        }
+    }
+
+    /**
+     * Ask Readability for the main image of an HTML document.
+     *
+     * @param  mixed  $html  HTML markup handed to Readability::parse().
+     * @return mixed  The value of Readability::getImage(), or null when
+     *                Readability raises a ParseException.
+     */
+    private static function getMainImageFromHtml($html)
+    {
+        $options = new ReadabilityConfiguration();
+        $options->setCharThreshold(20);
+
+        $parser = new Readability($options);
+
+        try {
+            $parser->parse($html);
+            $image = $parser->getImage();
+        } catch (ReadabilityParseException $e) {
+            // An unparseable document simply yields no image.
+            $image = null;
+        }
+
+        return $image;
+    }
+
+    /**
+     * Turn an HTML document into text: clean the markup, convert it to
+     * markdown, tidy the result, then pass it through markdown_to_plaintext()
+     * and decode HTML entities.
+     *
+     * @param  mixed  $html  HTML markup to convert.
+     * @return string  Entity-decoded output of markdown_to_plaintext().
+     */
+    private static function getMarkdownFromHtml($html)
+    {
+        $options = [
+            'strip_tags' => true,
+            'strip_placeholder_links' => true,
+        ];
+
+        // The converter is built before the HTML is cleaned, then fed the cleaned markup.
+        $text = (new HtmlConverter($options))->convert(self::cleanHtml($html));
+
+        // Post-processing pipeline: unescape angle brackets, normalise line
+        // breaks, drop repeated lines.
+        $text = self::reverseLTGT($text);
+        $text = self::normalizeNewLines($text);
+        $text = self::removeDuplicateLines($text);
+
+        $plain = markdown_to_plaintext($text);
+
+        return html_entity_decode($plain);
+    }
+
+    /**
+     * Turn the escaped entities "&lt;" and "&gt;" back into literal angle brackets.
+     *
+     * @param  string  $input  Text that may contain the two entities.
+     * @return string  The text with "&lt;" replaced first, then "&gt;".
+     */
+    private static function reverseLTGT($input)
+    {
+        // Array form applies the replacements in order, one after the other.
+        return str_replace(['&lt;', '&gt;'], ['<', '>'], $input);
+    }
+
+    /**
+     * Keep only the first occurrence of every line in a "\n"-separated text.
+     *
+     * @param  string  $string  Text whose lines are separated by "\n".
+     * @return string  The surviving lines, in their original order, joined by "\n".
+     */
+    private static function removeDuplicateLines($string)
+    {
+        $seen = [];
+        $kept = [];
+
+        foreach (explode("\n", $string) as $line) {
+            // Lines already emitted once are skipped wherever they reappear.
+            if (isset($seen[$line])) {
+                continue;
+            }
+
+            $seen[$line] = true;
+            $kept[] = $line;
+        }
+
+        return implode("\n", $kept);
+    }
+
+    /**
+     * Tidy converted markdown: trim every line, drop blank lines, glue image
+     * markup to the caption line that follows it and turn runs of spaces into
+     * a " - " separator.
+     *
+     * @param  string  $content  Markdown text using "\n" line breaks.
+     * @return string|null  The normalised text (null only if a PCRE call fails).
+     */
+    private static function normalizeNewLines($content)
+    {
+        // Split the content by lines
+        $lines = explode("\n", $content);
+        $lineCount = count($lines);
+
+        $processedLines = [];
+
+        for ($i = 0; $i < $lineCount; $i++) {
+            $line = trim($lines[$i]);
+
+            // An image line absorbs the next line as its caption, unless that
+            // line is blank or consists solely of markdown rule/heading marks.
+            if ($i + 1 < $lineCount && preg_match("/^!\[.*\]\(.*\)$/", $line)) {
+                $next = trim($lines[$i + 1]);
+
+                if ($next !== '' && ! preg_match('/^[-=#*&_]+$/', $next)) {
+                    $line .= ' '.$next;
+                    $i++; // Skip the next line as it has just been merged
+                }
+            }
+
+            // Compare against '' instead of calling empty(): empty('0') is
+            // true and would discard a line whose whole content is "0".
+            if ($line !== '') {
+                $processedLines[] = $line;
+            }
+        }
+
+        // Collapse excessive newlines
+        $result = preg_replace("/\n{3,}/", "\n\n", implode("\n", $processedLines));
+
+        // Pull text that still sits on the line after an image up onto the image line
+        $result = preg_replace('/^(!\[.*?\]\(.*?\))\s*\n\s*([^\n!]+)/m', '$1 $2', $result);
+
+        // Replace multiple spaces with a dash separator
+        $result = preg_replace('/ {2,}/', ' - ', $result);
+
+        return $result;
+    }
+
+    /**
+     * Strip non-content elements from an HTML document and flatten <span>
+     * wrappers into plain text nodes before the markdown conversion.
+     *
+     * @param  string  $htmlContent  Raw HTML markup.
+     * @return string  Result of Crawler::outerHtml() after the clean-up.
+     */
+    private static function cleanHtml($htmlContent)
+    {
+        $crawler = new Crawler($htmlContent);
+
+        // Define tags to remove completely
+        $tagsToRemove = ['script', 'style', 'svg', 'picture', 'form', 'footer', 'nav', 'aside'];
+
+        foreach ($tagsToRemove as $tag) {
+            // Iterating a Crawler yields the matched DOM nodes themselves.
+            foreach ($crawler->filter($tag) as $element) {
+                if ($element->parentNode !== null) {
+                    $element->parentNode->removeChild($element);
+                }
+            }
+        }
+
+        // Replace <span> tags with their inner text
+        foreach ($crawler->filter('span') as $span) {
+            if ($span->parentNode === null) {
+                continue;
+            }
+
+            // Use the raw textContent rather than Crawler::text(): text()
+            // normalises and trims whitespace by default, which glues
+            // neighbouring words together ("foo<span> bar</span>" => "foobar").
+            $span->parentNode->replaceChild(new \DOMText($span->textContent), $span);
+        }
+
+        return $crawler->outerHtml();
+    }
+}