Add (initial): futurewalker code

2023-11-20 00:15:18 +08:00
parent f8602cb456
commit 9ce3e5c82a
166 changed files with 15941 additions and 1072 deletions
--- a/app/Jobs/Tasks/CrawlUrlResearchTask.php
+++ b/app/Jobs/Tasks/CrawlUrlResearchTask.php
@@ -0,0 +1,207 @@
+<?php
+
+namespace App\Jobs\Tasks;
+
+use App\Jobs\CrawlUrlResearchJob;
+use App\Jobs\WriteWithAIJob;
+use App\Models\SerpUrl;
+use App\Models\SerpUrlResearch;
+use Exception;
+use fivefilters\Readability\Configuration as ReadabilityConfiguration;
+use fivefilters\Readability\ParseException as ReadabilityParseException;
+use fivefilters\Readability\Readability;
+use Illuminate\Support\Facades\Http;
+use League\HTMLToMarkdown\HtmlConverter;
+use Symfony\Component\DomCrawler\Crawler;
+
+class CrawlUrlResearchTask
+{
+    /** Sentinel stored in `content` when a page could not be fetched or produced no usable text. */
+    private const EMPTY_CONTENT = 'EMPTY CONTENT';
+
+    /** How many researches with real content are needed before the AI writing job is dispatched. */
+    private const REQUIRED_RESEARCH_COUNT = 3;
+
+    /**
+     * Crawl one SerpUrlResearch record, store its extracted text, and schedule the next step.
+     *
+     * The page is fetched through the rotating proxy, converted to plain text and saved on
+     * the record. When the page cannot be fetched, cannot be parsed, or yields only blank
+     * text, the record's content is set to the 'EMPTY CONTENT' sentinel instead. Afterwards:
+     *  - if at least three researches of the same SerpUrl hold real content, the SerpUrl is
+     *    flagged as crawled and WriteWithAIJob is dispatched;
+     *  - otherwise the next research of that SerpUrl whose content is still null (if any)
+     *    is dispatched as a CrawlUrlResearchJob.
+     *
+     * @param  int  $serp_url_research_id  Primary key of the SerpUrlResearch to crawl.
+     * @return null  Always null; a missing record is silently ignored.
+     */
+    public static function handle(int $serp_url_research_id)
+    {
+        $serp_url_research = SerpUrlResearch::find($serp_url_research_id);
+
+        if (is_null($serp_url_research)) {
+            return null;
+        }
+
+        $raw_html = self::fetchRawHtml($serp_url_research->url);
+        $content = null;
+
+        if (! is_empty($raw_html)) {
+            try {
+                $content = self::getMarkdownFromHtml($raw_html);
+            } catch (Exception $e) {
+                // A page the parsers choke on must not abort the job: that would leave this
+                // record's content null and stop the crawl chain. Report it and mark as empty.
+                report($e);
+                $content = null;
+            }
+        }
+
+        if (is_string($content) && trim($content) !== '') {
+            $serp_url_research->content = $content;
+
+            try {
+                $serp_url_research->main_image = self::getMainImageFromHtml($raw_html);
+            } catch (Exception $e) {
+                // The image is optional; keep the extracted text even if image detection fails.
+                report($e);
+            }
+        } else {
+            // Blank text is as useless as a failed fetch, so it must not count as completed.
+            $serp_url_research->content = self::EMPTY_CONTENT;
+        }
+
+        $serp_url_research->save();
+
+        $completed_serp_url_researches_counts = SerpUrlResearch::where('serp_url_id', $serp_url_research->serp_url_id)
+            ->where('content', '!=', self::EMPTY_CONTENT)
+            ->whereNotNull('content')
+            ->count();
+
+        if ($completed_serp_url_researches_counts >= self::REQUIRED_RESEARCH_COUNT) {
+            $serp_url = SerpUrl::find($serp_url_research->serp_url_id);
+
+            if (! is_null($serp_url)) {
+                $serp_url->crawled = true;
+                $serp_url->save();
+
+                WriteWithAIJob::dispatch($serp_url->id)->onQueue('default')->onConnection('default');
+            }
+
+            return null;
+        }
+
+        // Not enough material yet: continue with the next research that has not been crawled.
+        // NOTE(review): when none is left and the threshold was not reached, nothing further
+        // happens for this SerpUrl — confirm that is intended.
+        $next_serp_url_research = SerpUrlResearch::where('serp_url_id', $serp_url_research->serp_url_id)
+            ->whereNull('content')
+            ->first();
+
+        if (! is_null($next_serp_url_research)) {
+            CrawlUrlResearchJob::dispatch($next_serp_url_research->id)->onQueue('default')->onConnection('default');
+        }
+
+        return null;
+    }
+
+    /**
+     * Download a page through the rotating proxy.
+     *
+     * @param  mixed  $url  Address to request (the research record's `url` attribute).
+     * @return string|null  Response body for a successful response; null for any other
+     *                      status or when the request (or its setup) throws an Exception.
+     */
+    private static function fetchRawHtml($url)
+    {
+        try {
+            $response = Http::withHeaders([
+                'User-Agent' => config('platform.proxy.user_agent'),
+            ])
+                ->withOptions([
+                    'proxy' => get_smartproxy_rotating_server(),
+                    'timeout' => 10,
+                    // NOTE(review): TLS certificate verification is disabled for these requests.
+                    'verify' => false,
+                ])
+                ->get($url);
+
+            return $response->successful() ? $response->body() : null;
+        } catch (Exception $e) {
+            // Deliberate best effort: an unreachable page is recorded as empty content.
+            return null;
+        }
+    }
+
+    /**
+     * Ask Readability for the main image of an HTML document.
+     *
+     * @param  string  $html  Raw HTML of the page.
+     * @return mixed  The value of Readability::getImage(), or null when Readability cannot parse the page.
+     */
+    private static function getMainImageFromHtml($html)
+    {
+        $config = new ReadabilityConfiguration();
+        $config->setCharThreshold(20);
+
+        $parser = new Readability($config);
+
+        try {
+            $parser->parse($html);
+            $image = $parser->getImage();
+        } catch (ReadabilityParseException $e) {
+            // Best effort: an unparsable page simply has no main image.
+            $image = null;
+        }
+
+        return $image;
+    }
+
+    /**
+     * Turn a raw HTML page into decoded plain text.
+     *
+     * Pipeline: cleanHtml() -> HtmlConverter (markdown) -> reverseLTGT() -> normalizeNewLines()
+     * -> removeDuplicateLines() -> markdown_to_plaintext() -> html_entity_decode().
+     *
+     * @param  string  $html  Raw HTML of the page.
+     * @return string  The text after the whole pipeline has run.
+     */
+    private static function getMarkdownFromHtml($html)
+    {
+        $converter = new HtmlConverter([
+            'strip_tags' => true,
+            'strip_placeholder_links' => true,
+        ]);
+
+        $markdown = $converter->convert(self::cleanHtml($html));
+
+        // Post-process the markdown from the innermost call outwards.
+        $tidied = self::removeDuplicateLines(
+            self::normalizeNewLines(
+                self::reverseLTGT($markdown)
+            )
+        );
+
+        return html_entity_decode(markdown_to_plaintext($tidied));
+    }
+
+    /**
+     * Turn the entities `&lt;` and `&gt;` back into literal angle brackets.
+     *
+     * @param  string  $input  Text that may contain the two entities.
+     * @return string  The text with `&lt;` replaced first, then `&gt;`.
+     */
+    private static function reverseLTGT($input)
+    {
+        // Array form applies the pairs one after another, in order.
+        return str_replace(['&lt;', '&gt;'], ['<', '>'], $input);
+    }
+
+    /**
+     * Keep only the first occurrence of every line, wherever later repeats appear.
+     *
+     * @param  string  $string  Text whose lines are separated by "\n".
+     * @return string  The surviving lines, in their original order, joined by "\n".
+     */
+    private static function removeDuplicateLines($string)
+    {
+        $alreadySeen = [];
+        $kept = [];
+
+        foreach (explode("\n", $string) as $line) {
+            if (isset($alreadySeen[$line])) {
+                continue;
+            }
+
+            $alreadySeen[$line] = true;
+            $kept[] = $line;
+        }
+
+        return implode("\n", $kept);
+    }
+
+    /**
+     * Tidy converted markdown line by line.
+     *
+     * In order:
+     *  1. every line is trimmed and blank lines are dropped;
+     *  2. a line that is exactly one markdown image absorbs the following line (joined by a
+     *     space) unless that line is blank or made only of the characters `-=#*&_`;
+     *  3. an image at the start of a line that is still directly followed by a text line not
+     *     starting with `!` is joined with that line;
+     *  4. every run of two or more spaces becomes ` - `.
+     *
+     * @param  string  $content  Markdown text with "\n" line breaks.
+     * @return string  The normalised text.
+     */
+    private static function normalizeNewLines($content)
+    {
+        $lines = explode("\n", $content);
+        $lineCount = count($lines);
+        $processedLines = [];
+
+        for ($i = 0; $i < $lineCount; $i++) {
+            $line = trim($lines[$i]);
+
+            if (isset($lines[$i + 1]) && preg_match("/^!\[.*\]\(.*\)$/", $line) === 1) {
+                $nextLine = trim($lines[$i + 1]);
+
+                // Compared with '' on purpose: empty('0') is true, so a caption of just "0"
+                // would otherwise never be merged.
+                if ($nextLine !== '' && preg_match('/^[-=#*&_]+$/', $nextLine) !== 1) {
+                    $line .= ' '.$nextLine;
+                    $i++; // The following line has been consumed by the merge.
+                }
+            }
+
+            // Same reason as above: a line holding only "0" is real content, not a blank line.
+            if ($line !== '') {
+                $processedLines[] = $line;
+            }
+        }
+
+        $result = implode("\n", $processedLines);
+
+        // Join an image with the text line right below it. preg_replace() returns null on a
+        // PCRE error; fall back to the unmodified text rather than losing it.
+        $result = preg_replace('/^(!\[.*?\]\(.*?\))\s*\n\s*([^\n!]+)/m', '$1 $2', $result) ?? $result;
+
+        // Replace multiple spaces with a dash separator.
+        $result = preg_replace('/ {2,}/', ' - ', $result) ?? $result;
+
+        return $result;
+    }
+
+    /**
+     * Strip non-content elements from an HTML document and flatten its <span> tags.
+     *
+     * @param  string  $htmlContent  Raw HTML of the page.
+     * @return string  Outer HTML of the crawler after the removals and replacements.
+     */
+    private static function cleanHtml($htmlContent)
+    {
+        $dom = new Crawler($htmlContent);
+
+        // Elements that are dropped together with everything nested inside them.
+        $discardedTags = ['script', 'style', 'svg', 'picture', 'form', 'footer', 'nav', 'aside'];
+
+        foreach ($discardedTags as $tagName) {
+            foreach ($dom->filter($tagName) as $element) {
+                $element->parentNode->removeChild($element);
+            }
+        }
+
+        // Swap each <span> for a text node carrying the span's text.
+        $dom->filter('span')->each(static function (Crawler $span): void {
+            $textNode = new \DOMText($span->text());
+            $element = $span->getNode(0);
+
+            $element->parentNode->replaceChild($textNode, $element);
+        });
+
+        return $dom->outerHtml();
+    }
+}