status, ['blocked', 'trashed'])) { return; } $enable_proxy = false; $url_to_crawl->is_crawling = true; $url_to_crawl->save(); $url_to_crawl->refresh(); // try { $user_agent = config('platform.proxy.user_agent'); $response = Http::withHeaders([ 'User-Agent' => $user_agent, ]) ->withOptions([ 'proxy' => ($enable_proxy) ? get_smartproxy_rotating_server() : null, 'timeout' => 10, 'verify' => false, ]) ->get($url_to_crawl->url); if ($response->successful()) { $raw_html = $response->body(); if ($enable_proxy) { $cost = calculate_smartproxy_cost(round(strlen($raw_html) / 1024, 2), 'rotating_global'); $service_cost_usage = new ServiceCostUsage; $service_cost_usage->cost = $cost; $service_cost_usage->name = 'smartproxy-GetUrlBodyTask'; $service_cost_usage->reference_1 = 'url_to_crawl'; $service_cost_usage->reference_2 = strval($url_to_crawl_id); $service_cost_usage->output = self::getMarkdownFromHtml($raw_html); $service_cost_usage->save(); } } else { $raw_html = null; $response->throw(); } // } catch (Exception $e) { // $raw_html = null; // //throw $e; // } $markdown_output = self::getMarkdownFromHtml($raw_html); if (! is_empty($markdown_output)) { $url_to_crawl->output_type = 'markdown'; $url_to_crawl->output = $markdown_output; } else { $url_to_crawl->output = 'EMPTY CONTENT'; $url_to_crawl->status = 'blocked'; } $url_to_crawl->is_crawled = true; $url_to_crawl->crawl_counts = $url_to_crawl->crawl_counts + 1; if ($url_to_crawl->save()) { if (! 
in_array($url_to_crawl->status, ['blocked', 'trashed'])) {
                // Only URLs that were not marked blocked/trashed move on to the parsing job.
                ParseUrlBodyJob::dispatch($url_to_crawl->id)->onQueue('default')->onConnection('default');
            }
        }
    }

    /**
     * Ask Readability for the image it associates with the given HTML document.
     *
     * @param  string  $html  Raw HTML to parse.
     * @return mixed  Whatever Readability::getImage() returns, or null when parsing
     *                throws a ReadabilityParseException.
     */
    private static function getMainImageFromHtml($html)
    {
        $r_configuration = new ReadabilityConfiguration();
        $r_configuration->setCharThreshold(20);
        $readability = new Readability($r_configuration);

        try {
            $readability->parse($html);

            return $readability->getImage();
        } catch (ReadabilityParseException $e) {
            // Deliberate best effort: a document Readability cannot parse has no main image.
        }

        return null;
    }

    /**
     * Turn raw HTML into de-duplicated text.
     *
     * Pipeline: cleanHtml() -> HtmlConverter::convert() -> reverseLTGT() ->
     * normalizeNewLines() -> removeDuplicateLines() -> markdown_to_plaintext() ->
     * html_entity_decode().
     *
     * @param  string|null  $html  Raw HTML; null or blank input yields an empty string.
     * @return string
     */
    private static function getMarkdownFromHtml($html)
    {
        // Without this guard a null/blank body reaches cleanHtml(), where
        // Crawler::outerHtml() throws on an empty node list.
        if ($html === null || trim((string) $html) === '') {
            return '';
        }

        $converter = new HtmlConverter([
            'strip_tags' => true,
            'strip_placeholder_links' => true,
        ]);

        $markdown = $converter->convert(self::cleanHtml($html));
        $markdown = self::reverseLTGT($markdown);
        $markdown = self::normalizeNewLines($markdown);
        $markdown = self::removeDuplicateLines($markdown);

        return html_entity_decode(markdown_to_plaintext($markdown));
    }

    /**
     * Convert the `&lt;` / `&gt;` entities back into literal angle brackets.
     *
     * NOTE(review): the previous body replaced '<' with '<' and '>' with '>', which
     * changes nothing; the entity form is what the method name implies. Confirm
     * against the upstream history.
     *
     * @param  string  $input
     * @return string
     */
    private static function reverseLTGT($input)
    {
        return str_replace(['&lt;', '&gt;'], ['<', '>'], $input);
    }

    /**
     * Drop every line that repeats an earlier line, keeping the first occurrence
     * and the original order.
     *
     * @param  string  $string  Text with "\n" line separators.
     * @return string
     */
    private static function removeDuplicateLines($string)
    {
        return implode("\n", array_unique(explode("\n", $string)));
    }

    /**
     * Trim every line, drop blank lines, glue a Markdown image line to the text
     * line that follows it, and replace runs of two or more spaces with ' - '.
     *
     * @param  string  $content  Markdown text with "\n" line separators.
     * @return string
     */
    private static function normalizeNewLines($content)
    {
        $lines = explode("\n", $content);
        $lineCount = count($lines);
        $processedLines = [];

        for ($i = 0; $i < $lineCount; $i++) {
            $line = trim($lines[$i]);

            // An image line absorbs the next line unless that line is blank or
            // consists only of Markdown rule/heading characters.
            if (preg_match("/^!\[.*\]\(.*\)$/", $line)) {
                $next = isset($lines[$i + 1]) ? trim($lines[$i + 1]) : '';

                if ($next !== '' && ! preg_match('/^[-=#*&_]+$/', $next)) {
                    $line .= ' '.$next;
                    $i++; // The next line has been merged, so skip it.
                }
            }

            // Compare against '' explicitly: empty() would also discard a line
            // whose entire content is "0".
            if ($line !== '') {
                $processedLines[] = $line;
            }
        }

        // Blank lines were dropped above and lines are joined with a single "\n",
        // so there are never consecutive newlines left to collapse.
        $joined = implode("\n", $processedLines);

        // Second pass: attach any remaining image line to the text line below it.
        // preg_replace() returns null on a PCRE error; fall back to the input then.
        $merged = preg_replace('/^(!\[.*?\]\(.*?\))\s*\n\s*([^\n!]+)/m', '$1 $2', $joined) ?? $joined;

        // Replace multiple spaces with a dash separator.
        return preg_replace('/ {2,}/', ' - ', $merged) ?? $merged;
    }

    /**
     * Strip non-content elements from an HTML document and unwrap every <span>
     * into plain text, then return the resulting outer HTML.
     *
     * @param  string|null  $htmlContent  Raw HTML.
     * @return string  Cleaned HTML, or '' when the input produced no DOM nodes.
     */
    private static function cleanHtml($htmlContent)
    {
        $crawler = new Crawler($htmlContent);

        // Crawler::outerHtml() throws when the node list is empty.
        if ($crawler->count() === 0) {
            return '';
        }

        // Tags removed together with everything inside them.
        $tagsToRemove = ['script', 'style', 'svg', 'picture', 'form', 'footer', 'nav', 'aside'];

        foreach ($tagsToRemove as $tag) {
            $crawler->filter($tag)->each(static function ($node) {
                foreach ($node as $child) {
                    if ($child->parentNode !== null) {
                        $child->parentNode->removeChild($child);
                    }
                }
            });
        }

        // Replace each <span> with a text node holding its text content.
        $crawler->filter('span')->each(static function ($node) {
            $replacement = new \DOMText($node->text());

            foreach ($node as $child) {
                if ($child->parentNode !== null) {
                    $child->parentNode->replaceChild($replacement, $child);
                }
            }
        });

        return $crawler->outerHtml();
    }
}