Add (ai gen)

2023-10-01 04:17:49 +08:00
parent 5fcfa75d97
commit 5b4a02778e
7 changed files with 191 additions and 84 deletions
--- a/app/Jobs/Tasks/GenerateShopeeAIArticleTask.php
+++ b/app/Jobs/Tasks/GenerateShopeeAIArticleTask.php
@@ -17,6 +17,7 @@
 use LaravelFreelancerNL\LaravelIndexNow\Facades\IndexNow;
 use LaravelGoogleIndexing;
 use Masterminds\HTML5;
+use Symfony\Component\DomCrawler\Crawler;

 class GenerateShopeeAIArticleTask
 {
@@ -48,6 +49,8 @@ public static function handle(ShopeeSellerScrape $shopee_seller_scrape)
        if (is_null($ai_writeup)) {
            $ai_output = OpenAI::writeProductArticle($excerpt, $photos);

+            //dd($ai_output);
+
            if (is_null($ai_output)) {
                $e = new Exception('Failed to write: Missing ai_output');

@@ -140,41 +143,58 @@ private static function getTotalServiceCost($shopee_task)

    private static function stripHtml(string $raw_html)
    {
-        $r_configuration = new ReadabilityConfiguration();
-        $r_configuration->setWordThreshold(20);
+        // Flatten an HTML page into one plain-text string: title, meta description,
+        // then the content returned by Readability, separated by single spaces.
+        // If Readability or the crawler throws, fall back to strip_tags() over the
+        // whole document as re-serialised by HTML5.
+        //
+        // $raw_html is the full HTML document; the return value is that document's
+        // text with the markup removed.

-        $readability = new Readability($r_configuration);
+        $html_content = '';

-        // try {
-        //     $readability->parse($raw_html);
+        try {

-        //     $html_content = $readability->getContent();
+            $r_configuration = new ReadabilityConfiguration();
+            $r_configuration->setWordThreshold(20);

-        //     // Remove tabs
-        //     $html_content = str_replace("\t", '', $html_content);
+            $readability = new Readability($r_configuration);

-        //     // Replace newlines with spaces
-        //     $html_content = str_replace(["\n", "\r\n"], ' ', $html_content);
+            $readability->parse($raw_html);

-        //     // Replace multiple spaces with a single space
-        //     $html_content = preg_replace('/\s+/', ' ', $html_content);
+            $temp_html_content = $readability->getContent();

-        //     // Output the cleaned text
-        //     $html_content = trim($html_content); // Using trim to remove any leading or trailing spaces
+            // Strip markup first so the whitespace pass below also cleans up gaps left by removed tags.
+            $temp_html_content = strip_tags($temp_html_content);

-        //     $html_content = strip_tags($html_content);
+            // Collapse every whitespace run (tabs, CR, LF, spaces) to one space; keep the input if PCRE fails.
+            $temp_html_content = preg_replace('/\s+/', ' ', $temp_html_content) ?? $temp_html_content;

-        // } catch (ReadabilityParseException|Exception $e) {
+            $temp_html_content = trim($temp_html_content);

-        $html5 = new HTML5(['preserveWhiteSpace' => true]);
+            $crawler = new Crawler($raw_html);

-        // Parse the HTML into a DOM tree.
-        $dom = $html5->loadHTML($raw_html);
+            // Crawler::text() throws on an empty node list; a page without <title> must not discard the content.
+            $titleNode = $crawler->filter('title');
+            $title = $titleNode->count() > 0 ? $titleNode->text() : '';

-        // Serialize the DOM tree back to a string, formatted.
-        $html_content = strip_tags($html5->saveHTML($dom));
+            $metaDescriptionNode = $crawler->filter('meta[name="description"]');
+            $description = $metaDescriptionNode->count() > 0 ? $metaDescriptionNode->attr('content') : null;

-        // }
+            // Join only the non-empty parts so a missing title or description leaves no stray spaces.
+            $parts = array_filter([$title, $description, $temp_html_content], function ($part) {
+                return $part !== null && $part !== '';
+            });
+            $html_content = implode(' ', $parts);
+
+        } catch (ReadabilityParseException|Exception $e) {
+
+            // Extraction failed: parse and re-serialise the whole document with HTML5, then strip its tags.
+            $html5 = new HTML5(['preserveWhiteSpace' => true]);
+            $dom = $html5->loadHTML($raw_html);
+            $html_content = strip_tags($html5->saveHTML($dom));
+
+        }

        return $html_content;
    }