ChanWrapper to improve typing

2025-10-20 16:22:31 +02:00
parent 8f9617c5a8
commit 69e2ce651c
1 changed files with 71 additions and 102 deletions
--- a/src/app/api/social/chan.py
+++ b/src/app/api/social/chan.py
@@ -1,123 +1,92 @@
 '''
 Usiamo le API di 4chan per ottenere un catalogo di threads dalla board /biz/
 '''
-import requests
 import re
 import html
+import requests
 from bs4 import BeautifulSoup
+from app.api.core.social import *
+

-from .base import SocialWrapper, SocialPost, SocialComment
 class ChanWrapper(SocialWrapper):
    def __init__(self):
        super().__init__()

+    def __time_str(self, timestamp: str) -> str:
+        """Converte una stringa da MM/GG/AA di timestamp nel formato GG/MM/AA"""
+        if len(timestamp) < 8: return ""
+
+        month = timestamp[:2]
+        day = timestamp[3:5]
+        year = timestamp[6:8]
+        return f"{day}/{month}/{year}"
+
+    def __unformat_html_str(self, html_element: str) -> str:
+        """Pulisce il commento rimuovendo HTML e formattazioni inutili"""
+        if not html_element: return ""
+
+        html_entities = html.unescape(html_element)
+        soup = BeautifulSoup(html_entities, 'html.parser')
+        html_element = soup.get_text(separator=" ")
+        html_element = re.sub(r"[\\/]+", "/", html_element)
+        html_element = re.sub(r"\s+", " ", html_element).strip()
+        return html_element
+
    def get_top_crypto_posts(self, limit: int = 5) -> list[SocialPost]:
-        # Url dell'API della board /biz/
-        json_url = 'https://a.4cdn.org/biz/catalog.json'
-        json = requests.get(json_url)
+        url = 'https://a.4cdn.org/biz/catalog.json'
+        response = requests.get(url)
+        assert response.status_code == 200, f"Error in 4chan API request [{response.status_code}] {response.text}"

-        if json.status_code == 200:
-            page_list: list[dict] = json.json() # Questa lista contiene un dizionario per ogni pagina della board di questo tipo {"page": page_number, "threads": [{thread_data}]}
-        else:
-            print("Error:", json.status_code)
-
-        # Lista dei post
        social_posts: list[SocialPost] = []

-        for page in page_list:
-            thread_list: list[dict] = page['threads']
-            '''
-            Per ogni thread ci interessano i seguenti campi:
-            - "sticky": ci indica se il thread è stato fissato o meno, se non è presente vuol dire che non è stato fissato, i thread sticky possono essere ignorati
-            - "now": la data di creazione del thread tipo "MM/GG/AA(day)hh:mm:ss", ci interessa solo MM/GG/AA
-            - "name": il nome dell'utente
-            - "sub": il nome del thread, può contenere anche elementi di formattazione html che saranno da ignorare, potrebbe non essere presente
-            - "com": il commento del thread, può contenere anche elementi di formattazione html che saranno da ignorare
-            - "last_replies": una lista di dizionari conteneti le risposte al thread principale, sono strutturate similarmente al thread, di queste ci interessano i seguenti campi:
-                - "now": la data di creazione della risposta tipo "MM/GG/AA(day)hh:mm:ss", ci interessa solo MM/GG/AA
-                - "name": il nome dell'utente
-                - "com": il commento della risposta, possono contenere anche elementi di formattazione html che saranno da ignorare
-            '''
-            for thread in thread_list:
-                # Ignoriamo i dizionari dei thread nei quali è presente la key "sticky"
+        # Questa lista contiene un dizionario per ogni pagina della board di questo tipo {"page": page_number, "threads": [{thread_data}]}
+        for page in response.json():
+            for thread in page['threads']:
+
+                # la chiave "sticky" indica se il thread è stato fissato; se non è presente, il thread non è fissato. I thread sticky possono essere ignorati
                if 'sticky' in thread:
                    continue
-                else:
-                    time: str = thread['now']
-                    month: str = time[:2]
-                    day: str = time[4:6]
-                    year: str = time[7:9]
-                    time: str = day + '/' + month + '/' + year
-                    
-                    name: str = thread['name']
-                    try:
-                        title: str = thread['sub']
-                        html_entities = html.unescape(title)
-                        soup = BeautifulSoup(html_entities, 'html.parser')
-                        title = soup.get_text(separator=" ")
-                        title = re.sub(r"[\\/]+", "/", title)
-                        title = re.sub(r"\s+", " ", title).strip()
-                        title = name + " posted: " + title
-                    except:
-                        title: str = name + " posted"

-                    try: 
-                        thread_description: str = thread['com']
-                        html_entities = html.unescape(thread_description)
-                        soup = BeautifulSoup(html_entities, 'html.parser')
-                        thread_description = soup.get_text(separator=" ")
-                        thread_description = re.sub(r"[\\/]+", "/", thread_description)
-                        thread_description = re.sub(r"\s+", " ", thread_description).strip()
-                    except:
-                        thread_description = None
-                    try:
-                        response_list: list[dict] = thread['last_replies']
-                    except:
-                        response_list: list[dict] = []
-                    comments_list: list[SocialComment] = []
+                # la data di creazione del thread tipo "MM/GG/AA(day)hh:mm:ss", ci interessa solo MM/GG/AA
+                time = self.__time_str(thread.get('now', ''))

-                    # Otteniamo i primi 5 commenti
-                    i = 0
-                    for response in response_list:
-                        time: str = response['now']
-                        month: str = time[:2]
-                        day: str = time[3:5]
-                        year: str = time[6:8]
-                        time: str = day + '/' + month + '/' + year
+                # il nome dell'utente
+                name: str = thread.get('name', 'Anonymous')

-                        try: 
-                            comment_description: str = response['com']
-                            html_entities = html.unescape(comment_description)
-                            soup = BeautifulSoup(html_entities, 'html.parser')
-                            comment_description = soup.get_text(separator=" ")
-                            comment_description = re.sub(r"[\\/]+", "/", comment_description)
-                            comment_description = re.sub(r"\s+", " ", comment_description).strip()
-                        except:
-                            comment_description = None
-                        if comment_description is None:
-                            continue
-                        else:
-                            social_comment: SocialComment = SocialComment(
-                                time=time,
-                                description=comment_description
-                            )
-                            comments_list.append(social_comment)
-                        i += 1
-                        if i >= 5:
-                            break
-                    if thread_description is None:
+                # il nome del thread, può contenere anche elementi di formattazione html che saranno da ignorare, potrebbe non essere presente
+                title = self.__unformat_html_str(thread.get('sub', ''))
+                title = f"{name} posted: {title}"
+
+                # il commento del thread, può contenere anche elementi di formattazione html che saranno da ignorare
+                thread_description = self.__unformat_html_str(thread.get('com', ''))
+                if not thread_description:
+                    continue
+
+                # "last_replies": una lista di dizionari contenenti le risposte al thread principale, strutturate similarmente al thread; di queste ci interessano i seguenti campi:
+                # - "now": la data di creazione della risposta tipo "MM/GG/AA(day)hh:mm:ss", ci interessa solo MM/GG/AA
+                # - "name": il nome dell'utente
+                # - "com": il commento della risposta, possono contenere anche elementi di formattazione html che saranno da ignorare
+                response_list = thread.get('last_replies', [])
+                comments_list: list[SocialComment] = []
+
+                for i, response in enumerate(response_list):
+                    if i >= MAX_COMMENTS: break
+
+                    time = self.__time_str(response['now'])
+
+                    comment = self.__unformat_html_str(response.get('com', ''))
+                    if not comment:
                        continue
-                    else:
-                        social_post: SocialPost = SocialPost(
-                            time=time,
-                            title=title,
-                            description=thread_description,
-                            comments=comments_list
-                        )
-                        social_posts.append(social_post)
-        
-        return social_posts[:limit]           
-# Stampiamo i post
-# chan_wrapper = ChanWrapper()
-# social_posts = chan_wrapper.get_top_crypto_posts()
-# print(len(social_posts))
+
+                    social_comment = SocialComment(time=time, description=comment)
+                    comments_list.append(social_comment)
+
+                social_post: SocialPost = SocialPost(
+                    time=time,
+                    title=title,
+                    description=thread_description,
+                    comments=comments_list
+                )
+                social_posts.append(social_post)
+
+        return social_posts[:limit]