ChanWrapper to improve typing

2025-10-20 16:22:31 +02:00
parent 8f9617c5a8
commit 69e2ce651c
1 changed files with 71 additions and 102 deletions
--- a/src/app/api/social/chan.py
+++ b/src/app/api/social/chan.py
@@ -1,123 +1,92 @@
 '''
 Usiamo le API di 4chan per ottenere un catalogo di threads dalla board /biz/
 '''
 import requests
 import re
 import html
 import requests
 from bs4 import BeautifulSoup
 from app.api.core.social import *
 from .base import SocialWrapper, SocialPost, SocialComment
 class ChanWrapper(SocialWrapper):
    def __init__(self) -> None:
        """Initialize the wrapper by delegating to the parent class initializer."""
        super(ChanWrapper, self).__init__()
    def __time_str(self, timestamp: str) -> str:
        """Converte una stringa da MM/GG/AA di timestamp nel formato GG/MM/AA"""
        if len(timestamp) < 8: return ""
        month = timestamp[:2]
        day = timestamp[3:5]
        year = timestamp[6:8]
        return f"{day}/{month}/{year}"
    def __unformat_html_str(self, html_element: str) -> str:
        """Strip HTML markup and redundant formatting from a piece of text.

        Args:
            html_element: Raw text that may contain HTML entities and tags.

        Returns:
            The plain text with entities decoded, tags removed, every run of
            slashes/backslashes replaced by a single "/", and whitespace
            collapsed to single spaces and trimmed. An empty string is
            returned for falsy input.
        """
        if not html_element:
            return ""

        # Decode entities first, then let the parser drop the markup.
        decoded = html.unescape(html_element)
        plain_text = BeautifulSoup(decoded, 'html.parser').get_text(separator=" ")

        # Normalize slash runs before collapsing whitespace.
        plain_text = re.sub(r"[\\/]+", "/", plain_text)
        return re.sub(r"\s+", " ", plain_text).strip()
    def get_top_crypto_posts(self, limit: int = 5) -> list[SocialPost]:
        """Fetch the /biz/ catalog from the 4chan API and build posts from its threads.

        Sticky threads and threads whose comment is empty after HTML cleanup
        are skipped. For each remaining thread, the first ``MAX_COMMENTS``
        entries of ``last_replies`` are inspected and the non-empty ones are
        attached to the post as comments.

        Args:
            limit: The collected posts are sliced with ``[:limit]`` before
                being returned.

        Returns:
            The collected posts, in catalog order, sliced to ``limit``.

        Raises:
            AssertionError: If the API does not answer with status code 200.
        """
        url = 'https://a.4cdn.org/biz/catalog.json'
        # A timeout keeps an unresponsive server from blocking the caller forever.
        response = requests.get(url, timeout=10)
        # Raised explicitly (instead of via `assert`) so the check still runs
        # under `python -O`; the exception type seen by callers is unchanged.
        if response.status_code != 200:
            raise AssertionError(f"Error in 4chan API request [{response.status_code}] {response.text}")

        social_posts: list[SocialPost] = []

        # The catalog is a list with one dict per board page, shaped like
        # {"page": page_number, "threads": [{thread_data}]}.
        for page in response.json():
            for thread in page['threads']:
                # The "sticky" key is only present on pinned threads; skip those.
                if 'sticky' in thread:
                    continue

                # "now" looks like "MM/DD/YY(day)hh:mm:ss"; only the date part is kept.
                post_time = self.__time_str(thread.get('now', ''))

                # Name of the posting user.
                name: str = thread.get('name', 'Anonymous')

                # "sub" is the thread title; it may contain HTML and may be missing.
                title = self.__unformat_html_str(thread.get('sub', ''))
                title = f"{name} posted: {title}"

                # "com" is the thread comment; it may contain HTML.
                thread_description = self.__unformat_html_str(thread.get('com', ''))
                if not thread_description:
                    continue

                # "last_replies" holds reply dicts shaped like a thread; only
                # their "now" and "com" fields are used here.
                # NOTE(review): MAX_COMMENTS is not defined in this method;
                # presumably it comes from the star import of
                # app.api.core.social -- confirm.
                comments_list: list[SocialComment] = []
                for reply in thread.get('last_replies', [])[:MAX_COMMENTS]:
                    # Kept in its own variable so the thread's date is not
                    # overwritten by the date of its replies.
                    reply_time = self.__time_str(reply.get('now', ''))

                    comment = self.__unformat_html_str(reply.get('com', ''))
                    if not comment:
                        continue

                    comments_list.append(SocialComment(time=reply_time, description=comment))

                social_posts.append(
                    SocialPost(
                        time=post_time,
                        title=title,
                        description=thread_description,
                        comments=comments_list,
                    )
                )

        return social_posts[:limit]
 # Stampiamo i post
 # chan_wrapper = ChanWrapper()
 # social_posts = chan_wrapper.get_top_crypto_posts()
 # print(len(social_posts))