14 socials integration (#34)

* Create XWrapper.py & ChanWrapper.py * Tests for in XWrapper & ChanWrapper * MAX_COMMENTS in social.py * Soddisfatto Giacomo * unified_timestamp
2025-10-20 16:56:11 +02:00
parent 3adf7ed250
commit 06c660b659
13 changed files with 242 additions and 27 deletions
--- a/src/app/api/social/init.py
+++ b/src/app/api/social/init.py
@@ -1,3 +1,5 @@
 from app.api.social.reddit import RedditWrapper
+from app.api.social.x import XWrapper
+from app.api.social.chan import ChanWrapper

-__all__ = ["RedditWrapper"]
+__all__ = ["RedditWrapper", "XWrapper", "ChanWrapper"]
--- a/src/app/api/social/chan.py
+++ b/src/app/api/social/chan.py
@@ -0,0 +1,89 @@
+'''
+We use the 4chan API to fetch the catalog of threads from the /biz/ board
+'''
+import re
+import html
+import requests
+from bs4 import BeautifulSoup
+from datetime import datetime
+from app.api.core.social import *
+
+
+class ChanWrapper(SocialWrapper):
+    """Social wrapper that builds SocialPost objects from the 4chan /biz/ catalog."""
+
+    def __init__(self):
+        super().__init__()
+
+    def __time_str(self, timestamp: str) -> int:
+        """Convert a 4chan ``MM/DD/YY(Day)HH:MM:SS`` string to epoch milliseconds.
+
+        Raises:
+            ValueError: if ``timestamp`` does not match the expected format.
+        """
+        # NOTE(review): the parsed datetime is naive, so ``timestamp()`` interprets
+        # it in the local timezone of the host — confirm this matches the timezone
+        # 4chan uses for the 'now' field.
+        time = datetime.strptime(timestamp, "%m/%d/%y(%a)%H:%M:%S")
+        return int(time.timestamp() * 1000)
+
+    def __unformat_html_str(self, html_element: str) -> str:
+        """Clean a 4chan text field by removing HTML and redundant formatting.
+
+        Returns an empty string when ``html_element`` is empty.
+        """
+        if not html_element:
+            return ""
+
+        # Decode entities first so that escaped markup is parsed and stripped too.
+        html_entities = html.unescape(html_element)
+        soup = BeautifulSoup(html_entities, 'html.parser')
+        html_element = soup.get_text(separator=" ")
+        # Collapse every run of slashes/backslashes into a single "/".
+        html_element = re.sub(r"[\\/]+", "/", html_element)
+        # Collapse every run of whitespace into a single space.
+        html_element = re.sub(r"\s+", " ", html_element).strip()
+        return html_element
+
+    def get_top_crypto_posts(self, limit: int = 5) -> list[SocialPost]:
+        """Return the first ``limit`` usable threads of the /biz/ catalog.
+
+        Sticky threads and threads without a text body are skipped. The comments
+        of each post are taken from the first MAX_COMMENTS entries of the
+        thread's ``last_replies``; replies without text are dropped.
+
+        Raises:
+            AssertionError: if the catalog request does not answer with HTTP 200.
+            ValueError: if a thread or reply carries an unparsable 'now' value.
+        """
+        url = 'https://a.4cdn.org/biz/catalog.json'
+        # A timeout prevents the call from blocking forever on a stalled connection.
+        response = requests.get(url, timeout=10)
+        assert response.status_code == 200, f"Error in 4chan API request [{response.status_code}] {response.text}"
+
+        social_posts: list[SocialPost] = []
+
+        # The catalog is a list with one dictionary per board page, shaped like
+        # {"page": page_number, "threads": [{thread_data}]}; flatten it lazily.
+        threads = (thread for page in response.json() for thread in page['threads'])
+
+        for thread in threads:
+            # Stop as soon as enough posts were collected: the final slice would
+            # discard the rest anyway. Negative limits keep the plain slice semantics.
+            if 0 <= limit <= len(social_posts):
+                break
+
+            # The key is only present on pinned threads, which can be ignored.
+            if 'sticky' in thread:
+                continue
+
+            # Creation date of the thread, formatted like "MM/DD/YY(day)hh:mm:ss".
+            # Kept in its own variable so the replies below cannot overwrite it.
+            thread_time = self.__time_str(thread.get('now', ''))
+
+            # Name of the author.
+            name: str = thread.get('name', 'Anonymous')
+
+            # Thread subject; it may contain HTML formatting and may be missing.
+            title = self.__unformat_html_str(thread.get('sub', ''))
+            title = f"{name} posted: {title}"
+
+            # Thread body; it may contain HTML formatting to be stripped.
+            thread_description = self.__unformat_html_str(thread.get('com', ''))
+            if not thread_description:
+                continue
+
+            # Replies to the main thread, structured similarly to the thread itself.
+            replies = thread.get('last_replies', [])
+            comments_list: list[SocialComment] = []
+
+            for reply in replies[:MAX_COMMENTS]:
+                # Creation date of the reply, same format as the thread's.
+                reply_time = self.__time_str(reply['now'])
+
+                # Reply body; it may contain HTML formatting to be stripped.
+                comment = self.__unformat_html_str(reply.get('com', ''))
+                if not comment:
+                    continue
+
+                social_comment = SocialComment(description=comment)
+                social_comment.set_timestamp(timestamp_ms=reply_time)
+                comments_list.append(social_comment)
+
+            social_post: SocialPost = SocialPost(
+                title=title,
+                description=thread_description,
+                comments=comments_list
+            )
+            # Use the thread's own creation time, not the time of its last reply.
+            social_post.set_timestamp(timestamp_ms=thread_time)
+            social_posts.append(social_post)
+
+        return social_posts[:limit]
--- a/src/app/api/social/reddit.py
+++ b/src/app/api/social/reddit.py
@@ -1,10 +1,9 @@
 import os
 from praw import Reddit # type: ignore
 from praw.models import Submission # type: ignore
-from app.api.core.social import SocialWrapper, SocialPost, SocialComment
+from app.api.core.social import *


-MAX_COMMENTS = 5
 # metterne altri se necessario.
 # fonti: https://lkiconsulting.io/marketing/best-crypto-subreddits/
 SUBREDDITS = [
@@ -24,13 +23,13 @@ SUBREDDITS = [

 def extract_post(post: Submission) -> SocialPost:
    social = SocialPost()
-    social.time = str(post.created)
+    social.set_timestamp(timestamp_ms=post.created)
    social.title = post.title
    social.description = post.selftext

    for top_comment in post.comments:
        comment = SocialComment()
-        comment.time = str(top_comment.created)
+        comment.set_timestamp(timestamp_ms=top_comment.created)
        comment.description = top_comment.body
        social.comments.append(comment)

--- a/src/app/api/social/x.py
+++ b/src/app/api/social/x.py
@@ -0,0 +1,46 @@
+import os
+import json
+import subprocess
+from shutil import which
+from app.api.core.social import SocialWrapper, SocialPost
+
+
+# This is the list of users that may be interesting to follow.
+# To get the ID of a new user, search for the user on X, copy the URL and paste it into a service such as "https://get-id-x.foundtt.com/en/"
+X_USERS = [
+    'watcherguru',
+    'Cointelegraph',
+    'BTC_Archive',
+    'elonmusk'
+]
+
+class XWrapper(SocialWrapper):
+    """Social wrapper that reads tweets of the X_USERS accounts through the `rettiwt` CLI."""
+
+    def __init__(self):
+        '''
+        This wrapper uses the rettiwt API to get data from X in order to avoid the rate limits of the free X API,
+        even if improbable this could lead to a ban so do not use the personal account,
+        In order to work it is necessary to install the rettiwt cli tool, for more information visit the official documentation at https://www.npmjs.com/package/rettiwt-api
+
+        Raises:
+            AssertionError: if X_API_KEY is not set or the `rettiwt` command is not on PATH.
+        '''
+        # Initialise the base class like the other wrappers do.
+        super().__init__()
+
+        self.api_key = os.getenv("X_API_KEY")
+        assert self.api_key, "X_API_KEY environment variable not set"
+        assert which('rettiwt') is not None, "Command `rettiwt` not installed"
+
+
+    def get_top_crypto_posts(self, limit:int = 5) -> list[SocialPost]:
+        """Return the tweets found by `rettiwt tweet search -f <user>` for every user in X_USERS.
+
+        ``limit`` is applied per user, so up to ``limit * len(X_USERS)`` posts
+        can be returned.
+
+        Raises:
+            json.JSONDecodeError: if the command output is not valid JSON.
+            KeyError: if the decoded output lacks the expected keys.
+        """
+        social_posts: list[SocialPost] = []
+
+        # Resolve the executable once; the full path also covers platforms where
+        # the command is installed as a script wrapper with an extension.
+        executable = which('rettiwt') or 'rettiwt'
+
+        for user in X_USERS:
+            # Pass the arguments as a list: a single command string without
+            # shell=True is treated as the program name on POSIX and cannot run.
+            command = [executable, '-k', self.api_key, 'tweet', 'search', '-f', user]
+            process = subprocess.run(command, capture_output=True)
+            json_result = json.loads(process.stdout.decode())
+
+            for tweet in json_result['list'][:limit]:
+                social_post = SocialPost()
+                # NOTE(review): the other wrappers in this change call set_timestamp();
+                # here the raw 'createdAt' value is stored — confirm its format before unifying.
+                social_post.time = tweet['createdAt']
+                social_post.title = user + " tweeted: "
+                social_post.description = tweet['fullText']
+                social_posts.append(social_post)
+
+        return social_posts