Merge branch 'better_file_processing'

2026-02-04 04:04:49 +00:00 · 2025-04-18 17:12:47 +02:00
parent 9cf6f679b3 c4a0208a53
commit 1c7f27de88
3 changed files with 85 additions and 62 deletions
--- a/config.py
+++ b/config.py
@@ -0,0 +1,16 @@
+from wordcloud import WordCloud  # type: ignore
+
+# Shared, pre-configured WordCloud instance; run.py imports it and renders
+# one image per author from it.
+wordcloud = WordCloud(
+    width=1920,
+    height=1080,
+    background_color="black",  # or 'white', or any HTML color
+    colormap="viridis",  # matplotlib colormap ('plasma', 'cool', 'inferno')
+    # font_path="path/to/font.ttf",  # Use a custom font
+    max_words=100,  # Max number of words to include
+    min_font_size=10,
+    max_font_size=200,
+    prefer_horizontal=1,  # Between 0 (all vertical) and 1 (all horizontal)
+    scale=4,  # Higher = better resolution
+    contour_color="steelblue",  # Outline color (when using contour_width)
+    # NOTE(review): per the wordcloud docs the contour is only drawn when a
+    # mask is supplied; no mask is passed here, so this presumably has no
+    # visible effect - confirm.
+    contour_width=1,  # Outline thickness
+)
--- a/functions.py
+++ b/functions.py
@@ -0,0 +1,55 @@
+from re import split, sub, match
+from regex import sub as sub2
+
+
+def processRawMessages2(chat: str) -> dict[str, list[str]]:
+    """Group the messages of an exported chat by author.
+
+    Every line of ``chat`` is classified as one of:
+
+    * message start: ``<date>, <time> - <author>: <text>``
+    * system action: ``<date>, <time> - <text>`` without an ``author:`` part;
+      it is ignored, together with any continuation lines that follow it
+    * continuation: any other non-empty line, which belongs to the message
+      (or action) started by the closest preceding timestamped line
+
+    Args:
+        chat: Full text of the exported chat.
+
+    Returns:
+        A dict mapping each author to their messages in file order. The lines
+        of a multi-line message are joined with a newline so that words on
+        adjacent lines stay separate.
+    """
+    output: dict[str, list[str]] = {}
+    # Messages of the author of the most recent message line. Reset to an
+    # empty placeholder after an action so that continuation lines are never
+    # attached to an unrelated earlier message.
+    current: list[str] = []
+    in_action = False
+    for segment in chat.split("\n"):
+        if not segment:
+            # Blank lines (including the one produced by a trailing newline)
+            # carry no words and must not alter the previous message.
+            continue
+        message = match(r"\d+/\d+/\d+, \d+:\d+ - ([^:]+): (.*)", segment)
+        if message:
+            # Drop date and time, keep author and text.
+            current = output.setdefault(message.group(1), [])
+            current.append(message.group(2))
+            in_action = False
+        elif match(r"\d+/\d+/\d+, \d+:\d+ - ", segment):
+            # Timestamped line without "author:" is an action; ignore it.
+            current = []
+            in_action = True
+        elif current:
+            # Continuation of a multi-line message: restore the line break
+            # removed by the split instead of gluing the words together.
+            current[-1] += "\n" + segment
+        elif not in_action:
+            # Text before the first timestamped line: nothing to attach it to.
+            print(f"ERROR: line does not belong to any message: {segment!r}")
+    return output
+
+
+def processMessageList(messages: list[str]) -> list[str]:
+    """Flatten raw messages into a list of lowercase words.
+
+    For every message, http(s) links, emojis and the punctuation characters
+    ``? ! : , .`` are removed. Messages that are exactly the
+    ``<Media omitted>`` placeholder are skipped; all other messages are
+    lowercased and split on whitespace.
+
+    Args:
+        messages: Raw message texts of a single author.
+
+    Returns:
+        All remaining words of all messages, in order.
+    """
+    output: list[str] = []
+    for message in messages:
+        # Remove http(s) links
+        message = sub(r"https?://(?:www\.)?\S+", "", message)
+        # Remove emojis and punctuation. The Unicode Emoji property also
+        # covers the ASCII digits and '#' (keycap bases), so those are
+        # excluded explicitly to keep numbers and hashtags intact. '*' is
+        # still stripped, matching the legacy processRawMessages clean-up.
+        message = sub2(r"(?:(?![0-9#])[\p{Emoji}?!:,.])+", "", message)
+        # If it's not added media, add to output
+        if message != "<Media omitted>":
+            output += message.lower().split()
+    return output
+
+
+def processRawMessages(chat: str):
+    """Split an exported chat into cleaned ``author: text`` strings.
+
+    The chat is cut at every ``d/m/yy, h:mm`` timestamp. In each piece the
+    characters ``. , ? ! * ( )`` are removed, newlines become spaces and
+    non-ASCII characters are dropped. Only pieces that start with
+    `` - <author>: `` (author without spaces) are kept, and the leading
+    `` - `` is cut off.
+
+    Returns:
+        A list of ``"author: text"`` strings in file order.
+    """
+    kept = []
+    for piece in split(r"\d{1,2}/\d{1,2}/\d{2}, \d{1,2}:\d{2}", chat):
+        piece = sub(r"([.,?!*()])", "", piece)
+        piece = sub(r"\n", " ", piece)
+        piece = sub(r"[^\x00-\x7F]", "", piece)
+        if piece == "":
+            continue
+        # Pieces without a space-free author prefix (actions, names that
+        # contain spaces) are discarded.
+        if match(r" - [^ ]+?: ", piece):
+            kept.append(piece[3:])
+    return kept
--- a/run.py
+++ b/run.py
@@ -1,15 +1,10 @@
-from re import split, sub, match
-from wordcloud import WordCloud  # type: ignore
+from config import wordcloud  # type: ignore
 from os import makedirs
-
-
-def cleanupMessages(messages: list[str]) -> list[str]:
-    # Remove "", \n, and symbols like , and .
-    temp = [sub(r"([.,?!*()])", "", message) for message in messages]
-    temp = [sub(r"\n", " ", message) for message in temp]
-    temp = [sub(r"[^\x00-\x7F]", "", message) for message in temp]
-    temp = [msg for msg in temp if msg != ""]
-    return temp
+from functions import (
+    processRawMessages,
+    processRawMessages2,
+    processMessageList,
+)
+from collections import Counter


 # Open and read the chats from the '/data/_chat.txt' file exported by Whatsapp
@@ -21,57 +16,14 @@ except FileNotFoundError:
    print("Sorry, the file /data/_chat.txt does not exist.")
    exit()

-messages = cleanupMessages(
-    split(r"\d{1,2}/\d{1,2}/\d{2}, \d{1,2}:\d{2}", chat)
-)
-
-messages = [s[3:] for s in messages if match(r" - [^ ]+?: ", s)]
-
-
-author_words: dict[str, list[str]] = {}
-
-for message in messages:
-    message = sub(":", "", message)
-    author, words = split(r" ", message, maxsplit=1)
-    words = [word for word in words.split() if word and word != " "]
-    for word in words:
-        if author not in author_words:
-            author_words[author] = []
-        author_words[author].append(word.lower())
-
-word_count_dicts: dict[str, int] = {}
-
-wordcloud = WordCloud(
-    width=800,
-    height=400,
-    background_color="black",  # or 'black', or any HTML color
-    colormap="viridis",  # matplotlib colormap ('plasma', 'cool', 'inferno')
-    # font_path="path/to/font.ttf",  # Use a custom font
-    max_words=100,  # Max number of words to include
-    min_font_size=10,
-    max_font_size=100,
-    prefer_horizontal=0.9,  # Between 0 (all vertical) and 1 (all horizontal)
-    scale=2,  # Higher = better resolution
-    contour_color="steelblue",  # Outline color (when using contour_width)
-    contour_width=1,  # For consistent layout between runs
-)
-
 makedirs("output", exist_ok=True)

-worddict: dict[str, int] = {}
-total = 0
-for author in author_words.keys():
-    words = author_words.get(author)
-    if words:
-        worddict[author] = len(words)
-        total += len(words)
-        for word in words:
-            word_count_dicts[word] = word_count_dicts.get(word, 0) + 1
-    test = wordcloud.generate_from_frequencies(  # type: ignore
-        word_count_dicts
-    )
-    test.to_file("output/" + author + ".png")  # type: ignore
+# processRawMessages2 returns the {author: [messages]} mapping the loop below
+# iterates over; processRawMessages returns a plain list without author keys.
+messages_by_author = processRawMessages2(chat)
 
-for author in worddict.keys():
-    count = worddict[author]
-    print(f"{author}: {count}/{total} ({round(count/total*100, ndigits=1)}%)")
+# Render one word cloud per author into output/<author>.png.
+for author, messageList in messages_by_author.items():
+    freq_dict = Counter(processMessageList(messageList))
+    if not freq_dict:
+        # Nothing left after cleaning (e.g. the author only sent media);
+        # generate_from_frequencies raises ValueError on an empty mapping.
+        continue
+    image = wordcloud.generate_from_frequencies(freq_dict)  # type: ignore
+    image.to_file(f"output/{author}.png")  # type: ignore