From 8b648639f46208293a862c85ef3071e556286b42 Mon Sep 17 00:00:00 2001
From: Stef <stbuwalda@gmail.com>
Date: Fri, 18 Apr 2025 17:04:29 +0200
Subject: [PATCH] Add per-author message parsing helpers and move WordCloud setup to config.py

---
 config.py    | 16 ++++++++++++++
 functions.py | 43 ++++++++++++++++++++++++++++++++++++++
 run.py       | 59 +++++++++++++++-------------------------------------
 3 files changed, 76 insertions(+), 42 deletions(-)
 create mode 100644 config.py
diff --git a/config.py b/config.py
new file mode 100644
index 0000000..cdb52a7
--- /dev/null
+++ b/config.py
@@ -0,0 +1,16 @@
+from wordcloud import WordCloud  # type: ignore
+
+wordcloud = WordCloud(  # Shared instance; run.py imports it to render images
+    width=800,
+    height=400,
+    background_color="black",  # any HTML color, e.g. 'white'
+    colormap="viridis",  # matplotlib colormap ('plasma', 'cool', 'inferno')
+    # font_path="path/to/font.ttf",  # Use a custom font
+    max_words=100,  # Max number of words to include
+    min_font_size=10,
+    max_font_size=100,
+    prefer_horizontal=1,  # Between 0 (all vertical) and 1 (all horizontal)
+    scale=2,  # Higher = better resolution
+    contour_color="steelblue",  # Outline color (when using contour_width)
+    contour_width=1,  # Outline width (docs: drawn only when a mask is set)
+)
diff --git a/functions.py b/functions.py
index 6a4715b..bde83d1 100644
--- a/functions.py
+++ b/functions.py
@@ -1,4 +1,47 @@
 from re import split, sub, match
+from regex import sub as sub2
+
+
+def processRawMessages2(chat: str) -> dict[str, list[str]]:
+    """Group the messages of a WhatsApp chat export by author.
+
+    Lines shaped like "<date>, <time> - <author>: <text>" start a new message;
+    other timestamped lines (actions) are ignored; anything else continues the
+    previous message. Returns a dict mapping author -> list of message texts.
+    """
+    output: dict[str, list[str]] = {}
+    author = ""
+    for segment in split(r"\n", chat):
+        re_match = match(r"\d+/\d+/\d+, \d+:\d+ - ([^:]+): (.*)", segment)
+        if re_match:
+            # New message: drop date and time, keep author and text
+            author = re_match.group(1)
+            output.setdefault(author, []).append(re_match.group(2))
+        elif match(r"\d+/\d+/\d+, \d+:\d+ - ", segment):
+            # Timestamped line without "author:" is an action, ignore it
+            continue
+        elif author in output:
+            # Continuation line: re-insert the separator that split() removed,
+            # otherwise the last and first words of the lines get glued together
+            if segment:
+                output[author][-1] += "\n" + segment
+        else:
+            print("ERROR functions.py line 24")
+            print(segment)
+    return output
+
+
+def processMessageList(messages: list[str]) -> list[str]:
+    """Return the lower-cased words of *messages*, cleaned for counting."""
+    words: list[str] = []
+    for raw in messages:
+        # Strip http(s) links first, then emoji and basic punctuation
+        without_links = sub(r"https?://(?:www\.)?\S+", "", raw)
+        cleaned = sub2(r"[\p{Emoji}?!:,.]+", "", without_links)
+        # "<Media omitted>" stands in for an attachment, not real text
+        if cleaned != "<Media omitted>":
+            words.extend(cleaned.lower().split())
+    return words
 
 
 def processRawMessages(chat: str):
diff --git a/run.py b/run.py
index 69ce22b..e1915f0 100644
--- a/run.py
+++ b/run.py
@@ -1,7 +1,11 @@
-from re import split, sub
-from wordcloud import WordCloud  # type: ignore
+from config import wordcloud  # type: ignore
 from os import makedirs
-from functions import processRawMessages
+from functions import (
+    processRawMessages,
+    processRawMessages2,
+    processMessageList,
+)
+from collections import Counter
 
 
 # Open and read the chats from the '/data/_chat.txt' file exported by Whatsapp
@@ -13,44 +17,15 @@ except FileNotFoundError:
     print("Sorry, the file /data/_chat.txt does not exist.")
     exit()
 
-messages = processRawMessages(chat)
-
-author_words: dict[str, list[str]] = {}
-
-for message in messages:
-    message = sub(":", "", message)
-    author, words = split(r" ", message, maxsplit=1)
-    words = [word for word in words.split() if word and word != " "]
-    for word in words:
-        if author not in author_words:
-            author_words[author] = []
-        author_words[author].append(word.lower())
-
-word_count_dicts: dict[str, int] = {}
-
-wordcloud = WordCloud(
-    width=800,
-    height=400,
-    background_color="black",  # or 'black', or any HTML color
-    colormap="viridis",  # matplotlib colormap ('plasma', 'cool', 'inferno')
-    # font_path="path/to/font.ttf",  # Use a custom font
-    max_words=100,  # Max number of words to include
-    min_font_size=10,
-    max_font_size=100,
-    prefer_horizontal=0.9,  # Between 0 (all vertical) and 1 (all horizontal)
-    scale=2,  # Higher = better resolution
-    contour_color="steelblue",  # Outline color (when using contour_width)
-    contour_width=1,  # For consistent layout between runs
-)
-
 makedirs("output", exist_ok=True)
 
-for author in author_words.keys():
-    words = author_words.get(author)
-    if words:
-        for word in words:
-            word_count_dicts[word] = word_count_dicts.get(word, 0) + 1
-    test = wordcloud.generate_from_frequencies(  # type: ignore
-        word_count_dicts
-    )
-    test.to_file("output/" + author + ".png")  # type: ignore
+# Group the chat by author, then render one word cloud image per author
+messages_by_author = processRawMessages2(chat)
+
+for author, author_messages in messages_by_author.items():
+    if not author_messages:
+        continue
+    frequencies = Counter(processMessageList(author_messages))
+    image = wordcloud.generate_from_frequencies(frequencies)  # type: ignore
+    image.to_file(f"output/{author}.png")  # type: ignore
+messages = processRawMessages(chat)