From c553167a24c66c9ab849f87eedaea3ce8e332bfb Mon Sep 17 00:00:00 2001 From: Stef Date: Fri, 18 Apr 2025 14:22:42 +0200 Subject: [PATCH 1/3] Separated part of the message processing into separate functions --- functions.py | 12 ++++++++++++ run.py | 19 +++---------------- 2 files changed, 15 insertions(+), 16 deletions(-) create mode 100644 functions.py diff --git a/functions.py b/functions.py new file mode 100644 index 0000000..6a4715b --- /dev/null +++ b/functions.py @@ -0,0 +1,12 @@ +from re import split, sub, match + + +def processRawMessages(chat: str): + temp = split(r"\d{1,2}/\d{1,2}/\d{2}, \d{1,2}:\d{2}", chat) + + temp = [sub(r"([.,?!*()])", "", message) for message in temp] + temp = [sub(r"\n", " ", message) for message in temp] + temp = [sub(r"[^\x00-\x7F]", "", message) for message in temp] + temp = [msg for msg in temp if msg != ""] + + return [s[3:] for s in temp if match(r" - [^ ]+?: ", s)] diff --git a/run.py b/run.py index 5ede2da..69ce22b 100644 --- a/run.py +++ b/run.py @@ -1,15 +1,7 @@ -from re import split, sub, match +from re import split, sub from wordcloud import WordCloud # type: ignore from os import makedirs - - -def cleanupMessages(messages: list[str]) -> list[str]: - # Remove "", \n, and symbols like , and . 
- temp = [sub(r"([.,?!*()])", "", message) for message in messages] - temp = [sub(r"\n", " ", message) for message in temp] - temp = [sub(r"[^\x00-\x7F]", "", message) for message in temp] - temp = [msg for msg in temp if msg != ""] - return temp +from functions import processRawMessages # Open and read the chats from the '/data/_chat.txt' file exported by Whatsapp @@ -21,12 +13,7 @@ except FileNotFoundError: print("Sorry, the file /data/_chat.txt does not exist.") exit() -messages = cleanupMessages( - split(r"\d{1,2}/\d{1,2}/\d{2}, \d{1,2}:\d{2}", chat) -) - -messages = [s[3:] for s in messages if match(r" - [^ ]+?: ", s)] - +messages = processRawMessages(chat) author_words: dict[str, list[str]] = {} From 8b648639f46208293a862c85ef3071e556286b42 Mon Sep 17 00:00:00 2001 From: Stef Date: Fri, 18 Apr 2025 17:04:29 +0200 Subject: [PATCH 2/3] removed old functionality and added new --- config.py | 16 ++++++++++++++ functions.py | 43 ++++++++++++++++++++++++++++++++++++++ run.py | 59 +++++++++++++++------------------------------------- 3 files changed, 76 insertions(+), 42 deletions(-) create mode 100644 config.py diff --git a/config.py b/config.py new file mode 100644 index 0000000..cdb52a7 --- /dev/null +++ b/config.py @@ -0,0 +1,16 @@ +from wordcloud import WordCloud # type: ignore + +wordcloud = WordCloud( + width=800, + height=400, + background_color="black", # or 'black', or any HTML color + colormap="viridis", # matplotlib colormap ('plasma', 'cool', 'inferno') + # font_path="path/to/font.ttf", # Use a custom font + max_words=100, # Max number of words to include + min_font_size=10, + max_font_size=100, + prefer_horizontal=1, # Between 0 (all vertical) and 1 (all horizontal) + scale=2, # Higher = better resolution + contour_color="steelblue", # Outline color (when using contour_width) + contour_width=1, # For consistent layout between runs +) diff --git a/functions.py b/functions.py index 6a4715b..bde83d1 100644 --- a/functions.py +++ b/functions.py @@ -1,4 
+1,47 @@ from re import split, sub, match +from regex import sub as sub2 + + +def processRawMessages2(chat: str) -> dict[str, list[str]]: + output: dict[str, list[str]] = {} + # Split based on new line + segments = split(r"\n", chat) + author = "" + for segment in segments: + re_match = match(r"\d+/\d+/\d+, \d+:\d+ - ([^:]+): (.*)", segment) + if re_match: + # It's a match, get rid of date and time, keep name + message + author = re_match.group(1) + if author not in output: + output[author] = [] + output[author].append(re_match.group(2)) + else: + # Not a match, check if it's an action or continuation of sentence + re_match2 = match(r"\d+/\d+/\d+, \d+:\d+ - ", segment) + if re_match2: + # It's an action, ignore + pass + else: + segmentList = output.get(author) + if segmentList: + segmentList[-1] += segment + else: + print("ERROR functions.py line 24") + print(segment) + return output + + +def processMessageList(messages: list[str]) -> list[str]: + output: list[str] = [] + for message in messages: + # Remove http(s) links + message = sub(r"https?://(?:www\.)?\S+", "", message) + # Remove emojis and symbols + message = sub2(r"[\p{Emoji}?!:,.]+", "", message) + # If it's not added media, add to output + if message != "": + output += message.lower().split() + return output def processRawMessages(chat: str): diff --git a/run.py b/run.py index 69ce22b..e1915f0 100644 --- a/run.py +++ b/run.py @@ -1,7 +1,11 @@ -from re import split, sub -from wordcloud import WordCloud # type: ignore +from config import wordcloud # type: ignore from os import makedirs -from functions import processRawMessages +from functions import ( + processRawMessages, + processRawMessages2, + processMessageList, +) +from collections import Counter # Open and read the chats from the '/data/_chat.txt' file exported by Whatsapp @@ -13,44 +17,15 @@ except FileNotFoundError: print("Sorry, the file /data/_chat.txt does not exist.") exit() -messages = processRawMessages(chat) - -author_words: dict[str, 
list[str]] = {} - -for message in messages: - message = sub(":", "", message) - author, words = split(r" ", message, maxsplit=1) - words = [word for word in words.split() if word and word != " "] - for word in words: - if author not in author_words: - author_words[author] = [] - author_words[author].append(word.lower()) - -word_count_dicts: dict[str, int] = {} - -wordcloud = WordCloud( - width=800, - height=400, - background_color="black", # or 'black', or any HTML color - colormap="viridis", # matplotlib colormap ('plasma', 'cool', 'inferno') - # font_path="path/to/font.ttf", # Use a custom font - max_words=100, # Max number of words to include - min_font_size=10, - max_font_size=100, - prefer_horizontal=0.9, # Between 0 (all vertical) and 1 (all horizontal) - scale=2, # Higher = better resolution - contour_color="steelblue", # Outline color (when using contour_width) - contour_width=1, # For consistent layout between runs -) - makedirs("output", exist_ok=True) -for author in author_words.keys(): - words = author_words.get(author) - if words: - for word in words: - word_count_dicts[word] = word_count_dicts.get(word, 0) + 1 - test = wordcloud.generate_from_frequencies( # type: ignore - word_count_dicts - ) - test.to_file("output/" + author + ".png") # type: ignore +test = processRawMessages2(chat) + +for author in test.keys(): + messageList = test.get(author) + if messageList: + wordList = processMessageList(messageList) + freq_dict = Counter(wordList) + image = wordcloud.generate_from_frequencies(freq_dict) # type: ignore + image.to_file(f"output/{author}.png") # type: ignore +messages = processRawMessages(chat) From c4a0208a5377b9ddb1914847ac842f4da6d3699b Mon Sep 17 00:00:00 2001 From: Stef Date: Fri, 18 Apr 2025 17:08:37 +0200 Subject: [PATCH 3/3] Update config.py --- config.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/config.py b/config.py index cdb52a7..6811eb2 100644 --- a/config.py +++ b/config.py @@ -1,16 +1,16 @@ from 
wordcloud import WordCloud # type: ignore wordcloud = WordCloud( - width=800, - height=400, + width=1920, + height=1080, background_color="black", # or 'black', or any HTML color colormap="viridis", # matplotlib colormap ('plasma', 'cool', 'inferno') # font_path="path/to/font.ttf", # Use a custom font max_words=100, # Max number of words to include min_font_size=10, - max_font_size=100, + max_font_size=200, prefer_horizontal=1, # Between 0 (all vertical) and 1 (all horizontal) - scale=2, # Higher = better resolution + scale=4, # Higher = better resolution contour_color="steelblue", # Outline color (when using contour_width) contour_width=1, # For consistent layout between runs )