mirror of
https://github.com/StefBuwalda/whatsapp-wordcloud.git
synced 2025-11-01 12:19:57 +00:00
removed old functionality and added new
This commit is contained in:
16
config.py
Normal file
16
config.py
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
from wordcloud import WordCloud # type: ignore
|
||||||
|
|
||||||
|
# Module-level WordCloud instance; run.py imports it and calls
# generate_from_frequencies() on it once per author.
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color="black",  # or 'white', or any other HTML color name
    colormap="viridis",  # matplotlib colormap ('plasma', 'cool', 'inferno')
    # font_path="path/to/font.ttf",  # Use a custom font
    max_words=100,  # Max number of words to include
    min_font_size=10,
    max_font_size=100,
    prefer_horizontal=1,  # Between 0 (all vertical) and 1 (all horizontal)
    scale=2,  # Higher = better resolution
    # NOTE: the two contour options only take effect when a `mask` is also
    # passed to WordCloud; no mask is configured here.
    contour_color="steelblue",  # Outline color (when using contour_width)
    contour_width=1,  # Outline thickness in pixels (0 disables the outline)
)
|
||||||
43
functions.py
43
functions.py
@@ -1,4 +1,47 @@
|
|||||||
from re import split, sub, match
|
from re import split, sub, match
|
||||||
|
from regex import sub as sub2
|
||||||
|
|
||||||
|
|
||||||
|
def processRawMessages2(chat: str) -> dict[str, list[str]]:
    """Group the messages of an exported chat by author.

    Every line of *chat* is classified as one of:

    * a new message, shaped like ``<d>/<m>/<y>, <h>:<mm> - <author>: <text>``;
    * an action line, which has the date/time prefix but no ``author:`` part
      and is ignored;
    * anything else, which is treated as the continuation of the most recent
      message and appended to it.

    Args:
        chat: Full text of the exported chat, lines separated by ``\\n``.

    Returns:
        A dict mapping each author name to the list of that author's
        messages, in the order they appear in *chat*. Multi-line messages
        keep their internal line breaks.
    """
    output: dict[str, list[str]] = {}
    author = ""
    # Split based on new line
    for segment in split(r"\n", chat):
        # Blank lines carry no words. Skipping them also keeps the trailing
        # newline of the export (and empty input) from being reported as an
        # orphaned continuation line.
        if not segment.strip():
            continue

        re_match = match(r"\d+/\d+/\d+, \d+:\d+ - ([^:]+): (.*)", segment)
        if re_match:
            # It's a match, get rid of date and time, keep name + message
            author = re_match.group(1)
            output.setdefault(author, []).append(re_match.group(2))
            continue

        if match(r"\d+/\d+/\d+, \d+:\d+ - ", segment):
            # Date/time prefix without "author:" -> it's an action, ignore
            continue

        # Continuation of the previous message.
        segmentList = output.get(author)
        if segmentList:
            # Put back the line break that the split removed; without it the
            # last word of the previous line and the first word of this one
            # would merge into a single bogus word.
            segmentList[-1] += "\n" + segment
        else:
            # Continuation text before any message was seen.
            print("ERROR functions.py line 24")
            print(segment)
    return output
|
||||||
|
|
||||||
|
|
||||||
|
def processMessageList(messages: list[str]) -> list[str]:
    """Turn a list of chat messages into a flat list of lower-cased words.

    For each message, http(s) links are removed first, then runs of emoji and
    the punctuation characters ``? ! : , .``. A message that is exactly
    ``<Media omitted>`` after this clean-up contributes no words; every other
    message is lower-cased and split on whitespace.

    Args:
        messages: Message texts to process.

    Returns:
        All words from all kept messages, in message order.
    """
    words: list[str] = []
    for raw in messages:
        # Remove http(s) links
        without_links = sub(r"https?://(?:www\.)?\S+", "", raw)
        # Remove emojis and symbols
        # NOTE(review): the Unicode Emoji property also includes the ASCII
        # digits, '#' and '*', so this presumably strips numbers as well;
        # confirm that is intended.
        cleaned = sub2(r"[\p{Emoji}?!:,.]+", "", without_links)
        # Placeholder the export uses for attachments: nothing to count
        if cleaned == "<Media omitted>":
            continue
        words.extend(cleaned.lower().split())
    return words
|
||||||
|
|
||||||
|
|
||||||
def processRawMessages(chat: str):
|
def processRawMessages(chat: str):
|
||||||
|
|||||||
59
run.py
59
run.py
@@ -1,7 +1,11 @@
|
|||||||
from re import split, sub
|
from config import wordcloud # type: ignore
|
||||||
from wordcloud import WordCloud # type: ignore
|
|
||||||
from os import makedirs
|
from os import makedirs
|
||||||
from functions import processRawMessages
|
from functions import (
|
||||||
|
processRawMessages,
|
||||||
|
processRawMessages2,
|
||||||
|
processMessageList,
|
||||||
|
)
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
|
|
||||||
# Open and read the chats from the '/data/_chat.txt' file exported by Whatsapp
|
# Open and read the chats from the '/data/_chat.txt' file exported by Whatsapp
|
||||||
@@ -13,44 +17,15 @@ except FileNotFoundError:
|
|||||||
print("Sorry, the file /data/_chat.txt does not exist.")
|
print("Sorry, the file /data/_chat.txt does not exist.")
|
||||||
exit()
|
exit()
|
||||||
|
|
||||||
messages = processRawMessages(chat)
|
|
||||||
|
|
||||||
author_words: dict[str, list[str]] = {}
|
|
||||||
|
|
||||||
for message in messages:
|
|
||||||
message = sub(":", "", message)
|
|
||||||
author, words = split(r" ", message, maxsplit=1)
|
|
||||||
words = [word for word in words.split() if word and word != " "]
|
|
||||||
for word in words:
|
|
||||||
if author not in author_words:
|
|
||||||
author_words[author] = []
|
|
||||||
author_words[author].append(word.lower())
|
|
||||||
|
|
||||||
word_count_dicts: dict[str, int] = {}
|
|
||||||
|
|
||||||
wordcloud = WordCloud(
|
|
||||||
width=800,
|
|
||||||
height=400,
|
|
||||||
background_color="black", # or 'black', or any HTML color
|
|
||||||
colormap="viridis", # matplotlib colormap ('plasma', 'cool', 'inferno')
|
|
||||||
# font_path="path/to/font.ttf", # Use a custom font
|
|
||||||
max_words=100, # Max number of words to include
|
|
||||||
min_font_size=10,
|
|
||||||
max_font_size=100,
|
|
||||||
prefer_horizontal=0.9, # Between 0 (all vertical) and 1 (all horizontal)
|
|
||||||
scale=2, # Higher = better resolution
|
|
||||||
contour_color="steelblue", # Outline color (when using contour_width)
|
|
||||||
contour_width=1, # For consistent layout between runs
|
|
||||||
)
|
|
||||||
|
|
||||||
makedirs("output", exist_ok=True)
|
makedirs("output", exist_ok=True)
|
||||||
|
|
||||||
for author in author_words.keys():
|
test = processRawMessages2(chat)
|
||||||
words = author_words.get(author)
|
|
||||||
if words:
|
for author in test.keys():
|
||||||
for word in words:
|
messageList = test.get(author)
|
||||||
word_count_dicts[word] = word_count_dicts.get(word, 0) + 1
|
if messageList:
|
||||||
test = wordcloud.generate_from_frequencies( # type: ignore
|
wordList = processMessageList(messageList)
|
||||||
word_count_dicts
|
freq_dict = Counter(wordList)
|
||||||
)
|
image = wordcloud.generate_from_frequencies(freq_dict) # type: ignore
|
||||||
test.to_file("output/" + author + ".png") # type: ignore
|
image.to_file(f"output/{author}.png") # type: ignore
|
||||||
|
messages = processRawMessages(chat)
|
||||||
|
|||||||
Reference in New Issue
Block a user