mirror of
https://github.com/StefBuwalda/whatsapp-wordcloud.git
synced 2025-10-29 18:59:58 +00:00
Created script to turn whatsapp data into a wordmap for each author
This commit is contained in:
4
.gitignore
vendored
4
.gitignore
vendored
@@ -1,3 +1,7 @@
|
|||||||
|
/data
|
||||||
|
/processing
|
||||||
|
/output
|
||||||
|
|
||||||
# Byte-compiled / optimized / DLL files
|
# Byte-compiled / optimized / DLL files
|
||||||
__pycache__/
|
__pycache__/
|
||||||
*.py[cod]
|
*.py[cod]
|
||||||
|
|||||||
69
run.py
Normal file
69
run.py
Normal file
@@ -0,0 +1,69 @@
|
|||||||
|
from re import split, sub, match
|
||||||
|
from wordcloud import WordCloud
|
||||||
|
from os import makedirs
|
||||||
|
|
||||||
|
|
||||||
|
def cleanupMessages(messages: list[str]) -> list[str]:
|
||||||
|
# Remove "", \n, and symbols like , and .
|
||||||
|
temp = [sub(r"([.,?!*()])", "", message) for message in messages]
|
||||||
|
temp = [sub(r"\n", " ", message) for message in temp]
|
||||||
|
temp = [sub(r"[^\x00-\x7F]", "", message) for message in temp]
|
||||||
|
temp = [msg for msg in temp if msg != ""]
|
||||||
|
return temp
|
||||||
|
|
||||||
|
|
||||||
|
# Open and read the chats from the '/data/_chat.txt' file exported by Whatsapp
|
||||||
|
try:
|
||||||
|
file = open("data/_chat.txt", encoding="utf8")
|
||||||
|
chat = file.read()
|
||||||
|
file.close()
|
||||||
|
except FileNotFoundError:
|
||||||
|
print("Sorry, the file /data/_chat.txt does not exist.")
|
||||||
|
exit()
|
||||||
|
|
||||||
|
messages = cleanupMessages(
|
||||||
|
split(r"\d{1,2}/\d{1,2}/\d{2}, \d{1,2}:\d{2}", chat)
|
||||||
|
)
|
||||||
|
|
||||||
|
messages = [s[3:] for s in messages if match(r" - [^ ]+?: ", s)]
|
||||||
|
|
||||||
|
|
||||||
|
author_words: dict[str, list[str]] = {}
|
||||||
|
|
||||||
|
for message in messages:
|
||||||
|
message = sub(":", "", message)
|
||||||
|
author, words = split(r" ", message, maxsplit=1)
|
||||||
|
words = [word for word in words.split() if word and word != " "]
|
||||||
|
for word in words:
|
||||||
|
if author not in author_words:
|
||||||
|
author_words[author] = []
|
||||||
|
author_words[author].append(word.lower())
|
||||||
|
|
||||||
|
word_count_dicts: dict[str, int] = {}
|
||||||
|
|
||||||
|
wordcloud = WordCloud(
|
||||||
|
width=800,
|
||||||
|
height=400,
|
||||||
|
background_color="black", # or 'black', or any HTML color
|
||||||
|
colormap="viridis", # matplotlib colormap ('plasma', 'cool', 'inferno')
|
||||||
|
# font_path="path/to/font.ttf", # Use a custom font
|
||||||
|
max_words=100, # Max number of words to include
|
||||||
|
min_font_size=10,
|
||||||
|
max_font_size=100,
|
||||||
|
prefer_horizontal=0.9, # Between 0 (all vertical) and 1 (all horizontal)
|
||||||
|
scale=2, # Higher = better resolution
|
||||||
|
contour_color="steelblue", # Outline color (when using contour_width)
|
||||||
|
contour_width=1, # For consistent layout between runs
|
||||||
|
)
|
||||||
|
|
||||||
|
makedirs("output", exist_ok=True)
|
||||||
|
|
||||||
|
for author in author_words.keys():
|
||||||
|
words = author_words.get(author)
|
||||||
|
if words:
|
||||||
|
for word in words:
|
||||||
|
word_count_dicts[word] = word_count_dicts.get(word, 0) + 1
|
||||||
|
test = wordcloud.generate_from_frequencies( # type: ignore
|
||||||
|
word_count_dicts
|
||||||
|
)
|
||||||
|
test.to_file("output/" + author + ".png") # type: ignore
|
||||||
Reference in New Issue
Block a user