mirror of
https://github.com/StefBuwalda/whatsapp-wordcloud.git
synced 2025-10-30 11:19:57 +00:00
refactoring
This commit is contained in:
44
backend/functions.py
Normal file
44
backend/functions.py
Normal file
@@ -0,0 +1,44 @@
|
||||
from re import split, sub, match
|
||||
from regex import sub as sub2
|
||||
|
||||
|
||||
def processRawMessages(chat: str) -> dict[str, list[str]]:
    """Group the messages of an exported chat by author.

    The chat is processed line by line. Three kinds of lines are
    recognised:

    * ``<d>/<d>/<d>, <d>:<d> - <author>: <text>`` starts a new message by
      ``<author>``; only ``<text>`` is kept.
    * A line with the same date/time prefix but no ``<author>: `` part is
      treated as an action line and ignored.
    * Any other line is a continuation of the most recent message and is
      appended to it, separated by a newline so that the words on both
      sides of the line break stay distinct.

    Args:
        chat: Full text of the exported chat.

    Returns:
        A mapping from author name to that author's messages, in the
        order they appear in ``chat``.
    """
    output: dict[str, list[str]] = {}
    # Split based on new line
    segments = split(r"\n", chat)
    # A chat ending in a newline (or an empty chat) yields one empty
    # trailing segment; it is not a real continuation line, so drop it.
    if segments and segments[-1] == "":
        segments.pop()
    author = ""
    for segment in segments:
        re_match = match(r"\d+/\d+/\d+, \d+:\d+ - ([^:]+): (.*)", segment)
        if re_match:
            # It's a match, get rid of date and time, keep name + message
            author = re_match.group(1)
            output.setdefault(author, []).append(re_match.group(2))
            continue
        # Not a match, check if it's an action or continuation of sentence
        if match(r"\d+/\d+/\d+, \d+:\d+ - ", segment):
            # It's an action, ignore
            continue
        segmentList = output.get(author)
        if segmentList:
            # Re-insert the line break removed by the split; without it the
            # last word of the previous line and the first word of this
            # line would be glued together.
            segmentList[-1] += "\n" + segment
        else:
            # Continuation line before any message was seen: nothing to
            # attach it to, so report it and move on.
            print("ERROR functions.py line 24")
            print(segment)
    return output
|
||||
|
||||
|
||||
def processMessageList(messages: list[str]) -> list[str]:
    """Flatten a list of messages into a list of lowercase words.

    For every message, http(s) links are removed first, then every run of
    emoji characters and of the punctuation ``? ! : , .`` is removed. A
    message that is exactly ``<Media omitted>`` after this cleaning is
    skipped; all other messages are lowercased and split on whitespace.

    Args:
        messages: Message texts to process.

    Returns:
        The words of all kept messages, in message order.
    """
    words: list[str] = []
    for raw in messages:
        # Remove http(s) links
        without_links = sub(r"https?://(?:www\.)?\S+", "", raw)
        # Remove emojis and symbols
        # NOTE(review): the Unicode Emoji property presumably also covers
        # the digits 0-9, '#' and '*', so numbers may be stripped here as
        # well - confirm whether that is intended.
        cleaned = sub2(r"[\p{Emoji}?!:,.]+", "", without_links)
        # Skip the placeholder for attached media
        if cleaned == "<Media omitted>":
            continue
        words.extend(cleaned.lower().split())
    return words
|
||||
28
backend/process_data.py
Normal file
28
backend/process_data.py
Normal file
@@ -0,0 +1,28 @@
|
||||
from os import makedirs
|
||||
from backend.functions import (
|
||||
processRawMessages,
|
||||
processMessageList,
|
||||
)
|
||||
from collections import Counter
|
||||
|
||||
|
||||
# Open and read the chat exported by WhatsApp from 'data/_chat.txt'
# (the path is relative to the current working directory).
try:
    # The context manager closes the file even if reading fails part-way.
    with open("data/_chat.txt", encoding="utf8") as file:
        chat = file.read()
except FileNotFoundError:
    print("Sorry, the file /data/_chat.txt does not exist.")
    # `exit()` is a helper installed by the `site` module and is not
    # guaranteed to exist; raising SystemExit ends the script with the
    # same (zero) exit status.
    raise SystemExit

# Make sure the output directory exists.
makedirs("output", exist_ok=True)

# Messages grouped by author.
test = processRawMessages(chat)

# Per author: how often each word occurs in that author's messages.
frequency_dictionary: dict[str, dict[str, int]] = {}

for author, messageList in test.items():
    # An author without messages keeps an empty frequency table.
    frequency_dictionary[author] = (
        Counter(processMessageList(messageList)) if messageList else {}
    )
|
||||
Reference in New Issue
Block a user