refactoring

This commit is contained in:
2025-04-18 17:48:05 +02:00
parent d75536b137
commit d2d7733c35
3 changed files with 12 additions and 6 deletions

44
backend/functions.py Normal file
View File

@@ -0,0 +1,44 @@
from re import split, sub, match
from regex import sub as sub2
def processRawMessages(chat: str) -> dict[str, list[str]]:
    """Group the messages of an exported chat log by author.

    Every line of ``chat`` is classified as one of:

    * a message header of the form ``<d>/<d>/<d>, <h>:<m> - <author>: <text>``,
      which starts a new message for ``<author>``;
    * an action line, which carries the date/time prefix but no
      ``<author>: `` part, and is ignored;
    * a blank line, which is skipped;
    * anything else, which is treated as a continuation of the most recent
      message of the current author and appended to it on a new line.

    Args:
        chat: The complete chat log as a single string.

    Returns:
        A dict mapping each author to the list of their messages in order of
        appearance. Multi-line messages keep their internal line breaks.
    """
    output: dict[str, list[str]] = {}
    author = ""
    # Split based on new line
    for segment in split(r"\n", chat):
        re_match = match(r"\d+/\d+/\d+, \d+:\d+ - ([^:]+): (.*)", segment)
        if re_match:
            # It's a match, get rid of date and time, keep name + message
            author = re_match.group(1)
            output.setdefault(author, []).append(re_match.group(2))
            continue
        if match(r"\d+/\d+/\d+, \d+:\d+ - ", segment):
            # Date/time prefix without "author: " -> it's an action, ignore
            continue
        if not segment:
            # Blank line (e.g. the trailing newline of the file): it carries
            # no text, so it must neither alter a message nor be reported.
            continue
        # Continuation of the previous message of the current author
        segmentList = output.get(author)
        if segmentList:
            # Re-insert the line break removed by the split; without it the
            # last word of one line and the first word of the next would fuse.
            segmentList[-1] += "\n" + segment
        else:
            # Text appeared before any message header was seen
            print("ERROR functions.py line 24")
            print(segment)
    return output
def processMessageList(messages: list[str]) -> list[str]:
    """Flatten a list of messages into a list of lower-cased words.

    For each message, http(s) links are removed first, then every run of
    characters matched by ``[\\p{Emoji}?!:,.]``. A message that is exactly
    ``<Media omitted>`` after this cleaning is dropped; any other message is
    lower-cased and split on whitespace.

    Args:
        messages: The raw message texts.

    Returns:
        All words of all kept messages, in their original order.
    """
    words: list[str] = []
    for raw in messages:
        # Strip http(s) links
        without_links = sub(r"https?://(?:www\.)?\S+", "", raw)
        # Strip emojis and the listed punctuation
        # NOTE(review): the Unicode Emoji property also covers ASCII digits,
        # '#' and '*', so those appear to be removed too -- confirm intended.
        cleaned = sub2(r"[\p{Emoji}?!:,.]+", "", without_links)
        # Placeholder for attached media carries no words
        if cleaned == "<Media omitted>":
            continue
        words.extend(cleaned.lower().split())
    return words

28
backend/process_data.py Normal file
View File

@@ -0,0 +1,28 @@
from os import makedirs
from backend.functions import (
processRawMessages,
processMessageList,
)
from collections import Counter
# Open and read the chats from the '/data/_chat.txt' file exported by Whatsapp
try:
    # The context manager closes the file even if reading or decoding fails.
    with open("data/_chat.txt", encoding="utf8") as file:
        chat = file.read()
except FileNotFoundError:
    print("Sorry, the file /data/_chat.txt does not exist.")
    # Raise SystemExit directly: the `exit` helper is injected by the `site`
    # module for interactive use and is not guaranteed to exist in programs.
    raise SystemExit
# Make sure the output directory exists (no error if it already does).
makedirs("output", exist_ok=True)
# author -> list of that author's messages
test = processRawMessages(chat)
# author -> {word: number of occurrences}
frequency_dictionary: dict[str, dict[str, int]] = {}
for author, messageList in test.items():
    # An author with no messages keeps an empty mapping.
    frequency_dictionary[author] = (
        Counter(processMessageList(messageList)) if messageList else {}
    )