"""Helpers for turning an exported chat log into per-author word lists."""

from re import match, split, sub

# Prefix that starts every dated line of the export, e.g. "1/2/23, 10:00 - ".
_TIMESTAMP_PATTERN = r"\d+/\d+/\d+, \d+:\d+ - "
# A dated line of the form "<timestamp><author>: <text>".
_MESSAGE_PATTERN = _TIMESTAMP_PATTERN + r"([^:]+): (.*)"


def processRawMessages(chat: str) -> dict[str, list[str]]:
    """Group the messages of an exported chat log by author.

    Each line of *chat* is classified as one of:

    * a message line ``"<d>/<d>/<d>, <h>:<m> - <author>: <text>"``, which
      starts a new message for ``<author>``;
    * an action line, which has the timestamp prefix but no
      ``"<author>: "`` part; it is ignored, together with any
      untimestamped lines that follow it;
    * a continuation line (no timestamp prefix), which is appended to the
      most recent message, separated by the newline that originally
      preceded it.

    Args:
        chat: The full text of the chat export, lines separated by ``"\\n"``.

    Returns:
        A mapping from author name to that author's messages, in the order
        they appear in *chat*.  Multi-line messages keep their inner
        newlines.
    """
    output: dict[str, list[str]] = {}

    segments = split(r"\n", chat)
    # A trailing newline (or an empty input) produces one empty final segment
    # that is not part of any message; drop it so it is neither reported as
    # an error nor glued onto the last message.
    if segments[-1] == "":
        segments.pop()

    author = ""
    in_action = False  # True while the latest dated line was an action.
    for segment in segments:
        message_match = match(_MESSAGE_PATTERN, segment)
        if message_match:
            # Drop the date and time, keep the author and the text.
            author = message_match.group(1)
            output.setdefault(author, []).append(message_match.group(2))
            in_action = False
            continue

        if match(_TIMESTAMP_PATTERN, segment):
            # Dated line without an author: an action, ignore it.
            in_action = True
            continue

        if in_action:
            # Continuation of an ignored action; it must not leak into the
            # previous author's message.
            continue

        author_messages = output.get(author)
        if author_messages:
            # Restore the line break removed by the split so that the words
            # on either side of it do not merge.
            author_messages[-1] += "\n" + segment
        else:
            # Text appeared before the first recognised message.
            print("ERROR functions.py line 24")
            print(segment)
    return output


def processMessageList(messages: list[str]) -> list[str]:
    """Flatten *messages* into a list of lower-cased words.

    For every message, http(s) links are removed, then every run of
    characters that either has the Unicode ``Emoji`` property or is one of
    ``? ! : , .`` is removed.  What remains is lower-cased and split on
    whitespace.

    Args:
        messages: The raw message texts.

    Returns:
        All words of all messages, in order, as a single flat list.
    """
    # Third-party dependency, only needed here; importing it lazily keeps
    # the stdlib-only parser above importable without it.
    from regex import sub as sub2

    output: list[str] = []
    for message in messages:
        # Remove http(s) links.
        message = sub(r"https?://(?:www\.)?\S+", "", message)
        # Remove emojis and basic punctuation.
        # NOTE(review): in the Unicode emoji data the Emoji property also
        # covers the ASCII digits, '#' and '*', so those appear to be
        # stripped as well -- confirm that this is intended.
        message = sub2(r"[\p{Emoji}?!:,.]+", "", message)
        # A message that is empty after cleaning contributes no words.
        # NOTE(review): media placeholders are not filtered out here;
        # verify whether they should be.
        output.extend(message.lower().split())
    return output