mirror of
				https://github.com/StefBuwalda/whatsapp-wordcloud.git
				synced 2025-10-29 18:59:58 +00:00 
			
		
		
		
	Merge branch 'better_file_processing'
This commit is contained in:
		
							
								
								
									
										16
									
								
								config.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										16
									
								
								config.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,16 @@ | ||||
| from wordcloud import WordCloud  # type: ignore | ||||
|  | ||||
# Module-level WordCloud instance configured once here; the keyword arguments
# below control the size, colors, and word limits of the rendered image.
wordcloud = WordCloud(
    width=1920,
    height=1080,
    background_color="black",  # any HTML color name works here (e.g. 'white')
    colormap="viridis",  # matplotlib colormap ('plasma', 'cool', 'inferno')
    # font_path="path/to/font.ttf",  # Use a custom font
    max_words=100,  # Max number of words to include
    min_font_size=10,
    max_font_size=200,
    prefer_horizontal=1,  # Between 0 (all vertical) and 1 (all horizontal)
    scale=4,  # Higher = better resolution
    contour_color="steelblue",  # Outline color (when using contour_width)
    # Outline width. NOTE(review): presumably only drawn when a mask is
    # supplied, and unrelated to layout reproducibility -- confirm against the
    # wordcloud documentation.
    contour_width=1,
)
							
								
								
									
										55
									
								
								functions.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										55
									
								
								functions.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,55 @@ | ||||
| from re import split, sub, match | ||||
| from regex import sub as sub2 | ||||
|  | ||||
|  | ||||
def processRawMessages2(chat: str) -> dict[str, list[str]]:
    """Group the messages of an exported chat by author.

    The chat text is processed line by line. A line shaped like
    ``<d>/<d>/<d>, <d>:<d> - <author>: <text>`` starts a new message for
    ``<author>``. A line that carries the date/time prefix but no
    ``<author>: `` part is treated as an action and ignored. Any other
    non-empty line is a continuation of the most recent message of the
    current author.

    Args:
        chat: Full text of the exported chat.

    Returns:
        A dict mapping each author to the list of that author's messages,
        in the order they appear in ``chat``.
    """
    output: dict[str, list[str]] = {}
    author = ""
    for segment in split(r"\n", chat):
        re_match = match(r"\d+/\d+/\d+, \d+:\d+ - ([^:]+): (.*)", segment)
        if re_match:
            # New message: drop date and time, keep author + text.
            author = re_match.group(1)
            output.setdefault(author, []).append(re_match.group(2))
            continue

        if match(r"\d+/\d+/\d+, \d+:\d+ - ", segment):
            # Timestamped line without "author: " -> an action, ignore it.
            continue

        if not segment:
            # Blank line (e.g. the empty piece after a trailing newline):
            # nothing to append, and not worth reporting as an error.
            continue

        segmentList = output.get(author)
        if segmentList:
            # Splitting on "\n" removed the line break; put a separator back
            # so the words on both sides of the break do not fuse together.
            segmentList[-1] += " " + segment
        else:
            # Text appeared before any message header was seen.
            print("ERROR functions.py line 24")
            print(segment)
    return output
|  | ||||
|  | ||||
def processMessageList(messages: list[str]) -> list[str]:
    """Turn a list of messages into a flat list of lower-cased words.

    For each message, http(s) links are removed first, then every run of
    characters matching ``[\\p{Emoji}?!:,.]``. A message that equals
    ``"<Media omitted>"`` after that cleaning contributes nothing; every
    other message is lower-cased and split on whitespace.

    Args:
        messages: Message texts to process.

    Returns:
        All resulting words, in message order.
    """
    words: list[str] = []
    for raw in messages:
        # Strip http(s) links before touching punctuation, since the second
        # pattern would otherwise eat the ":" and "." inside URLs.
        without_links = sub(r"https?://(?:www\.)?\S+", "", raw)
        # Strip emojis and the listed punctuation symbols.
        cleaned = sub2(r"[\p{Emoji}?!:,.]+", "", without_links)
        # Placeholder for attached media: skip it entirely.
        if cleaned == "<Media omitted>":
            continue
        words.extend(cleaned.lower().split())
    return words
|  | ||||
|  | ||||
def processRawMessages(chat: str):
    """Split an exported chat into cleaned ``"author: text"`` strings.

    The chat is cut at every ``d/d/yy, h:mm`` timestamp. From each fragment
    the characters ``. , ? ! * ( )`` are removed, newlines become spaces and
    non-ASCII characters are dropped. Only fragments that then start with
    `` - <author>: `` (author without spaces) are kept, with the leading
    `` - `` removed.

    Returns:
        A list of the kept fragments, in chat order.
    """
    kept = []
    for fragment in split(r"\d{1,2}/\d{1,2}/\d{2}, \d{1,2}:\d{2}", chat):
        fragment = sub(r"([.,?!*()])", "", fragment)
        fragment = sub(r"\n", " ", fragment)
        fragment = sub(r"[^\x00-\x7F]", "", fragment)
        # Empty fragments are discarded; the rest must look like " - name: ".
        if fragment != "" and match(r" - [^ ]+?: ", fragment):
            kept.append(fragment[3:])
    return kept
							
								
								
									
										76
									
								
								run.py
									
									
									
									
									
								
							
							
						
						
									
										76
									
								
								run.py
									
									
									
									
									
								
							| @@ -1,15 +1,10 @@ | ||||
| from re import split, sub, match | ||||
| from wordcloud import WordCloud  # type: ignore | ||||
| from config import wordcloud  # type: ignore | ||||
| from os import makedirs | ||||
|  | ||||
|  | ||||
def cleanupMessages(messages: list[str]) -> list[str]:
    """Clean each message and drop the ones that end up empty.

    For every message the characters ``. , ? ! * ( )`` are removed, newlines
    are replaced by spaces and non-ASCII characters are dropped.

    Args:
        messages: Message texts to clean.

    Returns:
        The cleaned messages, in order, without empty strings.
    """
    cleaned: list[str] = []
    for text in messages:
        text = sub(r"([.,?!*()])", "", text)
        text = sub(r"\n", " ", text)
        text = sub(r"[^\x00-\x7F]", "", text)
        if text != "":
            cleaned.append(text)
    return cleaned
| from functions import ( | ||||
|     processRawMessages, | ||||
|     processMessageList, | ||||
| ) | ||||
| from collections import Counter | ||||
|  | ||||
|  | ||||
| # Open and read the chats from the '/data/_chat.txt' file exported by Whatsapp | ||||
| @@ -21,57 +16,14 @@ except FileNotFoundError: | ||||
|     print("Sorry, the file /data/_chat.txt does not exist.") | ||||
|     exit() | ||||
|  | ||||
| messages = cleanupMessages( | ||||
|     split(r"\d{1,2}/\d{1,2}/\d{2}, \d{1,2}:\d{2}", chat) | ||||
| ) | ||||
|  | ||||
| messages = [s[3:] for s in messages if match(r" - [^ ]+?: ", s)] | ||||
|  | ||||
|  | ||||
| author_words: dict[str, list[str]] = {} | ||||
|  | ||||
| for message in messages: | ||||
|     message = sub(":", "", message) | ||||
|     author, words = split(r" ", message, maxsplit=1) | ||||
|     words = [word for word in words.split() if word and word != " "] | ||||
|     for word in words: | ||||
|         if author not in author_words: | ||||
|             author_words[author] = [] | ||||
|         author_words[author].append(word.lower()) | ||||
|  | ||||
| word_count_dicts: dict[str, int] = {} | ||||
|  | ||||
| wordcloud = WordCloud( | ||||
|     width=800, | ||||
|     height=400, | ||||
|     background_color="black",  # or 'black', or any HTML color | ||||
|     colormap="viridis",  # matplotlib colormap ('plasma', 'cool', 'inferno') | ||||
|     # font_path="path/to/font.ttf",  # Use a custom font | ||||
|     max_words=100,  # Max number of words to include | ||||
|     min_font_size=10, | ||||
|     max_font_size=100, | ||||
|     prefer_horizontal=0.9,  # Between 0 (all vertical) and 1 (all horizontal) | ||||
|     scale=2,  # Higher = better resolution | ||||
|     contour_color="steelblue",  # Outline color (when using contour_width) | ||||
|     contour_width=1,  # For consistent layout between runs | ||||
| ) | ||||
|  | ||||
| makedirs("output", exist_ok=True) | ||||
|  | ||||
| worddict: dict[str, int] = {} | ||||
| total = 0 | ||||
| for author in author_words.keys(): | ||||
|     words = author_words.get(author) | ||||
|     if words: | ||||
|         worddict[author] = len(words) | ||||
|         total += len(words) | ||||
|         for word in words: | ||||
|             word_count_dicts[word] = word_count_dicts.get(word, 0) + 1 | ||||
|     test = wordcloud.generate_from_frequencies(  # type: ignore | ||||
|         word_count_dicts | ||||
|     ) | ||||
|     test.to_file("output/" + author + ".png")  # type: ignore | ||||
| test = processRawMessages(chat) | ||||
|  | ||||
| for author in worddict.keys(): | ||||
|     count = worddict[author] | ||||
|     print(f"{author}: {count}/{total} ({round(count/total*100, ndigits=1)}%)") | ||||
| for author in test.keys(): | ||||
|     messageList = test.get(author) | ||||
|     if messageList: | ||||
|         wordList = processMessageList(messageList) | ||||
|         freq_dict = Counter(wordList) | ||||
|         image = wordcloud.generate_from_frequencies(freq_dict)  # type: ignore | ||||
|         image.to_file(f"output/{author}.png")  # type: ignore | ||||
|   | ||||
		Reference in New Issue
	
	Block a user