Python 3 Chinese word cloud production

Two Python libraries needed in this article
jieba: a Chinese word segmentation tool
Wordcloud: a word cloud generation tool in Python

The source text used in this article is 309 posts from Han Han's Sina blog
Python 3 crawler Han Han Sina blog post
Python + wordcloud + jieba: learn to generate a word cloud from any Chinese text in ten minutes

This time, no rule-based logic has been written to remove high-frequency words in code. A possible improvement is to add a word-frequency sorting step before generating the word cloud and to put every word that occurs more than a certain number of times into the filter list. The code below, adapted from the reference above, instead relies on a manually maintained stop-word list to remove high-frequency words.

from os import path
from scipy.misc import imread
import matplotlib.pyplot as plt
import jieba
# jieba.load_userdict("txt\userdict.txt")
# Add the user lexicon as the main dictionary, and the original dictionary becomes the non main dictionary
from wordcloud import WordCloud, ImageColorGenerator

# Base directory for every input and output file of this script.
# path.dirname('.') evaluates to '', so the paths below resolve against the
# current working directory. path.dirname(__file__) would point at the
# script's own folder instead, but reportedly raises an error when this line
# is run on its own inside an IDE.
d = path.dirname('.')

# --- Input files ---
text_path = 'test.txt'  # text to analyse
stopwords_path = 'stopword.txt'  # stop-word list, one entry per line
back_coloring_path = "123.jpg"  # background picture (mask and colour source)
font_path = 'simkai.ttf'  # Chinese font handed to WordCloud

# --- Output files ---
# Picture 1: the cloud only follows the shape of the background picture.
imgname1 = "WordCloudDefautColors.png"
# Picture 2: the cloud is also coloured after the background picture.
imgname2 = "WordCloudColorsByImg.png"

# --- Segmentation options ---
isCN = 1  # truthy value enables Chinese word segmentation (on by default)
my_words_list = ['Han Han']  # extra words to register with jieba
stopwords = {}  # not referenced by the visible code

# Load the background picture; it is passed to WordCloud as the mask below.
# NOTE(review): scipy.misc.imread is gone from recent SciPy releases --
# confirm the installed SciPy still provides it.
mask_file = path.join(d, back_coloring_path)
back_coloring = imread(mask_file)

# Configure the word cloud.
wc = WordCloud(
    font_path=font_path,       # font used to draw the words
    mask=back_coloring,        # background picture used as the mask
    background_color="white",  # canvas colour
    max_words=2000,            # upper bound on the number of words shown
    max_font_size=100,         # largest font size
    # Default picture size; when a background picture is used, the saved
    # picture takes the size of that picture instead.
    width=1000,
    height=860,
    margin=2,                  # distance kept around each word
    random_state=42,           # fixed seed
)

# Register user-supplied words with jieba
def add_word(list):
    """Pass every entry of *list* to ``jieba.add_word``.

    Args:
        list: Iterable of words to register. The parameter name shadows the
            builtin ``list``; it is kept so existing callers keep working.

    Returns:
        None.
    """
    for entry in list:
        jieba.add_word(entry)

# Register the custom words before any text is segmented.
add_word(my_words_list)

# Read the whole input text. The context manager closes the file handle
# (the bare open(...).read() left it open), and the explicit encoding keeps
# the result independent of the platform's locale default.
# NOTE(review): assumes the text file is UTF-8 encoded -- confirm.
with open(path.join(d, text_path), encoding="utf-8") as text_file:
    text = text_file.read()

def jiebaclearText(text):
    """Segment *text* with jieba and drop stop words and very short tokens.

    The stop words are read from the file named by the module-level
    ``stopwords_path``, one entry per line, decoded as UTF-8.

    Args:
        text: The raw text to segment.

    Returns:
        A string of the retained tokens separated by single spaces. A token
        is retained when, after stripping surrounding whitespace, it is
        longer than one character and is not a stop word.

    Raises:
        OSError: If the stop-word file cannot be opened.
        UnicodeDecodeError: If the stop-word file is not valid UTF-8.
    """
    # The context manager closes the file even if reading fails; the explicit
    # encoding avoids depending on the platform's locale default.
    with open(stopwords_path, encoding="utf-8") as f_stop:
        # A set gives constant-time membership tests. Entries are stripped so
        # that trailing whitespace in the file cannot prevent a match.
        stop_words = {line.strip() for line in f_stop}

    # Iterate over the tokens directly instead of joining them with "/" and
    # splitting again, so a "/" in the input cannot disturb token boundaries.
    tokens = (token.strip() for token in jieba.cut(text, cut_all=False))
    return " ".join(
        word for word in tokens
        if len(word) > 1 and word not in stop_words
    )

# wordcloud does not segment Chinese well on its own, so the text is
# segmented with jieba first whenever Chinese mode is enabled.
if isCN:
    text = jiebaclearText(text)

# Build the word cloud from the whole text.
wc.generate(text)
# Alternative: compute the word frequencies yourself and call
# wc.generate_from_frequencies(txt_freq)
# NOTE(review): current wordcloud releases expect txt_freq to be a mapping,
# e.g. {'word a': 100, 'word b': 90, 'word c': 80} -- confirm for the
# installed version.

# Show the word cloud in its default colours.
plt.figure()
plt.imshow(wc)
plt.axis("off")
plt.show()

# Save the default-coloured word cloud.
wc.to_file(path.join(d, imgname1))

# Build a colour function from the colours of the background picture.
image_colors = ImageColorGenerator(back_coloring)

# Show the word cloud recoloured after the background picture. A figure is
# opened explicitly so the image cannot be drawn onto the previous figure
# when plt.show() above returned without closing it.
plt.figure()
plt.imshow(wc.recolor(color_func=image_colors))
plt.axis("off")

# Show the background picture itself in a separate figure.
plt.figure()
plt.imshow(back_coloring, cmap=plt.cm.gray)
plt.axis("off")
plt.show()

# Save the recoloured word cloud.
# NOTE(review): relies on recolor() above having updated wc in place --
# confirm against the wordcloud documentation.
wc.to_file(path.join(d, imgname2))

Tags: Python

Posted on Sun, 05 Apr 2020 07:34:06 -0400 by billiondevil