# NLTK Basics: Corpora, Tokenization, and POS Tagging

## 1. NLTK Overview

### 1.1 Introduction to NLTK

```python
import nltk
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')

print("=" * 60)
print("NLTK: Natural Language Toolkit")
print("=" * 60)

# Overview diagram of NLTK's main feature modules
fig, ax = plt.subplots(figsize=(12, 8))
ax.axis('off')

# Central node
center = plt.Circle((0.5, 0.5), 0.12, color='lightcoral', ec='black')
ax.add_patch(center)
ax.text(0.5, 0.5, 'NLTK', ha='center', va='center', fontsize=12, fontweight='bold')

# Surrounding feature modules
modules = {
    'Corpus': (0.15, 0.75),
    'Tokenize': (0.85, 0.75),
    'POS Tag': (0.15, 0.25),
    'NER': (0.85, 0.25),
    'Stem': (0.5, 0.85),
    'Parsing': (0.5, 0.15),
}
for module, (x, y) in modules.items():
    circle = plt.Circle((x, y), 0.09, color='lightblue', ec='black')
    ax.add_patch(circle)
    ax.text(x, y, module, ha='center', va='center', fontsize=7)
    # Connect each module to the center
    ax.annotate('', xy=(x, y), xytext=(0.5, 0.5),
                arrowprops=dict(arrowstyle='-', color='gray', lw=1, alpha=0.5))

ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
ax.set_title('NLTK Feature Modules', fontsize=14)
plt.tight_layout()
plt.show()

print("\nNLTK highlights:")
print("  - Rich educational resources, well suited for teaching")
print("  - Ships with a large collection of built-in corpora")
print("  - Complete, readable algorithm implementations")
print("  - A standard tool in academic research")

# Download the required NLTK data packages
print("\nDownloading NLTK data...")
try:
    nltk.data.find('tokenizers/punkt')
    print("✓ Data already present")
except LookupError:
    nltk.download('punkt')
    # Newer NLTK releases may additionally require 'punkt_tab'
    nltk.download('averaged_perceptron_tagger')
    nltk.download('maxent_ne_chunker')
    nltk.download('words')
    nltk.download('stopwords')
    nltk.download('wordnet')
    nltk.download('brown')
    nltk.download('gutenberg')
    nltk.download('reuters')  # used in section 2.1
    print("✓ Download complete")
```

## 2. Corpora

### 2.1 Built-in Corpora

```python
from nltk.corpus import gutenberg, brown, reuters, stopwords, wordnet

def corpora_demo():
    """Tour of NLTK's built-in corpora."""
    print("\n" + "=" * 60)
    print("Built-in NLTK Corpora")
    print("=" * 60)

    # 1. Gutenberg corpus (classic literature)
    print("\nGutenberg corpus:")
    print(f"  Files: {gutenberg.fileids()}")
    print(f"  Words in Shakespeare's Hamlet: {len(gutenberg.words('shakespeare-hamlet.txt'))}")
    print(f"  First 10 words of the Bible: {gutenberg.words('bible-kjv.txt')[:10]}")

    # 2. Brown corpus (categorized text)
    print("\nBrown corpus:")
    print(f"  Categories: {brown.categories()}")
    print(f"  Total words: {len(brown.words())}")
    print(f"  News-category words: {brown.words(categories='news')[:10]}")

    # 3. Reuters corpus (newswire)
    print("\nReuters corpus:")
    print(f"  Number of files: {len(reuters.fileids())}")
    print(f"  Number of categories: {len(reuters.categories())}")
    print(f"  Start of the first article: {reuters.words(reuters.fileids()[0])[:15]}")

    # 4. Stopword lists
    stop_words = set(stopwords.words('english'))
    print("\nStopwords:")
    print(f"  Number of English stopwords: {len(stop_words)}")
    print(f"  20 sample stopwords: {list(stop_words)[:20]}")

    # 5. WordNet lexical database
    print("\nWordNet:")
    print(f"  Number of synsets: {len(list(wordnet.all_synsets()))}")
    print(f"  Nouns: {len(list(wordnet.all_synsets('n')))}")
    print(f"  Verbs: {len(list(wordnet.all_synsets('v')))}")

corpora_demo()
```

### 2.2 Working with Corpora

```python
from nltk.corpus import gutenberg, brown
from nltk.probability import FreqDist
import matplotlib.pyplot as plt

def corpus_operations():
    """Basic corpus operations."""
    print("\n" + "=" * 60)
    print("Basic Corpus Operations")
    print("=" * 60)

    # 1. Corpus-level statistics
    print("\nCorpus statistics:")
    print(f"  Brown word count: {len(brown.words()):,}")
    print(f"  Brown sentence count: {len(brown.sents()):,}")

    # 2. Word frequencies
    hamlet_words = gutenberg.words('shakespeare-hamlet.txt')
    fdist = FreqDist(hamlet_words)
    print("\nWord frequencies (Hamlet):")
    print(f"  Vocabulary size: {len(fdist)}")
    print(f"  10 most common words: {fdist.most_common(10)}")

    # 3. Word-length distribution
    word_lengths = [len(w) for w in hamlet_words if w.isalpha()]
    print("\nWord lengths:")
    print(f"  Average length: {sum(word_lengths) / len(word_lengths):.2f}")
    print(f"  Longest word: {max(hamlet_words, key=len)}")

    # 4. Vocabulary differences between categories
    news_words = brown.words(categories='news')
    fiction_words = brown.words(categories='fiction')
    news_vocab = set(news_words)
    fiction_vocab = set(fiction_words)
    print("\nVocabulary comparison across categories:")
    print(f"  News vocabulary: {len(news_vocab)}")
    print(f"  Fiction vocabulary: {len(fiction_vocab)}")
    print(f"  Shared words: {len(news_vocab & fiction_vocab)}")
    print(f"  News-only: {len(news_vocab - fiction_vocab)}")
    print(f"  Fiction-only: {len(fiction_vocab - news_vocab)}")
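
    # (Added aside, not in the original walkthrough) The counts above can be
    # summarized in one number with the Jaccard index |A ∩ B| / |A ∪ B|;
    # a minimal sketch using the two vocabularies already computed:
    jaccard = len(news_vocab & fiction_vocab) / len(news_vocab | fiction_vocab)
    print(f"  Jaccard similarity (news vs. fiction): {jaccard:.3f}")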

    # 5. Plot the frequency distribution
    top_words = fdist.most_common(20)
    words, freqs = zip(*top_words)
    plt.figure(figsize=(12, 6))
    plt.bar(words, freqs)
    plt.xlabel('Word')
    plt.ylabel('Frequency')
    plt.title('Hamlet: Top 20 Words')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

corpus_operations()
```

### 2.3 Custom Corpora

```python
from nltk.corpus import PlaintextCorpusReader
from nltk.probability import FreqDist
import os

def custom_corpus_nltk():
    """Building and reading a custom corpus."""
    print("\n" + "=" * 60)
    print("Custom Corpora")
    print("=" * 60)

    # Create a directory for the custom corpus
    corpus_root = './my_corpus'
    os.makedirs(corpus_root, exist_ok=True)

    # Write a couple of sample documents
    doc1_content = """\
Natural Language Processing (NLP) is a subfield of artificial intelligence.
It focuses on the interaction between computers and human language.
NLP applications include machine translation, sentiment analysis, and chatbots.
"""
    doc2_content = """\
Machine Learning is a method of data analysis that automates analytical
model building. It is a branch of artificial intelligence based on the
idea that systems can learn from data.
"""
    with open(f'{corpus_root}/nlp.txt', 'w') as f:
        f.write(doc1_content)
    with open(f'{corpus_root}/ml.txt', 'w') as f:
        f.write(doc2_content)

    # Load the custom corpus
    reader = PlaintextCorpusReader(corpus_root, r'.*\.txt')
    print("\nCustom corpus:")
    print(f"  Files: {reader.fileids()}")
    print(f"  Total words: {len(reader.words())}")
    print(f"  Total sentences: {len(reader.sents())}")

    # Per-file statistics
    print("\nPer-file statistics:")
    for fileid in reader.fileids():
        words = reader.words(fileid)
        sents = reader.sents(fileid)
        print(f"  {fileid}: {len(words)} words, {len(sents)} sentences")

    # Overall word frequencies
    fdist = FreqDist(reader.words())
    print("\nOverall word frequencies (top 10):")
    for word, freq in fdist.most_common(10):
        print(f"  {word}: {freq}")

    # Save the processed corpus, one sentence per line
    with open(f'{corpus_root}/processed.txt', 'w') as f:
        for sent in reader.sents():
            f.write(' '.join(sent) + '\n')
    print("\n✓ Processed corpus saved")

custom_corpus_nltk()
```

## 3. Tokenization

### 3.1 Sentence and Word Tokenization

```python
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize import WordPunctTokenizer, RegexpTokenizer
from nltk.tokenize import WhitespaceTokenizer, TweetTokenizer

def tokenization_nltk():
    """Tokenization with NLTK."""
    print("\n" + "=" * 60)
    print("Tokenization")
    print("=" * 60)

    text = ("Hello world! This is NLTK. It's a great library for NLP. "
            'Dr. Smith said: "NLP is amazing!"')

    # 1. Sentence tokenization
    print("\nSentence tokenization (sent_tokenize):")
    sentences = sent_tokenize(text)
    for i, sent in enumerate(sentences, 1):
        print(f"  Sentence {i}: {sent}")

    # 2. Word tokenization
    print("\nWord tokenization (word_tokenize):")
    words = word_tokenize(text)
    print(f"  Result: {words}")

    # 3. Word/punctuation tokenization
    print("\nWordPunctTokenizer:")
    tokenizer = WordPunctTokenizer()
    punct_tokens = tokenizer.tokenize(text)
    print(f"  Result: {punct_tokens}")

    # 4. Regular-expression tokenization (keep alphanumeric runs only)
    print("\nRegexpTokenizer:")
    tokenizer = RegexpTokenizer(r'\w+')
    regex_tokens = tokenizer.tokenize(text)
    print(f"  Result: {regex_tokens}")

    # 5. Whitespace tokenization
    print("\nWhitespaceTokenizer:")
    tokenizer = WhitespaceTokenizer()
    ws_tokens = tokenizer.tokenize(text)
    print(f"  Result: {ws_tokens}")

    # 6. Tweet tokenization (handles emoticons, hashtags, mentions, URLs)
    print("\nTweetTokenizer:")
    tweet = "I love Python! #NLP @user https://example.com"
    tokenizer = TweetTokenizer()
    tweet_tokens = tokenizer.tokenize(tweet)
    print(f"  Result: {tweet_tokens}")
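
    # (Added aside, not in the original walkthrough) sent_tokenize accepts a
    # `language` argument, since the bundled Punkt models cover several
    # European languages; a minimal sketch, assuming the standard punkt
    # download from section 1.1:
    german = "Das ist ein Satz. Hier ist noch einer."
    print(f"\n  German sentences: {sent_tokenize(german, language='german')}")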

    # 7. Tokenizing other languages
    print("\nChinese tokenization example:")
    chinese_text = '自然语言处理是人工智能的重要分支广泛应用于机器翻译、情感分析等领域。'
    chinese_tokens = word_tokenize(chinese_text)
    # word_tokenize is built for space-delimited languages, so it cannot
    # segment Chinese words; it only splits at punctuation here.
    print(f"  Result: {chinese_tokens}")

tokenization_nltk()
```

### 3.2 Tokenization in Practice

```python
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
import matplotlib.pyplot as plt

def tokenization_practice():
    """A small tokenization pipeline: split, clean, filter, count."""
    print("\n" + "=" * 60)
    print("Tokenization in Practice")
    print("=" * 60)

    text = """
Natural Language Processing (NLP) is a subfield of linguistics, computer
science, and artificial intelligence concerned with the interactions
between computers and human language. It is used to analyze text,
allowing machines to understand how humans speak.

NLP is used in many real-world applications like:
- Machine Translation (Google Translate)
- Sentiment Analysis (Social Media Monitoring)
- Chatbots and Virtual Assistants (Siri, Alexa)
- Text Summarization
- Speech Recognition
"""

    # 1. Sentence tokenization
    print("\nSentence tokenization:")
    sentences = sent_tokenize(text)
    print(f"  Number of sentences: {len(sentences)}")
    for i, sent in enumerate(sentences[:3], 1):
        print(f"  Sentence {i}: {sent[:80]}...")

    # 2. Word tokenization and cleaning
    print("\nWord tokenization and cleaning:")
    words = word_tokenize(text.lower())
    # Drop punctuation and numbers
    cleaned_words = [w for w in words if w.isalpha()]
    print(f"  Raw tokens: {len(words)}")
    print(f"  After cleaning: {len(cleaned_words)}")
    print(f"  First 20 words: {cleaned_words[:20]}")

    # 3. Stopword removal
    stop_words = set(stopwords.words('english'))
    filtered_words = [w for w in cleaned_words if w not in stop_words]
    print("\nAfter stopword removal:")
    print(f"  Words: {len(filtered_words)}")
    print(f"  First 20 words: {filtered_words[:20]}")

    # 4. Word frequencies
    fdist = FreqDist(filtered_words)
    print("\nWord frequencies (top 15):")
    for word, freq in fdist.most_common(15):
        print(f"  {word}: {freq}")

    # 5. Plot the frequencies
    top_words = fdist.most_common(15)
    words, freqs = zip(*top_words)
    plt.figure(figsize=(12, 6))
    plt.bar(words, freqs, color='skyblue')
    plt.xlabel('Word')
    plt.ylabel('Frequency')
    plt.title('Word Frequencies in the NLP Text')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # 6. Lexical diversity
    vocab_size = len(set(filtered_words))
    total_words = len(filtered_words)
    print("\nLexical diversity:")
    print(f"  Vocabulary size: {vocab_size}")
    print(f"  Total words: {total_words}")
    print(f"  Type-token ratio (TTR): {vocab_size / total_words:.3f}")

tokenization_practice()
```

## 4. POS Tagging

### 4.1 POS Tagging Basics

```python
from nltk import pos_tag
from nltk.tokenize import word_tokenize

def pos_tagging_nltk():
    """Part-of-speech tagging basics."""
    print("\n" + "=" * 60)
    print("Part-of-Speech Tagging")
    print("=" * 60)

    text = 'The quick brown fox jumps over the lazy dog near the river bank.'
    words = word_tokenize(text)

    # 1. Basic tagging
    print("\nBasic POS tagging:")
    tags = pos_tag(words)
    print(f"{'Word':15} {'Tag':8}")
    print('-' * 25)
    for word, tag in tags:
        print(f'{word:15} {tag:8}')
    # Note: nltk.help.upenn_tagset('NN') prints a tag's full definition; it
    # returns None, so it cannot be embedded in an f-string column.

    # 2. Common Penn Treebank tags
    print("\nCommon POS tags:")
    common_tags = {
        'NN': 'noun, singular', 'NNS': 'noun, plural',
        'NNP': 'proper noun, singular', 'NNPS': 'proper noun, plural',
        'VB': 'verb, base form', 'VBD': 'verb, past tense',
        'VBG': 'verb, gerund/present participle', 'VBN': 'verb, past participle',
        'VBP': 'verb, non-3rd person singular present',
        'VBZ': 'verb, 3rd person singular present',
        'JJ': 'adjective', 'JJR': 'adjective, comparative',
        'JJS': 'adjective, superlative',
        'RB': 'adverb', 'RBR': 'adverb, comparative', 'RBS': 'adverb, superlative',
        'DT': 'determiner', 'IN': 'preposition', 'CC': 'coordinating conjunction',
        'PRP': 'personal pronoun', 'PRP$': 'possessive pronoun',
    }
    for tag, meaning in common_tags.items():
        print(f'  {tag}: {meaning}')
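
    # (Added aside, not in the original walkthrough) pos_tag can also map
    # Penn Treebank tags onto the coarse Universal tagset (NOUN, VERB, ...);
    # a minimal sketch, which may require nltk.download('universal_tagset')
    # the first time:
    print(f"\n  Universal tags: {pos_tag(words, tagset='universal')[:6]}")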

    # 3. Extracting words by part of speech
    print("\nExtracting specific parts of speech:")
    nouns = [word for word, tag in tags if tag.startswith('NN')]
    verbs = [word for word, tag in tags if tag.startswith('VB')]
    adjectives = [word for word, tag in tags if tag.startswith('JJ')]
    adverbs = [word for word, tag in tags if tag.startswith('RB')]
    print(f'  Nouns: {nouns}')
    print(f'  Verbs: {verbs}')
    print(f'  Adjectives: {adjectives}')
    print(f'  Adverbs: {adverbs}')

pos_tagging_nltk()
```

### 4.2 POS Tagging in Practice

```python
from nltk import pos_tag
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter
import matplotlib.pyplot as plt

def pos_tagging_practice():
    """POS tagging on a longer text."""
    print("\n" + "=" * 60)
    print("POS Tagging in Practice")
    print("=" * 60)

    text = """
Machine learning is a method of data analysis that automates analytical
model building. It is a branch of artificial intelligence based on the
idea that systems can learn from data, identify patterns and make
decisions with minimal human intervention. Because of new computing
technologies, machine learning today is not like machine learning of the
past. It was born from pattern recognition and the theory that computers
can learn without being programmed to perform specific tasks; researchers
interested in artificial intelligence wanted to see if computers could
learn from data.
"""

    # 1. Tag the whole text, sentence by sentence
    print("\nPOS analysis:")
    sentences = sent_tokenize(text)
    all_tags = []
    for sent in sentences:
        all_tags.extend(pos_tag(word_tokenize(sent)))

    # 2. Tag distribution
    tag_counts = Counter(tag for word, tag in all_tags)
    print("\nTag distribution:")
    for tag, count in tag_counts.most_common(15):
        print(f'  {tag}: {count}')

    # 3. Per-sentence analysis
    print("\nPer-sentence analysis:")
    for i, sent in enumerate(sentences[:3], 1):
        tags = pos_tag(word_tokenize(sent))
        noun_count = sum(1 for w, t in tags if t.startswith('NN'))
        verb_count = sum(1 for w, t in tags if t.startswith('VB'))
        adj_count = sum(1 for w, t in tags if t.startswith('JJ'))
        print(f'\n  Sentence {i}:')
        print(f'    Nouns: {noun_count}, verbs: {verb_count}, adjectives: {adj_count}')
        print(f'    First 5 tags: {tags[:5]}')

    # 4. Plot the tag distribution
    tag_counts_top = tag_counts.most_common(10)
    top_tags, counts = zip(*tag_counts_top)
    plt.figure(figsize=(12, 6))
    plt.bar(top_tags, counts, color='lightgreen')
    plt.xlabel('POS tag')
    plt.ylabel('Count')
    plt.title('POS Tag Distribution')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # 5. Noun-phrase extraction
    print("\nNoun-phrase extraction:")

    def extract_noun_phrases(tagged_sentence):
        """Collect maximal runs of adjectives/nouns as crude noun phrases."""
        noun_phrases = []
        current_phrase = []
        for word, tag in tagged_sentence:
            if tag.startswith('NN') or tag.startswith('JJ'):
                current_phrase.append(word)
            elif current_phrase:
                noun_phrases.append(' '.join(current_phrase))
                current_phrase = []
        if current_phrase:
            noun_phrases.append(' '.join(current_phrase))
        return noun_phrases

    first_sent = sentences[0]
    first_tags = pos_tag(word_tokenize(first_sent))
    noun_phrases = extract_noun_phrases(first_tags)
    print(f'  First sentence: {first_sent[:100]}...')
    print(f'  Noun phrases: {noun_phrases}')

pos_tagging_practice()
```
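The hand-rolled `extract_noun_phrases` above simply collects adjective/noun runs. NLTK itself ships `RegexpParser`, which does the same kind of chunking from a tag-pattern grammar; the sketch below is an illustrative addition (the grammar and sample sentence are ours, not from the walkthrough above).

```python
import nltk
from nltk import pos_tag, word_tokenize

# Chunk grammar: a noun phrase (NP) is an optional determiner, any number
# of adjectives, then one or more nouns.
grammar = 'NP: {<DT>?<JJ>*<NN.*>+}'
chunker = nltk.RegexpParser(grammar)

sentence = 'The quick brown fox jumps over the lazy dog.'
tree = chunker.parse(pos_tag(word_tokenize(sentence)))

# Pull the NP subtrees out of the parse tree
for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
    print(' '.join(word for word, tag in subtree.leaves()))
# Expected chunks: 'The quick brown fox' and 'the lazy dog'
```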
## 5. A Complete NLP Pipeline

### 5.1 End-to-End Text Processing

```python
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from collections import Counter
import matplotlib.pyplot as plt

class NLPAnalyzer:
    """A small end-to-end text analyzer."""

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))

    def analyze(self, text):
        """Run the full analysis and return a results dict."""
        results = {}

        # 1. Sentences
        results['sentences'] = sent_tokenize(text)
        results['sentence_count'] = len(results['sentences'])

        # 2. Words
        all_words = []
        for sent in results['sentences']:
            all_words.extend(word_tokenize(sent))
        results['total_words'] = len(all_words)
        results['vocab_size'] = len(set(w.lower() for w in all_words))

        # 3. POS tags
        all_tags = []
        for sent in results['sentences']:
            all_tags.extend(pos_tag(word_tokenize(sent)))
        results['pos_tags'] = all_tags

        # 4. Word frequencies (cleaned, stopwords removed)
        cleaned_words = [w.lower() for w in all_words
                         if w.isalpha() and w.lower() not in self.stop_words]
        results['word_freq'] = FreqDist(cleaned_words)

        # 5. Tag frequencies
        results['tag_counts'] = Counter(tag for word, tag in all_tags)
        return results

    def print_summary(self, results):
        """Print an analysis summary."""
        print('=' * 50)
        print('Text Analysis Summary')
        print('=' * 50)
        print(f"Sentences: {results['sentence_count']}")
        print(f"Total words: {results['total_words']}")
        print(f"Vocabulary size: {results['vocab_size']}")
        print('\nTag distribution (top 10):')
        for tag, count in results['tag_counts'].most_common(10):
            print(f'  {tag}: {count}')
        print('\nFrequent words (top 15):')
        for word, freq in results['word_freq'].most_common(15):
            print(f'  {word}: {freq}')

    def visualize(self, results):
        """Plot word and tag frequencies side by side."""
        fig, axes = plt.subplots(1, 2, figsize=(14, 5))

        # Word frequencies
        top_words = results['word_freq'].most_common(15)
        words, freqs = zip(*top_words)
        axes[0].bar(words, freqs, color='skyblue')
        axes[0].set_xlabel('Word')
        axes[0].set_ylabel('Frequency')
        axes[0].set_title('Frequent Words')
        axes[0].tick_params(axis='x', rotation=45)

        # Tag frequencies
        tag_counts_top = results['tag_counts'].most_common(10)
        tags, counts = zip(*tag_counts_top)
        axes[1].bar(tags, counts, color='lightgreen')
        axes[1].set_xlabel('POS tag')
        axes[1].set_ylabel('Count')
        axes[1].set_title('POS Tag Distribution')
        axes[1].tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.show()

# Usage
analyzer = NLPAnalyzer()
text = """
Natural Language Processing (NLP) is a fascinating field of artificial
intelligence. It helps computers understand, interpret, and manipulate
human language. NLP powers many applications including machine
translation, sentiment analysis, and voice assistants like Siri and
Alexa. The field has grown rapidly with the advent of deep learning and
large language models.
"""

results = analyzer.analyze(text)
analyzer.print_summary(results)
analyzer.visualize(results)
```

## 6. Summary

| Task | Tool | Purpose |
|------|------|---------|
| Corpora | `nltk.corpus` | Access text data |
| Sentence tokenization | `sent_tokenize` | Split text into sentences |
| Word tokenization | `word_tokenize` | Split sentences into words |
| POS tagging | `pos_tag` | Label parts of speech |
| Word frequencies | `FreqDist` | Analyze text statistics |

NLTK learning path:
1. Master the built-in corpora
2. Learn the tokenization tools
3. Understand POS tagging
4. Combine them into a complete NLP pipeline

Commonly used corpora:
- `gutenberg`: classic literature
- `brown`: categorized text
- `reuters`: newswire
- `stopwords`: stopword lists
- `wordnet`: lexical database
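
As a compact recap, the five tools in the table above compose into a one-screen pipeline. A minimal sketch (the sample text is our own, not from the corpora above):

```python
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import sent_tokenize, word_tokenize

text = 'NLTK makes corpora, tokenization, and tagging easy. Try it on your own text.'

stop = set(stopwords.words('english'))
for sent in sent_tokenize(text):                      # sentence tokenization
    tokens = word_tokenize(sent)                      # word tokenization
    print(pos_tag(tokens))                            # POS tagging

content = [w.lower() for w in word_tokenize(text)
           if w.isalpha() and w.lower() not in stop]  # cleaning + stopwords
print(FreqDist(content).most_common(5))               # word frequencies
```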