# NLTK Basics: Corpora, Tokenization, and POS Tagging

## 1. NLTK Overview

### 1.1 Introduction to NLTK

```python
import nltk
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')

print("=" * 60)
print("NLTK: Natural Language Toolkit")
print("=" * 60)

# Overview diagram of NLTK's main feature modules
fig, ax = plt.subplots(figsize=(12, 8))
ax.axis('off')

# Central node
center = plt.Circle((0.5, 0.5), 0.12, color='lightcoral', ec='black')
ax.add_patch(center)
ax.text(0.5, 0.5, 'NLTK', ha='center', va='center', fontsize=12, fontweight='bold')

# Surrounding feature modules
modules = {
    'Corpus': (0.15, 0.75),
    'Tokenize': (0.85, 0.75),
    'POS Tag': (0.15, 0.25),
    'NER': (0.85, 0.25),
    'Stem': (0.5, 0.85),
    'Parsing': (0.5, 0.15),
}
for module, (x, y) in modules.items():
    circle = plt.Circle((x, y), 0.09, color='lightblue', ec='black')
    ax.add_patch(circle)
    ax.text(x, y, module, ha='center', va='center', fontsize=7)
    # Connect each module to the center
    ax.annotate('', xy=(x, y), xytext=(0.5, 0.5),
                arrowprops=dict(arrowstyle='-', color='gray', lw=1, alpha=0.5))

ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
ax.set_title('NLTK Feature Modules', fontsize=14)
plt.tight_layout()
plt.show()

print("\nNLTK highlights:")
print("  - Rich educational resources, well suited for teaching")
print("  - Ships with a large collection of built-in corpora")
print("  - Complete, readable algorithm implementations")
print("  - A standard tool in academic research")

# Download the required NLTK data packages
print("\nDownloading NLTK data...")
try:
    nltk.data.find('tokenizers/punkt')
    print("✓ Data already present")
except LookupError:
    nltk.download('punkt')
    # Newer NLTK releases may additionally require 'punkt_tab'
    nltk.download('averaged_perceptron_tagger')
    nltk.download('maxent_ne_chunker')
    nltk.download('words')
    nltk.download('stopwords')
    nltk.download('wordnet')
    nltk.download('brown')
    nltk.download('gutenberg')
    nltk.download('reuters')  # used in section 2.1
    print("✓ Download complete")
```

## 2. Corpora

### 2.1 Built-in Corpora

```python
from nltk.corpus import gutenberg, brown, reuters, stopwords, wordnet

def corpora_demo():
    """Tour of NLTK's built-in corpora."""
    print("\n" + "=" * 60)
    print("Built-in NLTK Corpora")
    print("=" * 60)

    # 1. Gutenberg corpus (classic literature)
    print("\nGutenberg corpus:")
    print(f"  Files: {gutenberg.fileids()}")
    print(f"  Words in Shakespeare's Hamlet: {len(gutenberg.words('shakespeare-hamlet.txt'))}")
    print(f"  First 10 words of the Bible: {gutenberg.words('bible-kjv.txt')[:10]}")

    # 2. Brown corpus (categorized text)
    print("\nBrown corpus:")
    print(f"  Categories: {brown.categories()}")
    print(f"  Total words: {len(brown.words())}")
    print(f"  News-category words: {brown.words(categories='news')[:10]}")

    # 3. Reuters corpus (newswire)
    print("\nReuters corpus:")
    print(f"  Number of files: {len(reuters.fileids())}")
    print(f"  Number of categories: {len(reuters.categories())}")
    print(f"  Start of the first article: {reuters.words(reuters.fileids()[0])[:15]}")

    # 4. Stopword lists
    stop_words = set(stopwords.words('english'))
    print("\nStopwords:")
    print(f"  Number of English stopwords: {len(stop_words)}")
    print(f"  20 sample stopwords: {list(stop_words)[:20]}")

    # 5. WordNet lexical database
    print("\nWordNet:")
    print(f"  Number of synsets: {len(list(wordnet.all_synsets()))}")
    print(f"  Nouns: {len(list(wordnet.all_synsets('n')))}")
    print(f"  Verbs: {len(list(wordnet.all_synsets('v')))}")

corpora_demo()
```

### 2.2 Working with Corpora

```python
from nltk.corpus import gutenberg, brown
from nltk.probability import FreqDist
import matplotlib.pyplot as plt

def corpus_operations():
    """Basic corpus operations."""
    print("\n" + "=" * 60)
    print("Basic Corpus Operations")
    print("=" * 60)

    # 1. Corpus-level statistics
    print("\nCorpus statistics:")
    print(f"  Brown word count: {len(brown.words()):,}")
    print(f"  Brown sentence count: {len(brown.sents()):,}")

    # 2. Word frequencies
    hamlet_words = gutenberg.words('shakespeare-hamlet.txt')
    fdist = FreqDist(hamlet_words)
    print("\nWord frequencies (Hamlet):")
    print(f"  Vocabulary size: {len(fdist)}")
    print(f"  10 most common words: {fdist.most_common(10)}")

    # 3. Word-length distribution
    word_lengths = [len(w) for w in hamlet_words if w.isalpha()]
    print("\nWord lengths:")
    print(f"  Average length: {sum(word_lengths) / len(word_lengths):.2f}")
    print(f"  Longest word: {max(hamlet_words, key=len)}")

    # 4. Vocabulary differences between categories
    news_words = brown.words(categories='news')
    fiction_words = brown.words(categories='fiction')
    news_vocab = set(news_words)
    fiction_vocab = set(fiction_words)
    print("\nVocabulary comparison across categories:")
    print(f"  News vocabulary: {len(news_vocab)}")
    print(f"  Fiction vocabulary: {len(fiction_vocab)}")
    print(f"  Shared words: {len(news_vocab & fiction_vocab)}")
    print(f"  News-only: {len(news_vocab - fiction_vocab)}")
    print(f"  Fiction-only: {len(fiction_vocab - news_vocab)}")
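
    # (Added aside, not in the original walkthrough) The counts above can be
    # summarized in one number with the Jaccard index |A ∩ B| / |A ∪ B|;
    # a minimal sketch using the two vocabularies already computed:
    jaccard = len(news_vocab & fiction_vocab) / len(news_vocab | fiction_vocab)
    print(f"  Jaccard similarity (news vs. fiction): {jaccard:.3f}")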

    # 5. Plot the frequency distribution
    top_words = fdist.most_common(20)
    words, freqs = zip(*top_words)
    plt.figure(figsize=(12, 6))
    plt.bar(words, freqs)
    plt.xlabel('Word')
    plt.ylabel('Frequency')
    plt.title('Hamlet: Top 20 Words')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

corpus_operations()
```

### 2.3 Custom Corpora

```python
from nltk.corpus import PlaintextCorpusReader
from nltk.probability import FreqDist
import os

def custom_corpus_nltk():
    """Building and reading a custom corpus."""
    print("\n" + "=" * 60)
    print("Custom Corpora")
    print("=" * 60)

    # Create a directory for the custom corpus
    corpus_root = './my_corpus'
    os.makedirs(corpus_root, exist_ok=True)

    # Write a couple of sample documents
    doc1_content = """\
Natural Language Processing (NLP) is a subfield of artificial intelligence.
It focuses on the interaction between computers and human language.
NLP applications include machine translation, sentiment analysis, and chatbots.
"""
    doc2_content = """\
Machine Learning is a method of data analysis that automates analytical
model building. It is a branch of artificial intelligence based on the
idea that systems can learn from data.
"""
    with open(f'{corpus_root}/nlp.txt', 'w') as f:
        f.write(doc1_content)
    with open(f'{corpus_root}/ml.txt', 'w') as f:
        f.write(doc2_content)

    # Load the custom corpus
    reader = PlaintextCorpusReader(corpus_root, r'.*\.txt')
    print("\nCustom corpus:")
    print(f"  Files: {reader.fileids()}")
    print(f"  Total words: {len(reader.words())}")
    print(f"  Total sentences: {len(reader.sents())}")

    # Per-file statistics
    print("\nPer-file statistics:")
    for fileid in reader.fileids():
        words = reader.words(fileid)
        sents = reader.sents(fileid)
        print(f"  {fileid}: {len(words)} words, {len(sents)} sentences")

    # Overall word frequencies
    fdist = FreqDist(reader.words())
    print("\nOverall word frequencies (top 10):")
    for word, freq in fdist.most_common(10):
        print(f"  {word}: {freq}")

    # Save the processed corpus, one sentence per line
    with open(f'{corpus_root}/processed.txt', 'w') as f:
        for sent in reader.sents():
            f.write(' '.join(sent) + '\n')
    print("\n✓ Processed corpus saved")

custom_corpus_nltk()
```

## 3. Tokenization

### 3.1 Sentence and Word Tokenization

```python
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize import WordPunctTokenizer, RegexpTokenizer
from nltk.tokenize import WhitespaceTokenizer, TweetTokenizer

def tokenization_nltk():
    """Tokenization with NLTK."""
    print("\n" + "=" * 60)
    print("Tokenization")
    print("=" * 60)

    text = ("Hello world! This is NLTK. It's a great library for NLP. "
            'Dr. Smith said: "NLP is amazing!"')

    # 1. Sentence tokenization
    print("\nSentence tokenization (sent_tokenize):")
    sentences = sent_tokenize(text)
    for i, sent in enumerate(sentences, 1):
        print(f"  Sentence {i}: {sent}")

    # 2. Word tokenization
    print("\nWord tokenization (word_tokenize):")
    words = word_tokenize(text)
    print(f"  Result: {words}")

    # 3. Word/punctuation tokenization
    print("\nWordPunctTokenizer:")
    tokenizer = WordPunctTokenizer()
    punct_tokens = tokenizer.tokenize(text)
    print(f"  Result: {punct_tokens}")

    # 4. Regular-expression tokenization (keep alphanumeric runs only)
    print("\nRegexpTokenizer:")
    tokenizer = RegexpTokenizer(r'\w+')
    regex_tokens = tokenizer.tokenize(text)
    print(f"  Result: {regex_tokens}")

    # 5. Whitespace tokenization
    print("\nWhitespaceTokenizer:")
    tokenizer = WhitespaceTokenizer()
    ws_tokens = tokenizer.tokenize(text)
    print(f"  Result: {ws_tokens}")

    # 6. Tweet tokenization (handles emoticons, hashtags, mentions, URLs)
    print("\nTweetTokenizer:")
    tweet = "I love Python! #NLP @user https://example.com"
    tokenizer = TweetTokenizer()
    tweet_tokens = tokenizer.tokenize(tweet)
    print(f"  Result: {tweet_tokens}")
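
    # (Added aside, not in the original walkthrough) sent_tokenize accepts a
    # `language` argument, since the bundled Punkt models cover several
    # European languages; a minimal sketch, assuming the standard punkt
    # download from section 1.1:
    german = "Das ist ein Satz. Hier ist noch einer."
    print(f"\n  German sentences: {sent_tokenize(german, language='german')}")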

    # 7. Tokenizing other languages
    print("\nChinese tokenization example:")
    chinese_text = '自然语言处理是人工智能的重要分支广泛应用于机器翻译、情感分析等领域。'
    chinese_tokens = word_tokenize(chinese_text)
    # word_tokenize is built for space-delimited languages, so it cannot
    # segment Chinese words; it only splits at punctuation here.
    print(f"  Result: {chinese_tokens}")

tokenization_nltk()
```

### 3.2 Tokenization in Practice

```python
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
import matplotlib.pyplot as plt

def tokenization_practice():
    """A small tokenization pipeline: split, clean, filter, count."""
    print("\n" + "=" * 60)
    print("Tokenization in Practice")
    print("=" * 60)

    text = """
Natural Language Processing (NLP) is a subfield of linguistics, computer
science, and artificial intelligence concerned with the interactions
between computers and human language. It is used to analyze text,
allowing machines to understand how humans speak.

NLP is used in many real-world applications like:
- Machine Translation (Google Translate)
- Sentiment Analysis (Social Media Monitoring)
- Chatbots and Virtual Assistants (Siri, Alexa)
- Text Summarization
- Speech Recognition
"""

    # 1. Sentence tokenization
    print("\nSentence tokenization:")
    sentences = sent_tokenize(text)
    print(f"  Number of sentences: {len(sentences)}")
    for i, sent in enumerate(sentences[:3], 1):
        print(f"  Sentence {i}: {sent[:80]}...")

    # 2. Word tokenization and cleaning
    print("\nWord tokenization and cleaning:")
    words = word_tokenize(text.lower())
    # Drop punctuation and numbers
    cleaned_words = [w for w in words if w.isalpha()]
    print(f"  Raw tokens: {len(words)}")
    print(f"  After cleaning: {len(cleaned_words)}")
    print(f"  First 20 words: {cleaned_words[:20]}")

    # 3. Stopword removal
    stop_words = set(stopwords.words('english'))
    filtered_words = [w for w in cleaned_words if w not in stop_words]
    print("\nAfter stopword removal:")
    print(f"  Words: {len(filtered_words)}")
    print(f"  First 20 words: {filtered_words[:20]}")

    # 4. Word frequencies
    fdist = FreqDist(filtered_words)
    print("\nWord frequencies (top 15):")
    for word, freq in fdist.most_common(15):
        print(f"  {word}: {freq}")

    # 5. Plot the frequencies
    top_words = fdist.most_common(15)
    words, freqs = zip(*top_words)
    plt.figure(figsize=(12, 6))
    plt.bar(words, freqs, color='skyblue')
    plt.xlabel('Word')
    plt.ylabel('Frequency')
    plt.title('Word Frequencies in the NLP Text')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # 6. Lexical diversity
    vocab_size = len(set(filtered_words))
    total_words = len(filtered_words)
    print("\nLexical diversity:")
    print(f"  Vocabulary size: {vocab_size}")
    print(f"  Total words: {total_words}")
    print(f"  Type-token ratio (TTR): {vocab_size / total_words:.3f}")

tokenization_practice()
```

## 4. POS Tagging

### 4.1 POS Tagging Basics

```python
from nltk import pos_tag
from nltk.tokenize import word_tokenize

def pos_tagging_nltk():
    """Part-of-speech tagging basics."""
    print("\n" + "=" * 60)
    print("Part-of-Speech Tagging")
    print("=" * 60)

    text = 'The quick brown fox jumps over the lazy dog near the river bank.'
    words = word_tokenize(text)

    # 1. Basic tagging
    print("\nBasic POS tagging:")
    tags = pos_tag(words)
    print(f"{'Word':15} {'Tag':8}")
    print('-' * 25)
    for word, tag in tags:
        print(f'{word:15} {tag:8}')
    # Note: nltk.help.upenn_tagset('NN') prints a tag's full definition; it
    # returns None, so it cannot be embedded in an f-string column.

    # 2. Common Penn Treebank tags
    print("\nCommon POS tags:")
    common_tags = {
        'NN': 'noun, singular', 'NNS': 'noun, plural',
        'NNP': 'proper noun, singular', 'NNPS': 'proper noun, plural',
        'VB': 'verb, base form', 'VBD': 'verb, past tense',
        'VBG': 'verb, gerund/present participle', 'VBN': 'verb, past participle',
        'VBP': 'verb, non-3rd person singular present',
        'VBZ': 'verb, 3rd person singular present',
        'JJ': 'adjective', 'JJR': 'adjective, comparative',
        'JJS': 'adjective, superlative',
        'RB': 'adverb', 'RBR': 'adverb, comparative', 'RBS': 'adverb, superlative',
        'DT': 'determiner', 'IN': 'preposition', 'CC': 'coordinating conjunction',
        'PRP': 'personal pronoun', 'PRP$': 'possessive pronoun',
    }
    for tag, meaning in common_tags.items():
        print(f'  {tag}: {meaning}')
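
    # (Added aside, not in the original walkthrough) pos_tag can also map
    # Penn Treebank tags onto the coarse Universal tagset (NOUN, VERB, ...);
    # a minimal sketch, which may require nltk.download('universal_tagset')
    # the first time:
    print(f"\n  Universal tags: {pos_tag(words, tagset='universal')[:6]}")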

    # 3. Extracting words by part of speech
    print("\nExtracting specific parts of speech:")
    nouns = [word for word, tag in tags if tag.startswith('NN')]
    verbs = [word for word, tag in tags if tag.startswith('VB')]
    adjectives = [word for word, tag in tags if tag.startswith('JJ')]
    adverbs = [word for word, tag in tags if tag.startswith('RB')]
    print(f'  Nouns: {nouns}')
    print(f'  Verbs: {verbs}')
    print(f'  Adjectives: {adjectives}')
    print(f'  Adverbs: {adverbs}')

pos_tagging_nltk()
```

### 4.2 POS Tagging in Practice

```python
from nltk import pos_tag
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter
import matplotlib.pyplot as plt

def pos_tagging_practice():
    """POS tagging on a longer text."""
    print("\n" + "=" * 60)
    print("POS Tagging in Practice")
    print("=" * 60)

    text = """
Machine learning is a method of data analysis that automates analytical
model building. It is a branch of artificial intelligence based on the
idea that systems can learn from data, identify patterns and make
decisions with minimal human intervention. Because of new computing
technologies, machine learning today is not like machine learning of the
past. It was born from pattern recognition and the theory that computers
can learn without being programmed to perform specific tasks; researchers
interested in artificial intelligence wanted to see if computers could
learn from data.
"""

    # 1. Tag the whole text, sentence by sentence
    print("\nPOS analysis:")
    sentences = sent_tokenize(text)
    all_tags = []
    for sent in sentences:
        all_tags.extend(pos_tag(word_tokenize(sent)))

    # 2. Tag distribution
    tag_counts = Counter(tag for word, tag in all_tags)
    print("\nTag distribution:")
    for tag, count in tag_counts.most_common(15):
        print(f'  {tag}: {count}')

    # 3. Per-sentence analysis
    print("\nPer-sentence analysis:")
    for i, sent in enumerate(sentences[:3], 1):
        tags = pos_tag(word_tokenize(sent))
        noun_count = sum(1 for w, t in tags if t.startswith('NN'))
        verb_count = sum(1 for w, t in tags if t.startswith('VB'))
        adj_count = sum(1 for w, t in tags if t.startswith('JJ'))
        print(f'\n  Sentence {i}:')
        print(f'    Nouns: {noun_count}, verbs: {verb_count}, adjectives: {adj_count}')
        print(f'    First 5 tags: {tags[:5]}')

    # 4. Plot the tag distribution
    tag_counts_top = tag_counts.most_common(10)
    top_tags, counts = zip(*tag_counts_top)
    plt.figure(figsize=(12, 6))
    plt.bar(top_tags, counts, color='lightgreen')
    plt.xlabel('POS tag')
    plt.ylabel('Count')
    plt.title('POS Tag Distribution')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # 5. Noun-phrase extraction
    print("\nNoun-phrase extraction:")

    def extract_noun_phrases(tagged_sentence):
        """Collect maximal runs of adjectives/nouns as crude noun phrases."""
        noun_phrases = []
        current_phrase = []
        for word, tag in tagged_sentence:
            if tag.startswith('NN') or tag.startswith('JJ'):
                current_phrase.append(word)
            elif current_phrase:
                noun_phrases.append(' '.join(current_phrase))
                current_phrase = []
        if current_phrase:
            noun_phrases.append(' '.join(current_phrase))
        return noun_phrases

    first_sent = sentences[0]
    first_tags = pos_tag(word_tokenize(first_sent))
    noun_phrases = extract_noun_phrases(first_tags)
    print(f'  First sentence: {first_sent[:100]}...')
    print(f'  Noun phrases: {noun_phrases}')

pos_tagging_practice()
```
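The hand-rolled `extract_noun_phrases` above simply collects adjective/noun runs. NLTK itself ships `RegexpParser`, which does the same kind of chunking from a tag-pattern grammar; the sketch below is an illustrative addition (the grammar and sample sentence are ours, not from the walkthrough above).

```python
import nltk
from nltk import pos_tag, word_tokenize

# Chunk grammar: a noun phrase (NP) is an optional determiner, any number
# of adjectives, then one or more nouns.
grammar = 'NP: {<DT>?<JJ>*<NN.*>+}'
chunker = nltk.RegexpParser(grammar)

sentence = 'The quick brown fox jumps over the lazy dog.'
tree = chunker.parse(pos_tag(word_tokenize(sentence)))

# Pull the NP subtrees out of the parse tree
for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
    print(' '.join(word for word, tag in subtree.leaves()))
# Expected chunks: 'The quick brown fox' and 'the lazy dog'
```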
## 5. A Complete NLP Pipeline

### 5.1 End-to-End Text Processing

```python
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from collections import Counter
import matplotlib.pyplot as plt

class NLPAnalyzer:
    """A small end-to-end text analyzer."""

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))

    def analyze(self, text):
        """Run the full analysis and return a results dict."""
        results = {}

        # 1. Sentences
        results['sentences'] = sent_tokenize(text)
        results['sentence_count'] = len(results['sentences'])

        # 2. Words
        all_words = []
        for sent in results['sentences']:
            all_words.extend(word_tokenize(sent))
        results['total_words'] = len(all_words)
        results['vocab_size'] = len(set(w.lower() for w in all_words))

        # 3. POS tags
        all_tags = []
        for sent in results['sentences']:
            all_tags.extend(pos_tag(word_tokenize(sent)))
        results['pos_tags'] = all_tags

        # 4. Word frequencies (cleaned, stopwords removed)
        cleaned_words = [w.lower() for w in all_words
                         if w.isalpha() and w.lower() not in self.stop_words]
        results['word_freq'] = FreqDist(cleaned_words)

        # 5. Tag frequencies
        results['tag_counts'] = Counter(tag for word, tag in all_tags)
        return results

    def print_summary(self, results):
        """Print an analysis summary."""
        print('=' * 50)
        print('Text Analysis Summary')
        print('=' * 50)
        print(f"Sentences: {results['sentence_count']}")
        print(f"Total words: {results['total_words']}")
        print(f"Vocabulary size: {results['vocab_size']}")
        print('\nTag distribution (top 10):')
        for tag, count in results['tag_counts'].most_common(10):
            print(f'  {tag}: {count}')
        print('\nFrequent words (top 15):')
        for word, freq in results['word_freq'].most_common(15):
            print(f'  {word}: {freq}')

    def visualize(self, results):
        """Plot word and tag frequencies side by side."""
        fig, axes = plt.subplots(1, 2, figsize=(14, 5))

        # Word frequencies
        top_words = results['word_freq'].most_common(15)
        words, freqs = zip(*top_words)
        axes[0].bar(words, freqs, color='skyblue')
        axes[0].set_xlabel('Word')
        axes[0].set_ylabel('Frequency')
        axes[0].set_title('Frequent Words')
        axes[0].tick_params(axis='x', rotation=45)

        # Tag frequencies
        tag_counts_top = results['tag_counts'].most_common(10)
        tags, counts = zip(*tag_counts_top)
        axes[1].bar(tags, counts, color='lightgreen')
        axes[1].set_xlabel('POS tag')
        axes[1].set_ylabel('Count')
        axes[1].set_title('POS Tag Distribution')
        axes[1].tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.show()

# Usage
analyzer = NLPAnalyzer()
text = """
Natural Language Processing (NLP) is a fascinating field of artificial
intelligence. It helps computers understand, interpret, and manipulate
human language. NLP powers many applications including machine
translation, sentiment analysis, and voice assistants like Siri and
Alexa. The field has grown rapidly with the advent of deep learning and
large language models.
"""

results = analyzer.analyze(text)
analyzer.print_summary(results)
analyzer.visualize(results)
```

## 6. Summary

| Task | Tool | Purpose |
|------|------|---------|
| Corpora | `nltk.corpus` | Access text data |
| Sentence tokenization | `sent_tokenize` | Split text into sentences |
| Word tokenization | `word_tokenize` | Split sentences into words |
| POS tagging | `pos_tag` | Label parts of speech |
| Word frequencies | `FreqDist` | Analyze text statistics |

NLTK learning path:
1. Master the built-in corpora
2. Learn the tokenization tools
3. Understand POS tagging
4. Combine them into a complete NLP pipeline

Commonly used corpora:
- `gutenberg`: classic literature
- `brown`: categorized text
- `reuters`: newswire
- `stopwords`: stopword lists
- `wordnet`: lexical database
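
As a compact recap, the five tools in the table above compose into a one-screen pipeline. A minimal sketch (the sample text is our own, not from the corpora above):

```python
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import sent_tokenize, word_tokenize

text = 'NLTK makes corpora, tokenization, and tagging easy. Try it on your own text.'

stop = set(stopwords.words('english'))
for sent in sent_tokenize(text):                      # sentence tokenization
    tokens = word_tokenize(sent)                      # word tokenization
    print(pos_tag(tokens))                            # POS tagging

content = [w.lower() for w in word_tokenize(text)
           if w.isalpha() and w.lower() not in stop]  # cleaning + stopwords
print(FreqDist(content).most_common(5))               # word frequencies
```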