แบบฝึกหัด: การประมวลผลข้อความขั้นพื้นฐาน

แบบฝึกหัด: การประมวลผลข้อความขั้นพื้นฐาน#

สรุปคำสั่งที่ควรทราบ#

การติดตั้งไลบรารี pythainlp attacut และ nltk ก่อนใช้งาน#

!pip install pythainlp
!pip install attacut
!pip install nltk

การตัดคำภาษาไทย#

# Tokenize a Thai sentence into words with PyThaiNLP's word_tokenize; no
# `engine` argument is passed, so the library's default engine is used.
import pythainlp
# NOTE(review): attacut is never referenced by name in this cell; presumably
# it is imported to confirm the package behind engine='attacut' is
# installed -- verify.
import attacut
pythainlp.word_tokenize('ข้างนอกสุกใส ข้างในต๊ะติ๊งโหน่ง')

['ข้างนอก', 'สุกใส', ' ', 'ข้างใน', 'ต๊ะติ๊ง', 'โหน่ง']

# Tokenize the same Thai sentence, this time selecting the 'attacut' engine
# explicitly so its segmentation can be compared with the default one.
pythainlp.word_tokenize('ข้างนอกสุกใส ข้างในต๊ะติ๊งโหน่ง', engine='attacut')

['ข้าง', 'นอก', 'สุกใส', ' ', 'ข้าง', 'ใน', 'ต๊ะติ๊งโหน่ง']

ตัดคำภาษาอังกฤษ#

import nltk
# Download the 'punkt_tab' resource; presumably the tokenizer data that the
# word_tokenize call below relies on -- confirm against the NLTK docs.
nltk.download('punkt_tab')

# Split an English sentence into word and punctuation tokens.
nltk.tokenize.word_tokenize('COVID-19, which caused problems worldwide, is still a problem in the U.S.A. until today.')

['COVID-19',
 ',',
 'which',
 'caused',
 'problems',
 'worldwide',
 ',',
 'is',
 'still',
 'a',
 'problem',
 'in',
 'the',
 'U.S.A.',
 'until',
 'today',
 '.']

ตัดประโยคภาษาไทย#

# Split a Thai text into sentence-like segments with PyThaiNLP's sent_tokenize.
pythainlp.sent_tokenize('ระฆังดี ถึงแม้คนไม่ตีก็ดัง ระฆังไม่ดีไม่ตีก็ไม่ดัง')

['ระฆังดี ', 'ถึงแม้คนไม่ตีก็ดัง ', 'ระฆังไม่ดีไม่ตีก็ไม่ดัง']

ตัดประโยคภาษาอังกฤษ#

# Split an English text into sentences with NLTK's sent_tokenize; the input
# deliberately contains abbreviations ("Mr.", "S.", "Mrs.") followed by periods.
nltk.sent_tokenize('Punkt knows that the periods in Mr. Smith and Johann S. Bach are not sentence boundaries.  But he was with Mrs. Bond that week.')

['Punkt knows that the periods in Mr. Smith and Johann S. Bach are not sentence boundaries.',
 'But he was with Mrs.',
 'Bond that week.']

โหลดคำหยุดภาษาไทย#

# Tokenize a Thai sentence, then drop every token that appears in PyThaiNLP's
# Thai stopword list and print what remains.
# NOTE(review): the literal below spells 'ข้่าง' with two stacked tone marks,
# unlike the 'ข้าง' used in the earlier word-tokenization example. This looks
# like a typo; confirm before changing it, because it affects the tokens
# produced and therefore the printed result.
tokens = pythainlp.word_tokenize('ข้างนอกสุกใส ข้่างในต๊ะติ๊งโหน่ง')
# Convert the stopword collection to a set for the membership tests below.
stopset = set(pythainlp.corpus.thai_stopwords())
tokens_no_stopwords = [t for t in tokens if t not in stopset]
print(tokens_no_stopwords)

['ข้างนอก', 'สุกใส', ' ', 'ข้่าง', 'ต๊ะติ๊ง', 'โหน่ง']

โหลดคำหยุดภาษาอังกฤษ#

# Download NLTK's stopword corpus so the English stopword list can be loaded.
nltk.download('stopwords')

# Tokenize the sentence, then drop English stopwords and print what remains.
tokens = nltk.word_tokenize('COVID-19, which caused problems worldwide, is still a problem in the U.S.A. until today.')
stopset = set(nltk.corpus.stopwords.words('english'))
# NLTK's English stopword list is all lowercase, so compare the lowercased
# token; otherwise capitalized stopwords (e.g. a sentence-initial "The")
# would slip through the filter. The original token text is what gets kept.
tokens_no_stopwords = [t for t in tokens if t.lower() not in stopset]
print(tokens_no_stopwords)

['COVID-19', ',', 'caused', 'problems', 'worldwide', ',', 'still', 'problem', 'U.S.A.', 'today', '.']

กรองเอาเครื่องหมายวรรคตอนออก#

import re
# Matches a single character that is NOT in the Thai range ก-์, an ASCII digit,
# or an ASCII letter.
# NOTE(review): Thai digits (๐-๙) appear to fall outside the ก-์ range, so a
# token starting with one would be treated as punctuation -- confirm whether
# that is intended.
patt = re.compile('[^ก-์0-9a-zA-Z]')
# Pattern.match anchors at the start of the string, so only each token's first
# character is tested: tokens such as 'COVID-19' and 'U.S.A.' are kept, while
# ',' and '.' are dropped. `tokens` is the list built in the preceding cell.
tokens_no_punct = [t for t in tokens if not patt.match(t)]
print(tokens_no_punct)

['COVID-19', 'which', 'caused', 'problems', 'worldwide', 'is', 'still', 'a', 'problem', 'in', 'the', 'U.S.A.', 'until', 'today']

แบบฝึกหัดการวิเคราะห์ความถี่ของคำ#

เขียนฟังก์ชันที่นับจำนวนคำภาษาไทยจากข้อความที่เก็บอยู่ในสตริง news_article โดยให้ใช้ข่าวจาก https://thestandard.co/us-prepares-to-offer-billionaire-tax/ หรือดาวน์โหลดไฟล์ได้โดยตรงจาก https://github.com/attapol/programming-nlp-book/tree/main/module7/data/sample-news-article.txt หรือจะใช้ข้อความจากแหล่งอื่นที่มีความยาวเกิน 1,000 คำแทนก็ได้ จากนั้นตอบคำถามต่อไปนี้

สตริงที่ให้มามีขนาดคลังศัพท์ (จำนวนคำแบบไม่นับคำซ้ำ) เท่าไร
คำใดพบบ่อยที่สุด 10 อันดับในสตริงที่ให้มา โดยที่ไม่ต้องกรองเอาคำหยุดออก
ไบแกรมใดพบบ่อยที่สุด 10 อันดับในสตริงที่ให้มา โดยที่ไม่ต้องกรองเอาคำหยุดออก
คำใดพบบ่อยที่สุด 10 อันดับในสตริงที่ให้มา โดยที่กรองเอาคำหยุดออก และเปรียบเทียบผล
สร้างเมฆคำจากคำที่พบบ่อยที่สุด 40 อันดับ โดยให้ขนาดของคำแต่ละคำแปรผันตามความถี่ของคำนั้น ๆ เนื่องจากไลบรารีเมฆคำไม่มีฟอนต์ภาษาไทยมาให้ เราจึงต้องดาวน์โหลดฟอนต์ภาษาไทยมาวางไว้ในโฟลเดอร์เดียวกัน แล้วตั้งค่า font_path= ให้ชี้ไปยังไฟล์ฟอนต์ภาษาไทยนั้น