1. install NLTK
2. import nltk
3. import nltk.data
4. tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') # for english
5. tokenizer.tokenize(para) # para will be tokenized into sentences.
Splitting a sentence into words:
1. from nltk import tokenize
2. tokenize.word_tokenize("Hello World!!")
from nltk.tokenize import TreebankWordTokenizer #splits sentence into words.
obj = TreebankWordTokenizer()
obj.tokenize("Hello World!!")
>>> from nltk.tokenize import PunktWordTokenizer  # note: removed in NLTK 3.0+; use WordPunctTokenizer or word_tokenize instead
>>> tokenizer = PunktWordTokenizer()
>>> tokenizer.tokenize("Can't is a contraction.")
['Can', "'t", 'is', 'a', 'contraction.']
>>> from nltk.tokenize import WordPunctTokenizer
>>> tokenizer = WordPunctTokenizer()
>>> tokenizer.tokenize("Can't is a contraction.")
['Can', "'", 't', 'is', 'a', 'contraction', '.']
2. import nltk
3. import nltk.data
4. tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') # for english
5. tokenizer.tokenize(para) # para will be tokenized into sentences.
Splitting a sentence into words:
1. from nltk import tokenize
2. tokenize.word_tokenize("Hello World!!")
from nltk.tokenize import TreebankWordTokenizer #splits sentence into words.
obj = TreebankWordTokenizer()
obj.tokenize("Hello World!!")
>>> from nltk.tokenize import PunktWordTokenizer  # note: removed in NLTK 3.0+; use WordPunctTokenizer or word_tokenize instead
>>> tokenizer = PunktWordTokenizer()
>>> tokenizer.tokenize("Can't is a contraction.")
['Can', "'t", 'is', 'a', 'contraction.']
>>> from nltk.tokenize import WordPunctTokenizer
>>> tokenizer = WordPunctTokenizer()
>>> tokenizer.tokenize("Can't is a contraction.")
['Can', "'", 't', 'is', 'a', 'contraction', '.']
0 Comments