public:t-malv-15-3:4
Differences
This shows you the differences between two versions of the page.
| Both sides previous revisionPrevious revisionNext revision | Previous revision | ||
| public:t-malv-15-3:4 [2015/09/10 09:17] – [5. Working with Bigram scores] orvark | public:t-malv-15-3:4 [2024/04/29 13:33] (current) – external edit 127.0.0.1 | ||
|---|---|---|---|
| Line 19: | Line 19: | ||
| * Create a Bigram Collocation Finder for the Brown Corpus. | * Create a Bigram Collocation Finder for the Brown Corpus. | ||
| * Apply a filter to remove bigrams that occur fewer than two times. | * Apply a filter to remove bigrams that occur fewer than two times. | ||
| - | * Apply a filter to remove stopwords ('' | + | * Apply a filter to remove |
| * Print out the 20 most **frequent bigrams**. | * Print out the 20 most **frequent bigrams**. | ||
| Line 62: | Line 62: | ||
| ===== 5. Working with Bigram scores | ===== 5. Working with Bigram scores | ||
| - | The following code snippet shows an example of how you can use '' | + | The following code snippet shows an example of how you can use '' |
| <code python> | <code python> | ||
| Line 73: | Line 73: | ||
| scored = finder.score_ngrams( | scored = finder.score_ngrams( | ||
| nltk.collocations.BigramAssocMeasures.likelihood_ratio) | nltk.collocations.BigramAssocMeasures.likelihood_ratio) | ||
| + | |||
| + | #create a defaultdict of lists | ||
| + | prev_word = defaultdict(list) | ||
| #group by first word in bigram | #group by first word in bigram | ||
| - | prev_word = defaultdict(list) #a defaultdict of lists | ||
| for key, scores in scored: | for key, scores in scored: | ||
| | | ||
| Line 92: | Line 94: | ||
| </ | </ | ||
| - | FYI: a normal Python dictionary throws a '' | + | FYI: a normal Python dictionary throws a '' |
| + | |||
| + | ===== Solutions | ||
| + | |||
| + | <code python> | ||
| + | import nltk | ||
| + | from nltk.collocations import * | ||
| + | from nltk.metrics import BigramAssocMeasures, | ||
| + | from nltk.corpus import brown, stopwords | ||
| + | |||
| + | #1 | ||
| + | |||
| + | bam = BigramAssocMeasures | ||
| + | |||
| + | corpus = brown.words() | ||
| + | |||
| + | finder = BigramCollocationFinder.from_words(corpus) | ||
| + | |||
| + | word_filter = lambda w: len(w) < 3 or w.lower() in stopwords.words(' | ||
| + | #def word_filter(w): | ||
| + | |||
| + | |||
| + | finder.apply_freq_filter(2) | ||
| + | finder.apply_word_filter(word_filter) | ||
| + | |||
| + | print(finder.nbest(bam.raw_freq, | ||
| + | |||
| + | |||
| + | finder_win3 = BigramCollocationFinder.from_words(corpus, | ||
| + | finder_win3.apply_freq_filter(2) | ||
| + | finder_win3.apply_word_filter(word_filter) | ||
| + | print(finder_win3.nbest(bam.raw_freq, | ||
| + | |||
| + | |||
| + | tam = TrigramAssocMeasures | ||
| + | |||
| + | finder_tri = TrigramCollocationFinder.from_words(corpus) | ||
| + | finder_tri.apply_freq_filter(2) | ||
| + | finder_tri.apply_word_filter(word_filter) | ||
| + | print(finder_tri.nbest(tam.raw_freq, | ||
| + | |||
| + | #2 | ||
| + | |||
| + | # Pointwise mutual information | ||
| + | print(finder.nbest(bam.pmi, | ||
| + | # Log-likelihood ratio | ||
| + | print(finder.nbest(bam.likelihood_ratio, | ||
| + | # Mutual information likelihood, an MI variant | ||
| + | print(finder.nbest(bam.mi_like, | ||
| + | # Chi squared test | ||
| + | print(finder.nbest(bam.chi_sq, | ||
| + | # Student's t-test | ||
| + | print(finder.nbest(bam.student_t, | ||
| + | |||
| + | #3 | ||
| + | |||
| + | tagged_corpus = brown.tagged_words(tagset=' | ||
| + | |||
| + | finder_tagged = BigramCollocationFinder.from_words(tagged_corpus) | ||
| + | print(finder_tagged.nbest(bam.raw_freq, | ||
| + | |||
| + | finder_tags = BigramCollocationFinder.from_words(t for w, t in tagged_corpus) | ||
| + | print(finder_tags.nbest(bam.raw_freq, | ||
| + | </ | ||
/var/www/cadia.ru.is/wiki/data/attic/public/t-malv-15-3/4.1441876678.txt.gz · Last modified: 2024/04/29 13:32 (external edit)