Search bar for issues/pulls (#530)

This commit is contained in:
Ethan Koenig 2017-01-24 21:43:02 -05:00 committed by Lunny Xiao
parent 8bc431952f
commit 833f8b94c2
195 changed files with 221830 additions and 60 deletions

19
vendor/github.com/blevesearch/go-porterstemmer/LICENSE generated vendored Normal file
View file

@ -0,0 +1,19 @@
Copyright (c) 2013 Charles Iliya Krempeaux <charles@reptile.ca> :: http://changelog.ca/
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

View file

@ -0,0 +1,118 @@
# This fork...
I'm maintaining this fork because the original author was not replying to issues or pull requests. For now I plan on maintaining this fork as necessary.
## Status
[![Build Status](https://travis-ci.org/blevesearch/go-porterstemmer.svg?branch=master)](https://travis-ci.org/blevesearch/go-porterstemmer)
[![Coverage Status](https://coveralls.io/repos/blevesearch/go-porterstemmer/badge.png?branch=HEAD)](https://coveralls.io/r/blevesearch/go-porterstemmer?branch=HEAD)
# Go Porter Stemmer
A native Go clean room implementation of the Porter Stemming Algorithm.
This algorithm is of interest to people doing Machine Learning or
Natural Language Processing (NLP).
This is NOT a port. This is a native Go implementation from the human-readable
description of the algorithm.
I've tried to make it (more) efficient by NOT internally using strings, but
instead internally using []rune slices and using the same (array) buffer used by
the []rune slice (and sub-slices) at all steps of the algorithm.
For Porter Stemmer algorithm, see:
http://tartarus.org/martin/PorterStemmer/def.txt (URL #1)
http://tartarus.org/martin/PorterStemmer/ (URL #2)
# Departures
Also, since when I initially implemented it, it failed the tests at...
http://tartarus.org/martin/PorterStemmer/voc.txt (URL #3)
http://tartarus.org/martin/PorterStemmer/output.txt (URL #4)
... after reading the human-readable text over and over again to try to figure out
what error I had made (and doing all sorts of things to debug it) I came to the
conclusion that some of these tests were wrong according to the human-readable
description of the algorithm.
This led me to wonder if maybe other people's code that was passing these tests had
rules that were not in the human-readable description. Which led me to look at the source
code here...
http://tartarus.org/martin/PorterStemmer/c.txt (URL #5)
... When I looked there I noticed that there are some items marked as a "DEPARTURE",
which differ from the original algorithm. (There are 2 of these.)
I implemented these departures, and the tests at URL #3 and URL #4 all passed.
## Usage
To use this Golang library, use with something like:
package main
import (
"fmt"
"github.com/reiver/go-porterstemmer"
)
func main() {
word := "Waxes"
stem := porterstemmer.StemString(word)
fmt.Printf("The word [%s] has the stem [%s].\n", word, stem)
}
Alternatively, if you want to be a bit more efficient, use []rune slices instead, with code like:
package main
import (
"fmt"
"github.com/reiver/go-porterstemmer"
)
func main() {
word := []rune("Waxes")
stem := porterstemmer.Stem(word)
fmt.Printf("The word [%s] has the stem [%s].\n", string(word), string(stem))
}
Although NOTE that the above code may modify the original slice (named "word" in the example) as a side
effect, for efficiency reasons. And that the slice named "stem" in the example above may be a
sub-slice of the slice named "word".
Also alternatively, if you already know that your word is already lowercase (and you don't need
this library to lowercase your word for you) you can instead use code like:
package main
import (
"fmt"
"github.com/reiver/go-porterstemmer"
)
func main() {
word := []rune("waxes")
stem := porterstemmer.StemWithoutLowerCasing(word)
fmt.Printf("The word [%s] has the stem [%s].\n", string(word), string(stem))
}
Again NOTE (like with the previous example) that the above code may modify the original slice (named
"word" in the example) as a side effect, for efficiency reasons. And that the slice named "stem"
in the example above may be a sub-slice of the slice named "word".

View file

@ -0,0 +1,839 @@
package porterstemmer
import (
// "log"
"unicode"
)
// isConsonant reports whether the rune at index i of s is treated as a
// consonant.
//
// The lowercase letters 'a', 'e', 'i', 'o' and 'u' are never consonants. A
// 'y' is a consonant when it is the first rune of s, or when the rune
// immediately before it is not a consonant. Every other rune (including
// uppercase letters and non-letters) is reported as a consonant.
//
// i must be a valid index into s.
func isConsonant(s []rune, i int) bool {
	r := s[i]

	if r == 'a' || r == 'e' || r == 'i' || r == 'o' || r == 'u' {
		return false
	}
	if r != 'y' {
		return true
	}

	// A leading 'y' has nothing before it and counts as a consonant.
	if i == 0 {
		return true
	}

	// Otherwise 'y' takes the opposite classification of its predecessor.
	return !isConsonant(s, i-1)
}
// measure returns the number of vowel-sequence/consonant-sequence pairs in
// s, ignoring any consonants at the very start of s.
//
// Each maximal run of vowels that is followed by at least one consonant
// contributes exactly one to the result, so the value equals the number of
// positions where a consonant directly follows a vowel. An empty slice, or
// one with no such position, has measure 0.
func measure(s []rune) uint {
	var count uint

	// afterVowel records whether the previous rune was a vowel. It starts
	// out false, so leading consonants never contribute.
	afterVowel := false
	for i := range s {
		consonant := isConsonant(s, i)
		if consonant && afterVowel {
			count++
		}
		afterVowel = !consonant
	}

	return count
}
// hasSuffix reports whether s ends with suffix AND is strictly longer than
// suffix.
//
// A slice that is exactly equal to suffix (or shorter than it) yields false,
// so after a successful match s[:len(s)-len(suffix)] is never empty.
//
// An empty suffix is matched by every non-empty s; previously an empty suffix
// caused an index-out-of-range panic.
func hasSuffix(s, suffix []rune) bool {
	// offset is where suffix would have to start inside s.
	offset := len(s) - len(suffix)
	if offset <= 0 {
		return false
	}

	// Compare from the last rune backwards: a mismatch is most likely at the
	// very end, so this usually bails out after one comparison.
	for i := len(suffix) - 1; i >= 0; i-- {
		if s[offset+i] != suffix[i] {
			return false
		}
	}

	return true
}
// containsVowel reports whether at least one rune of s is not a consonant
// according to isConsonant. It returns false for an empty slice.
func containsVowel(s []rune) bool {
	for i := range s {
		if !isConsonant(s, i) {
			return true
		}
	}
	return false
}
// hasRepeatDoubleConsonantSuffix reports whether s ends in the same rune
// twice and that final rune is a consonant according to isConsonant.
//
// Slices shorter than two runes never match.
func hasRepeatDoubleConsonantSuffix(s []rune) bool {
	n := len(s)
	return n >= 2 && s[n-1] == s[n-2] && isConsonant(s, n-1)
}
// hasConsonantVowelConsonantSuffix reports whether the last three runes of s
// are, in order, a consonant, a vowel and a consonant (as classified by
// isConsonant).
//
// Slices shorter than three runes never match.
func hasConsonantVowelConsonantSuffix(s []rune) bool {
	n := len(s)
	if n < 3 {
		return false
	}
	return isConsonant(s, n-3) && !isConsonant(s, n-2) && isConsonant(s, n-1)
}
// step1a handles plural endings:
//
//	sses -> ss
//	ies  -> i
//	ss   -> ss (unchanged)
//	s    -> (removed)
//
// Only the first matching rule is applied. The returned slice is s itself or
// a prefix of it; no runes are modified.
func step1a(s []rune) []rune {
	n := len(s)

	switch {
	case hasSuffix(s, []rune("sses")), hasSuffix(s, []rune("ies")):
		// Both rules drop the final two runes.
		return s[:n-2]
	case hasSuffix(s, []rune("ss")):
		// Matched explicitly so that the "s" rule below does not fire.
		return s
	case hasSuffix(s, []rune("s")):
		return s[:n-1]
	}

	return s
}
// step1b handles the "eed", "ed" and "ing" endings.
//
//   - "eed" becomes "ee" when the part before it has measure > 0.
//   - "ed" / "ing" are removed when the part before them contains a vowel.
//     After removal the remaining stem is tidied up:
//   - a stem ending in "at", "bl" or "iz" gets an 'e' appended;
//   - a stem ending in a double consonant other than 'l', 's' or 'z'
//     loses its last rune;
//   - a stem of measure 1 ending consonant-vowel-consonant, where the
//     last consonant is not 'w', 'x' or 'y', gets an 'e' appended.
//
// Only the first matching ending is considered. The result is s or a prefix
// of it; when an 'e' is appended it is written into s's backing array in the
// position directly after the stem.
func step1b(s []rune) []rune {
	n := len(s)

	if hasSuffix(s, []rune("eed")) {
		if measure(s[:n-3]) > 0 {
			return s[:n-1]
		}
		return s
	}

	// Strip "ed" or "ing"; anything else is left alone.
	var stem []rune
	switch {
	case hasSuffix(s, []rune("ed")):
		stem = s[:n-2]
	case hasSuffix(s, []rune("ing")):
		stem = s[:n-3]
	default:
		return s
	}
	if !containsVowel(stem) {
		return s
	}

	// hasSuffix only matches when s is strictly longer than the ending, so
	// stem is never empty here.
	last := stem[len(stem)-1]

	appendE := false
	switch {
	case hasSuffix(stem, []rune("at")),
		hasSuffix(stem, []rune("bl")),
		hasSuffix(stem, []rune("iz")):
		appendE = true
	case last != 'l' && last != 's' && last != 'z' && hasRepeatDoubleConsonantSuffix(stem):
		return stem[:len(stem)-1]
	case measure(stem) == 1 && hasConsonantVowelConsonantSuffix(stem) && last != 'w' && last != 'x' && last != 'y':
		appendE = true
	}

	if !appendE {
		return stem
	}

	// Re-use the first rune of the removed ending as the slot for the 'e'.
	// For "ed" that rune already is 'e'; for "ing" it overwrites the 'i'.
	extended := s[:len(stem)+1]
	extended[len(stem)] = 'e'
	return extended
}
// step1c turns a terminal 'y' into 'i' (and a terminal 'Y' into 'I') when
// the part of s before it contains a vowel.
//
// Slices shorter than two runes are returned untouched. The replacement is
// written in place, so the returned slice is always s itself.
func step1c(s []rune) []rune {
	last := len(s) - 1
	if last < 1 {
		return s
	}

	var replacement rune
	switch s[last] {
	case 'y':
		replacement = 'i'
	case 'Y':
		replacement = 'I'
	default:
		return s
	}

	if containsVowel(s[:last]) {
		s[last] = replacement
	}
	return s
}
// step2 maps a set of longer suffixes onto shorter ones (for example
// "ational" -> "ate", "biliti" -> "ble").
//
// The rules are tried in the order listed; the first suffix that matches is
// the only one considered. Its replacement is applied only when the part of
// s before the suffix has measure > 0, otherwise s is returned unchanged.
//
// The replacement is written into s's backing array and the returned slice
// is a prefix of s.
func step2(s []rune) []rune {
	rules := [...]struct {
		suffix      string
		replacement string
	}{
		{"ational", "ate"},
		{"tional", "tion"},
		{"enci", "ence"},
		{"anci", "ance"},
		{"izer", "ize"},
		{"bli", "ble"}, // --DEPARTURE-- (an earlier form of this rule matched "abli")
		{"alli", "al"},
		{"entli", "ent"},
		{"eli", "e"},
		{"ousli", "ous"},
		{"ization", "ize"},
		{"ation", "ate"},
		{"ator", "ate"},
		{"alism", "al"},
		{"iveness", "ive"},
		{"fulness", "ful"},
		{"ousness", "ous"},
		{"aliti", "al"},
		{"iviti", "ive"},
		{"biliti", "ble"},
		{"logi", "log"}, // --DEPARTURE--
	}

	for _, rule := range rules {
		suffix := []rune(rule.suffix)
		if !hasSuffix(s, suffix) {
			continue
		}

		stemLen := len(s) - len(suffix)
		if measure(s[:stemLen]) == 0 {
			return s
		}

		// Every replacement is no longer than the suffix it replaces, so it
		// always fits in the space the suffix occupied.
		written := copy(s[stemLen:], []rune(rule.replacement))
		return s[:stemLen+written]
	}

	return s
}
// step3 shortens or removes a further set of suffixes:
//
//	icate -> ic
//	ative -> (removed)
//	alize -> al
//	iciti -> ic
//	ical  -> ic
//	ful   -> (removed)
//	ness  -> (removed)
//
// The rules are tried in that order and only the first matching suffix is
// considered. It is applied only when the part of s before the suffix has
// measure > 0. The result is s or a prefix of it; no runes are modified.
func step3(s []rune) []rune {
	rules := [...]struct {
		suffix string
		drop   int // number of trailing runes removed when the rule applies
	}{
		{"icate", 3},
		{"ative", 5},
		{"alize", 3},
		{"iciti", 3},
		{"ical", 2},
		{"ful", 3},
		{"ness", 4},
	}

	n := len(s)
	for _, rule := range rules {
		suffix := []rune(rule.suffix)
		if !hasSuffix(s, suffix) {
			continue
		}
		if measure(s[:n-len(suffix)]) > 0 {
			return s[:n-rule.drop]
		}
		return s
	}

	return s
}
// step4 removes one of a fixed list of suffixes when the part of s before
// the suffix has measure > 1.
//
// The suffixes are tried in the order listed and only the first one that
// matches is considered. "ion" carries an extra condition: the rune before
// it must be 's' or 't'.
//
// The result is s or a prefix of it; no runes are modified.
func step4(s []rune) []rune {
	candidates := [...]string{
		"al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement",
		"ment", "ent", "ion", "ou", "ism", "ate", "iti", "ous", "ive", "ize",
	}

	for _, candidate := range candidates {
		suffix := []rune(candidate)
		if !hasSuffix(s, suffix) {
			continue
		}

		stem := s[:len(s)-len(suffix)]
		if measure(stem) <= 1 {
			return s
		}

		if candidate == "ion" {
			// stem is non-empty here: hasSuffix only matches when s is
			// strictly longer than the suffix.
			if before := stem[len(stem)-1]; before != 's' && before != 't' {
				return s
			}
		}

		return stem
	}

	return s
}
// step5a removes a terminal 'e' in two situations, where "stem" is s without
// that final 'e':
//
//   - the stem has measure > 1; or
//   - the stem has measure 1 and does NOT end consonant-vowel-consonant
//     with a final consonant other than 'w', 'x' or 'y'.
//
// In every other case s is returned unchanged. An empty s is returned as-is
// instead of panicking on the last-rune lookup. The result is s or a prefix
// of it; no runes are modified.
func step5a(s []rune) []rune {
	n := len(s)
	if n == 0 || s[n-1] != 'e' {
		return s
	}

	stem := s[:n-1]
	m := measure(stem)

	if m > 1 {
		return stem
	}
	if m == 1 {
		// A measure of 1 implies stem is non-empty, so indexing is safe.
		last := stem[len(stem)-1]
		shortSyllable := hasConsonantVowelConsonantSuffix(stem) && last != 'w' && last != 'x' && last != 'y'
		if !shortSyllable {
			return stem
		}
	}

	return s
}
// step5b reduces a terminal "ll" to a single 'l' when s has more than two
// runes and s without its last rune has measure > 1.
//
// The result is s or a prefix of it; no runes are modified.
func step5b(s []rune) []rune {
	n := len(s)
	if n <= 2 || s[n-1] != 'l' || s[n-2] != 'l' {
		return s
	}

	if stem := s[:n-1]; measure(stem) > 1 {
		return stem
	}
	return s
}
// StemString returns the stem of s.
//
// It converts s to a []rune, passes that to Stem (which lowercases it) and
// converts the result back to a string. The argument itself is not modified.
func StemString(s string) string {
	return string(Stem([]rune(s)))
}
// Stem lowercases every rune of s in place and then returns the result of
// StemWithoutLowerCasing on it.
//
// An empty slice is returned unchanged. Because the lowercasing happens in
// place, the caller's slice is modified.
func Stem(s []rune) []rune {
	if len(s) == 0 {
		return s
	}

	for i, r := range s {
		s[i] = unicode.ToLower(r)
	}

	return StemWithoutLowerCasing(s)
}
// StemWithoutLowerCasing runs the stemming steps (1a, 1b, 1c, 2, 3, 4, 5a,
// 5b) over s in order and returns the result. It does not change the case of
// s.
//
// Words of two runes or fewer are considered already stemmed and are
// returned unchanged. Several of the steps write into s and return
// sub-slices of it, so the caller's slice may be modified and the result may
// share its backing array.
func StemWithoutLowerCasing(s []rune) []rune {
	// Words of length 2 or less are already stemmed; leave them alone.
	if len(s) <= 2 {
		return s
	}

	pipeline := [...]func([]rune) []rune{
		step1a,
		step1b,
		step1c,
		step2,
		step3,
		step4,
		step5a,
		step5b,
	}
	for _, step := range pipeline {
		s = step(s)
	}

	return s
}