Deutsch English Français Italiano |
<v9qbgh$1u7qe$1@dont-email.me> View for Bookmarking (what is this?) Look up another Usenet article |
Path: ...!eternal-september.org!feeder3.eternal-september.org!news.eternal-september.org!.POSTED!not-for-mail From: porkchop@invalid.foo (Mike Sanders) Newsgroups: comp.lang.awk Subject: (Long post) Metaphone Algorithm In AWK Date: Sat, 17 Aug 2024 14:18:58 -0000 (UTC) Organization: A noiseless patient Spider Lines: 195 Sender: Mike Sanders <busybox@sdf.org> Message-ID: <v9qbgh$1u7qe$1@dont-email.me> Injection-Date: Sat, 17 Aug 2024 16:18:58 +0200 (CEST) Injection-Info: dont-email.me; posting-host="a6140d2d5655154251ea3b3eb866d559"; logging-data="2039630"; mail-complaints-to="abuse@eternal-september.org"; posting-account="U2FsdGVkX19l0fO4sJSROXEfwpUseMDg" User-Agent: tin/2.6.2-20221225 ("Pittyvaich") (NetBSD/9.3 (amd64)) Cancel-Lock: sha1:3fhAQ3zIgOPRzn2lPx023Iy8034= Bytes: 6775 Hi folks, hope you all are doing well. Please excuse long post, wanted to share this, some might find it handy given a certain context. Must run, I'm very behind in my work (hey I'm always running behind!) # metaphone.awk: Michael Sanders - 2024 # # example invocation: # # echo "texas taxes taxi" | awk -f metaphone.awk -v find=texas # # notes: # # ever notice when you search for (say): # # 'i went to the zu' # # & your chosen search engine suggests something like: # # 'did you mean i went to the zoo' # # the metaphone algorithm handles such cases pretty well actually... # # Metaphone is a phonetic algorithm, published by Lawrence Philips in # 1990, for indexing words by their English pronunciation. It # fundamentally improves on the Soundex algorithm by using information # about variations and inconsistencies in English spelling and # pronunciation to produce a more accurate encoding, which does a # better job of matching words and names which sound similar. # https://en.wikipedia.org/wiki/Metaphone # # english only (sorry) # # not extensively tested, nevertheless a solid start, if you # improve this code please share your results # # other implementations... 
#
# gist: https://gist.github.com/Rostepher/b688f709587ac145a0b3
#
# BASIC: http://aspell.net/metaphone/metaphone.basic
#
# C: http://aspell.net/metaphone/metaphone-kuhn.txt

# check if a character is a vowel
#
#   c        - the character to test (callers pass one upper-case
#              character, or "" when looking past the end of a word)
#   is_vowel - local scratch variable; callers do not pass it
#
# returns 1 when c matches [AEIOU], 0 otherwise
function isvowel(c,    is_vowel) {
    is_vowel = c ~ /[AEIOU]/
    return is_vowel
}

# add a character or string to the result array
#
#   s      - string whose characters are appended one per array slot
#   result - array being filled (modified in place)
#   p_idx  - first free index in result
#   i      - local loop counter; callers do not pass it
#
# returns the next free index after the appended characters
function phonize(s, result, p_idx,    i) {
    for (i = 1; i <= length(s); i++) {
        result[p_idx++] = substr(s, i, 1)
    }
    return p_idx
}

# compute metaphone code
#
#   word         - the word to encode (any letter case)
#   max_phonemes - maximum number of characters in the returned code
#
# everything after max_phonemes is a local; callers pass two arguments
#
# returns the phonetic code, at most max_phonemes characters long
function metaphone(word, max_phonemes,    result, p_idx, w_idx, c, prv, prv2, nxt, nxt2) {
    # make sure result is an (empty) array before it is handed to
    # phonize()/combine(), even when no phoneme is ever emitted
    split("", result)

    # upper-case once instead of on every single-character lookup
    word = toupper(word)

    w_idx = 1
    p_idx = 1

    while (w_idx <= length(word) && p_idx <= max_phonemes) {
        c = substr(word, w_idx, 1)

        # neighbours of the current letter; "" when outside the word
        prv  = (w_idx > 1) ? substr(word, w_idx - 1, 1) : ""
        prv2 = (w_idx > 2) ? substr(word, w_idx - 2, 1) : ""
        nxt  = substr(word, w_idx + 1, 1)
        nxt2 = substr(word, w_idx + 2, 1)

        if (c == "B") {
            # B is dropped directly after M
            if (prv != "M") {
                p_idx = phonize("B", result, p_idx)
            }
        } else if (c == "C") {
            if (nxt == "I" && nxt2 ~ /[AO]/) {
                p_idx = phonize("SH", result, p_idx)
            } else if (nxt == "H") {
                p_idx = phonize("X", result, p_idx)
                w_idx++    # the H has been consumed
            } else {
                p_idx = phonize("K", result, p_idx)
            }
        } else if (c == "D") {
            if (nxt == "G" && nxt2 ~ /[EIY]/) {
                p_idx = phonize("J", result, p_idx)
                w_idx++    # the G has been consumed
            } else {
                p_idx = phonize("T", result, p_idx)
            }
        } else if (c ~ /^[FJLMNR]$/) {
            # these consonants keep their own sound; previously they
            # matched no branch and were silently dropped, so words
            # such as "lemon" or "rain" encoded to an empty string
            p_idx = phonize(c, result, p_idx)
        } else if (c == "G") {
            if (nxt == "H") {
                if (w_idx > 1 && prv !~ /[BDH]/ && prv2 != "H") {
                    p_idx = phonize("F", result, p_idx)
                    w_idx++    # the H has been consumed
                }
            } else if (nxt != "N" || nxt2 != "E") {
                p_idx = phonize("K", result, p_idx)
            }
        } else if (c == "H") {
            # H is kept only before a vowel and not after C, G, P, S, T
            if (isvowel(nxt) && prv !~ /[CGPST]/) {
                p_idx = phonize("H", result, p_idx)
            }
        } else if (c == "K") {
            # K is dropped directly after C
            if (prv != "C") {
                p_idx = phonize("K", result, p_idx)
            }
        } else if (c == "P") {
            if (nxt == "H") {
                # the following H is dropped by the H branch (it follows P)
                p_idx = phonize("F", result, p_idx)
            } else {
                p_idx = phonize("P", result, p_idx)
            }
        } else if (c == "Q") {
            p_idx = phonize("K", result, p_idx)
        } else if (c == "S") {
            if (nxt == "H") {
                p_idx = phonize("SH", result, p_idx)
                w_idx++    # the H has been consumed
            } else if (nxt == "C" && nxt2 == "H") {
                p_idx = phonize("X", result, p_idx)
                w_idx += 2 # the C and H have been consumed
            } else {
                p_idx = phonize("S", result, p_idx)
            }
        } else if (c == "T") {
            if (nxt == "I" && nxt2 ~ /[AO]/) {
                p_idx = phonize("SH", result, p_idx)
            } else if (nxt == "H") {
                p_idx = phonize("TH", result, p_idx)
                w_idx++    # the H has been consumed
            } else if (nxt != "C" || nxt2 != "H") {
                # T directly before CH is dropped
                p_idx = phonize("T", result, p_idx)
            }
        } else if (c == "V") {
            p_idx = phonize("F", result, p_idx)
        } else if (c == "W" || c == "Y") {
            # W and Y are kept only before a vowel
            if (isvowel(nxt)) {
                p_idx = phonize(c, result, p_idx)
            }
        } else if (c == "X") {
            p_idx = phonize("KS", result, p_idx)
        } else if (c == "Z") {
            p_idx = phonize("S", result, p_idx)
        }
        # any other character (vowels, digits, punctuation) emits nothing

        w_idx++
    }

    # a two-character phoneme may overshoot the limit, so clip the code
    return substr(combine(result), 1, max_phonemes)
}

# combine array into a string
#
#   array  - array indexed 1, 2, 3, ... as filled by phonize()
#   result - local accumulator; callers do not pass it
#   i      - local index; callers do not pass it
#
# returns array[1] array[2] ... concatenated in ascending index order,
# stopping at the first missing index
function combine(array,    result, i) {
    result = ""
    # walk the indices numerically: "for (i in array)" visits keys in
    # an unspecified order and could scramble the phonemes
    for (i = 1; i in array; i++) {
        result = result array[i]
    }
    return result
}

BEGIN {
    # store metaphone code for "find" variable
    find_code = metaphone(find, 4)
}

{
    # loop through all fields (words) in the input line
    for (x = 1; x <= NF; x++) {
        # compute metaphone code for the current word
        word_code = metaphone($x, 4)
        # check if metaphone code of the current word matches
        if (word_code == find_code) {
========== REMAINDER OF ARTICLE TRUNCATED ==========