Deutsch English Français Italiano |
<va3k5u$3n2um$1@dont-email.me> View for Bookmarking (what is this?) Look up another Usenet article |
# Path: ...!eternal-september.org!feeder3.eternal-september.org!news.eternal-september.org!.POSTED!not-for-mail
# From: porkchop@invalid.foo (Mike Sanders)
# Newsgroups: comp.lang.awk
# Subject: Re: (Long post) Metaphone Algorithm In AWK
# Date: Wed, 21 Aug 2024 02:42:07 -0000 (UTC)
# Organization: A noiseless patient Spider
# Lines: 193
# Sender: Mike Sanders <busybox@sdf.org>
# Message-ID: <va3k5u$3n2um$1@dont-email.me>
# References: <v9qbgh$1u7qe$1@dont-email.me>
# Injection-Date: Wed, 21 Aug 2024 04:42:07 +0200 (CEST)
# Injection-Info: dont-email.me; posting-host="c07617ccf20e561b171d3b81aed9e688";
#   logging-data="3902422"; mail-complaints-to="abuse@eternal-september.org";
#   posting-account="U2FsdGVkX1+uDHJ9Z5TFJAfhjK1OLI9I"
# User-Agent: tin/2.6.2-20221225 ("Pittyvaich") (NetBSD/9.3 (amd64))
# Cancel-Lock: sha1:nQKB7FP47uD+ZeiTYeD7a161U9M=
# Bytes: 5645
#
# just in case... not sure it's wise to use 'm += var' with digits:
#
#   m += string # valid
#   m += "7"    # may be invalid if it's a digit (even if quoted)
#
# replaced all instances of:  m += var
# with:                       m = m var
#
# anyone know for sure?
#
# NOTE(review): in AWK '+=' is always arithmetic addition; both operands
# are converted to numbers, so 'm += "B"' does not append anything.
# Concatenation is written by juxtaposition, which makes 'm = m var' the
# correct form for every operand, digit or not.

# Metaphone Algorithm In AWK v5: Michael Sanders - 2024
#
# https://en.wikipedia.org/wiki/Metaphone
#
# example invocation: awk -f metaphone.awk -v find=butter < words.txt

# Encode the search word once, before any input is read.
BEGIN { find_code = metaphone(find) }

# -----------------------------------------------------------------

# emit metaphone codes only
# { for (x = 1; x <= NF; x++) { print metaphone($x) }; exit }

# tweak levenshtein distance to open/constrain results...
# Print every field that has the same metaphone code as 'find' and is
# within an edit distance of 2 from it. The distance is computed on the
# raw field text, so it is case-sensitive, unlike the metaphone code.
{
    for (x = 1; x <= NF; x++)
        if (metaphone($x) == find_code && levenshtein($x, find) <= 2)
            print $x
}

# -----------------------------------------------------------------

# metaphone(w): return the phonetic code for word w.
#
#   w           word to encode; upper-cased and stripped of everything
#               outside A-Z before encoding
#   m,c,n,z,i   locals: code built so far, current letter, next letter,
#               word length, 1-based position
#
# Returns the code as a string ("" when no letter produces a code).
function metaphone(w,    m, c, n, z, i) {

    m = ""    # make sure the result is a string even when nothing is emitted

    # convert the word to uppercase and strip non-alphabetic characters
    w = toupper(w)
    gsub(/[^A-Z]/, "", w)
    z = length(w)

    # handle initial letters: the first letter of these pairs is dropped
    if (substr(w, 1, 2) ~ /^(KN|GN|PN|WR|PS)/) {
        w = substr(w, 2)
        z--
    }

    for (i = 1; i <= z; i++) {
        c = substr(w, i, 1)
        n = (i < z) ? substr(w, i + 1, 1) : ""

        # skip duplicate letters except for 'C'
        if (i > 1 && c == substr(w, i - 1, 1) && c != "C")
            continue

        # handle vowels: retain only if it's 1st letter
        if (isvowel(c)) {
            if (i == 1) m = m c
        }

        # consonants
        else if (c == "B") {
            # a final 'B' after 'M' is dropped
            if (!(i == z && substr(w, i - 1, 1) == "M")) m = m "B"
        }
        else if (c == "C") {
            if (substr(w, i, 2) == "CH") {
                m = m "X"
                i++
            } else if (substr(w, i, 2) ~ /^(CI|CE|CY)/) {
                m = m "S"
            } else {
                m = m "K"
            }
        }
        else if (c == "D") {
            if (substr(w, i, 2) == "DG" && isvowel(substr(w, i + 2, 1))) {
                m = m "J"
                i += 2
            } else {
                m = m "T"
            }
        }
        else if (c == "G") {
            if (substr(w, i, 2) == "GH" && (i == 1 || !isvowel(substr(w, i - 1, 1)))) {
                i++
            } else if (substr(w, i, 2) == "GN" || (i == z && c == "G")) {
                continue
            } else if (substr(w, i, 3) ~ /^(GIA|GIE|GEY)/) {
                m = m "J"
            } else {
                m = m "K"
            }
        }
        else if (c == "H") {
            # 'H' is kept only in front of a vowel and never after C, S, P,
            # T or G. The previous test was inverted: it emitted 'H' when
            # the next letter was NOT a vowel and dropped it before one.
            if (i == 1 || substr(w, i - 1, 1) !~ /[CSPTG]/) {
                if (isvowel(n)) m = m "H"
            }
        }
        else if (c == "K") {
            # 'K' after 'C' was already covered by the 'C' rule
            if (i == 1 || substr(w, i - 1, 1) != "C") m = m "K"
        }
        else if (c == "P") {
            if (substr(w, i, 2) == "PH") {
                m = m "F"
                i++
            } else {
                m = m "P"
            }
        }
        else if (c == "Q") {
            m = m "K"
        }
        else if (c == "S") {
            if (substr(w, i, 2) == "SH") {
                m = m "X"
                i++
            } else if (substr(w, i, 3) == "SIA" || substr(w, i, 3) == "SIO") {
                # was "TIA"/"TIO", which can never match when c is "S"
                m = m "X"
                i += 2
            } else {
                m = m "S"
            }
        }
        else if (c == "T") {
            if (substr(w, i, 2) == "TH") {
                m = m "0" # add '0' for 'TH' digraph to distinguish from regular 'T'
                i++
            } else if (substr(w, i, 3) == "TIA" || substr(w, i, 3) == "TIO") {
                m = m "X"
                i += 2
            } else {
                m = m "T"
            }
        }
        else if (c == "V") {
            m = m "F"
        }
        else if (c == "W" || c == "Y") {
            # kept only when a vowel follows
            if (i < z && isvowel(n)) m = m c
        }
        else if (c == "X") {
            m = m "KS"
        }
        else if (c == "Z") {
            m = m "S"
        }

        # letters that map to themselves; 'F', 'J' and 'R' previously had
        # no branch at all and were silently dropped from the code
        else if (c == "F" || c == "J" || c == "L" || c == "M" || c == "N" || c == "R") {
            m = m c
        }
    }

    return m
}

# -----------------------------------------------------------------

# levenshtein(w1, w2): edit distance between strings w1 and w2.
#
#   l1,l2,i,j,cst,diz   locals: lengths, loop indices, substitution cost,
#                       distance table indexed as diz[i, j]
#
# NOTE(review): min3() is not defined in the visible part of this article
# (the text is truncated below); presumably it returns the smallest of
# its three arguments - confirm against the remainder of the post.
function levenshtein(w1, w2,    l1, l2, i, j, cst, diz) {

    l1 = length(w1)
    l2 = length(w2)

    # initialize distance array
    for (i = 0; i <= l1; i++) diz[i, 0] = i
    for (j = 0; j <= l2; j++) diz[0, j] = j

    # compute distance
    for (i = 1; i <= l1; i++) {
        for (j = 1; j <= l2; j++) {
            cst = (substr(w1, i, 1) == substr(w2, j, 1)) ? 0 : 1
            diz[i, j] = min3(diz[i-1, j] + 1,      # deletion
                             diz[i, j-1] + 1,      # insertion
                             diz[i-1, j-1] + cst)  # substitution
        }
    }

    return diz[l1, l2]
}

# -----------------------------------------------------------------

# metaphone helper function: true when char contains one of A, E, I, O, U
# (callers pass a single upper-case letter or the empty string)
function isvowel(char) {

    return char ~ /[AEIOU]/
}

# -----------------------------------------------------------------

# ========== REMAINDER OF ARTICLE TRUNCATED ==========