Deutsch   English   Français   Italiano  
<va3k5u$3n2um$1@dont-email.me>

View for Bookmarking (what is this?)
Look up another Usenet article

Path: ...!eternal-september.org!feeder3.eternal-september.org!news.eternal-september.org!.POSTED!not-for-mail
From: porkchop@invalid.foo (Mike Sanders)
Newsgroups: comp.lang.awk
Subject: Re: (Long post) Metaphone Algorithm In AWK
Date: Wed, 21 Aug 2024 02:42:07 -0000 (UTC)
Organization: A noiseless patient Spider
Lines: 193
Sender: Mike Sanders <busybox@sdf.org>
Message-ID: <va3k5u$3n2um$1@dont-email.me>
References: <v9qbgh$1u7qe$1@dont-email.me>
Injection-Date: Wed, 21 Aug 2024 04:42:07 +0200 (CEST)
Injection-Info: dont-email.me; posting-host="c07617ccf20e561b171d3b81aed9e688";
	logging-data="3902422"; mail-complaints-to="abuse@eternal-september.org";	posting-account="U2FsdGVkX1+uDHJ9Z5TFJAfhjK1OLI9I"
User-Agent: tin/2.6.2-20221225 ("Pittyvaich") (NetBSD/9.3 (amd64))
Cancel-Lock: sha1:nQKB7FP47uD+ZeiTYeD7a161U9M=
Bytes: 5645

just in case...

not sure it's wise to use 'm += var' with digits:

m += string # valid
m += "7"    # may be invalid if it's a digit (even if quoted)

replaced all instances of: m += var
with: m = m var

anyone know for sure?

# Metaphone Algorithm In AWK v5: Michael Sanders - 2024
#
# https://en.wikipedia.org/wiki/Metaphone
#
# example invocation: awk -f metaphone.awk -v find=butter < words.txt

# Before any input is read, encode the search word (passed on the command
# line as -v find=...) once and keep its metaphone code in find_code.
BEGIN {
  find_code = metaphone(find)
}

# -----------------------------------------------------------------

# emit metaphone codes only
# { for (x = 1; x <= NF; x++) { print metaphone($x) }; exit }

# tweak the levenshtein distance limit below to open/constrain results...
{
  # Walk every field of the record and print those that both share the
  # search word's metaphone code and lie within an edit distance of 2
  # from the search word itself.
  for (x = 1; x <= NF; x++) {
    if (metaphone($x) != find_code) continue   # phonetic code differs
    if (levenshtein($x, find) > 2) continue    # spelling too far away
    print $x
  }
}

# -----------------------------------------------------------------

# metaphone(w): return a simplified Metaphone phonetic code for word w.
#
#   w           word to encode; it is upper-cased and every character
#               outside A-Z is removed before encoding
#   m,c,n,z,i   locals (awk idiom: extra parameters act as local variables)
#               m = code built so far, c = current letter, n = next letter
#               ("" at the end of the word), z = word length, i = position
#
# Returns the code as a string; it is empty when w holds no encodable
# letters. The 'TH' digraph is encoded as the character "0".
function metaphone(w, m, c, n, z, i) {
  # start from an explicit empty string so the result is always a string
  m = ""

  # convert the word to uppercase and strip non-alphabetic characters
  w = toupper(w)
  gsub(/[^A-Z]/, "", w)
  z = length(w)

  # handle initial letters: the first letter of these pairs is dropped
  if (substr(w, 1, 2) ~ /^(KN|GN|PN|WR|PS)/) {
    w = substr(w, 2)
    z--
  }

  for (i = 1; i <= z; i++) {
    c = substr(w, i, 1)
    n = (i < z) ? substr(w, i + 1, 1) : ""

    # skip duplicate letters except for 'C'
    if (i > 1 && c == substr(w, i - 1, 1) && c != "C") continue

    # handle vowels: retain only if it's 1st letter
    if (isvowel(c)) {
      if (i == 1) m = m c
    }
    # consonants
    else if (c == "B") {
      # 'B' is dropped only when it is the last letter and follows 'M'
      if (!(i == z && substr(w, i - 1, 1) == "M")) m = m "B"
    }
    else if (c == "C") {
      if (substr(w, i, 2) == "CH") {
        m = m "X"
        i++            # consume the 'H'
      } else if (substr(w, i, 2) ~ /^(CI|CE|CY)/) {
        m = m "S"
      } else {
        m = m "K"
      }
    }
    else if (c == "D") {
      if (substr(w, i, 2) == "DG" && isvowel(substr(w, i + 2, 1))) {
        m = m "J"
        i += 2         # consume the 'G' and the vowel after it
      } else {
        m = m "T"
      }
    }
    else if (c == "G") {
      if (substr(w, i, 2) == "GH" && (i == 1 || !isvowel(substr(w, i - 1, 1)))) {
        i++            # 'GH' not preceded by a vowel: emit nothing, consume the 'H'
      } else if (substr(w, i, 2) == "GN" || i == z) {
        continue       # 'G' before 'N', or as the last letter, emits nothing
      } else if (substr(w, i, 3) ~ /^(GIA|GIE|GEY)/) {
        m = m "J"
      } else {
        m = m "K"
      }
    }
    else if (c == "H") {
      # 'H' is kept only when a vowel follows it and it does not come
      # directly after C, S, P, T or G (n is "" on the last letter, so a
      # word-final 'H' is never kept)
      if ((i == 1 || substr(w, i - 1, 1) !~ /[CSPTG]/) && isvowel(n)) {
        m = m "H"
      }
    }
    else if (c == "K") {
      # 'K' directly after 'C' was already covered by the 'C'
      if (i == 1 || substr(w, i - 1, 1) != "C") m = m "K"
    }
    else if (c == "P") {
      if (substr(w, i, 2) == "PH") {
        m = m "F"
        i++            # consume the 'H'
      } else {
        m = m "P"
      }
    }
    else if (c == "Q") {
      m = m "K"
    }
    else if (c == "S") {
      if (substr(w, i, 2) == "SH") {
        m = m "X"
        i++            # consume the 'H'
      } else if (substr(w, i, 3) == "SIA" || substr(w, i, 3) == "SIO") {
        # the substring starts at the 'S' itself, so the cluster to test
        # is 'SIA'/'SIO' (a 'TIA'/'TIO' test can never match here)
        m = m "X"
        i += 2         # consume the two vowels
      } else {
        m = m "S"
      }
    }
    else if (c == "T") {
      if (substr(w, i, 2) == "TH") {
        m = m "0" # add '0' for 'TH' digraph to distinguish from regular 'T'
        i++
      } else if (substr(w, i, 3) == "TIA" || substr(w, i, 3) == "TIO") {
        m = m "X"
        i += 2         # consume the two vowels
      } else {
        m = m "T"
      }
    }
    else if (c == "V") {
      m = m "F"
    }
    else if (c == "W" || c == "Y") {
      # kept only when a vowel follows
      if (i < z && isvowel(n)) m = m c
    }
    else if (c == "X") {
      m = m "KS"
    }
    else if (c == "Z") {
      m = m "S"
    }
    # letters that encode as themselves; 'F', 'J' and 'R' are listed here
    # because no other branch handles them and they would otherwise be
    # dropped from the code
    else if (c == "F" || c == "J" || c == "L" || c == "M" || c == "N" || c == "R") {
      m = m c
    }
  }

  return m
}

# -----------------------------------------------------------------

# levenshtein(w1, w2): edit distance between strings w1 and w2, counting
# single-character deletions, insertions and substitutions as cost 1 each.
#
#   w1, w2   the two strings to compare (compared character by character,
#            exactly as given)
#   l1, l2   locals: lengths of w1 and w2
#   i, j     locals: row / column indices into the distance table
#   cst      local: substitution cost for the current pair (0 or 1)
#   diz      local array: diz[i, j] is the distance between the first i
#            characters of w1 and the first j characters of w2
#
# Returns diz[l1, l2]. Relies on min3(), which is defined outside this block.
function levenshtein(w1, w2, l1, l2, i, j, cst, diz) {
  l1 = length(w1)
  l2 = length(w2)

  # first column: turning an i-character prefix into "" costs i
  i = 0
  while (i <= l1) {
    diz[i, 0] = i
    i++
  }

  # first row: turning "" into a j-character prefix costs j
  j = 0
  while (j <= l2) {
    diz[0, j] = j
    j++
  }

  # fill the remaining cells row by row
  i = 1
  while (i <= l1) {
    j = 1
    while (j <= l2) {
      if (substr(w1, i, 1) == substr(w2, j, 1))
        cst = 0
      else
        cst = 1

      diz[i, j] = min3(diz[i-1, j] + 1,     # deletion
                       diz[i, j-1] + 1,     # insertion
                       diz[i-1, j-1] + cst) # substitution
      j++
    }
    i++
  }

  return diz[l1, l2]
}

# -----------------------------------------------------------------

# metaphone helper function
# Returns 1 when char contains one of the uppercase letters A, E, I, O, U
# and 0 otherwise (including for the empty string).
function isvowel(char) {
  if (char ~ /[AEIOU]/)
    return 1
  return 0
}

# -----------------------------------------------------------------
========== REMAINDER OF ARTICLE TRUNCATED ==========