| Deutsch English Français Italiano |
|
<lu8b6vFhjecU1@mid.individual.net> View for Bookmarking (what is this?) Look up another Usenet article |
Path: ...!news.roellig-ltd.de!open-news-network.org!weretis.net!feeder8.news.weretis.net!fu-berlin.de!uni-berlin.de!individual.net!not-for-mail
From: ted@loft.tnolan.com (Ted Nolan <tednolan>)
Newsgroups: comp.lang.tcl
Subject: Re: Tcl9: source files are interpreted as utf-8 by default
Date: 8 Jan 2025 22:06:24 GMT
Organization: loft
Lines: 250
Message-ID: <lu8b6vFhjecU1@mid.individual.net>
References: <vjhiar$3f9go$1@dont-email.me> <20250108162339.7c04023e@lud1.home> <vlmjs8$2tu2l$1@dont-email.me> <20250108172312.253b829c@lud1.home>
X-Trace: individual.net 2tpKOQqHAljVXJHcn7fs0gnR9qmqMD7Q+MJ4InieJvLEoPAs5N
X-Orig-Path: not-for-mail
Cancel-Lock: sha1:Tm2LTrsPsNAlT3TNGKer5rtQWGM= sha256:dU4cAaJG289/sFajoSRNQ42fQFG0miDNl440jWdIprY=
X-Newsreader: trn 4.0-test76 (Apr 2, 2001)
Bytes: 8223
In article <20250108172312.253b829c@lud1.home>, Luc <luc@sep.invalid> wrote:
>On Wed, 8 Jan 2025 19:32:24 -0000 (UTC), Rich wrote:
>
>>> Instead of main.tcl sourcing set_encoding.tcl, starter.tcl runs some
>>> 'encoding' command then sources main.tcl. Basically, a wrapper.
>>
>>Yes, that works. But then Uwe has to go and "wrapperize" all the
>>various scripts, on all the various client systems. So he's back in
>>the same boat of "major modifications need be made now" as changing all
>>the launching instances to launch with "-encoding iso-8859".
>
>True, but he has considered that kind of effort. His words:
>
>
>"That means we have to add "-encoding iso8859-1"
>to ALL source and ALL tclsh calls in ALL scripts.
>So far, so good(or bad?)."
>
>"What initially seems quite doable, looks more and more scary
>to me. First, if we ever may switch encoding to utf-8 we
>have to alter all those lines again."
>
>
>So in my mind, the "customer" accepts (though grudgingly) making
>large scale changes, but is concerned with possible new changes
>in the future. A wrapper can handle the future quite gracefully.
>
>
>>I've resisted pointing this one out, but long term, yes, updating all
>>the scripts to be utf-8 encoded is the right, long term, answer. But
>>that belies all the current, short term effort, involved in doing so.
>
>Actually, when I mentioned my migration case, I was also thinking that
>I could afford to do it because I was migrating to Linux and utf-8 was
>not even the future anymore, it was pretty much the present. But maybe
>running iconv wouldn't be acceptable because Uwe is (I assume) on
>Windows. Does a Windows user want to convert his files to utf-8?
>Won't that cause problems if the system is iso-8859-1? Windows still
>uses iso-8859-1, right?
>
>So yes, I guess Tcl9 causes trouble to 8859-1 users. Yes, sounds like
>it needs some fixing.
>
>More suggestions: how about not using Tcl9 just yet? I'm stil on 8.6
>and the water is fine. Early adopters tend to pay a price. In my case,
>absent packages.
>
>I have my own special case, I use Debian 9 which only ships 8.6.6 so
>I had to build 8.6.15 from source because I really need Unicode.
>But for some time I used Freewrap as a single-file batteries included
>Tcl/Tk interpreter. So maybe Uwe should just use a different interpreter,
>likely just a slightly older version of Tcl/Tk and embrace Tcl9 later.
>
>I wonder if one can hack the encoding issue on the Tcl9 source and
>rebuild it.
>
>
>--
>Luc
>>>
>
FWIW, you could check whether a source file is utf-8 easily enough. I wrote
a command to do that a while ago, based on some code from the web, and
it seemed to work OK for what I needed it for.
So read your suspect file in binary mode, call "string_is_utf" on it, and
if it is utf-8, you're good to source it.
(If it isn't, you can probably apply some more heuristics to the
string to guess what it actually is.)
==
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <tcl.h>
#ifdef WIN32
#include <io.h>
#define TCL_API __declspec(dllexport)
#else
#include <unistd.h>
#define TCL_API
#endif
#ifdef WIN32
#define dup _dup
#define fileno _fileno
#define fdopen _fdopen
#define close _close
#endif
/* Version identification string. "$Id$" looks like an unexpanded RCS/CVS
 * keyword and "TN" presumably the author's initials -- TODO confirm.
 * Nothing in the visible part of the file references this variable. */
static char rcsid[] = "$Id$ TN";
/*
* Function prototypes
*/
/* Exported entry point: TCL_API expands to __declspec(dllexport) on WIN32
 * and to nothing elsewhere. The definition is not visible in this excerpt. */
TCL_API int Isutf_Init(Tcl_Interp *interp);
/* File-local function taking (clientData, interp, objc, objv).
 * NOTE(review): presumably the implementation of the "string_is_utf"
 * command -- confirm against the definition, which is not visible here. */
static int isutf_string_is_utf(ClientData clientData, Tcl_Interp *interp,
int objc, Tcl_Obj *CONST objv[]);
/*
* This decoder by Bjoern Hoehrmann is the simplest I've found. It also works
* by feeding it a single byte at a time while keeping a state. The state is
* very useful for parsing UTF-8 that arrives in chunks over the network.
*
* http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
*
*/
// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
#define UTF8_ACCEPT 0
#define UTF8_REJECT 1
/*
 * Combined lookup table for the UTF-8 DFA.
 *
 * Entries 0..255 map an input byte to its character class; the trailing
 * comment on each row gives the byte range that row covers.
 *
 * Entries from index 256 onward form the transition table, 16 entries per
 * state (rows labelled s0..s8): the next state is read from
 * utf8d[256 + state*16 + class].
 */
static const uint8_t utf8d[] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};
#if 0
/*
 * Full single-byte decoder step, kept for reference but compiled out.
 *
 * Consumes one byte, updates the running code point in *codep and the DFA
 * state in *state, and returns the new state.
 */
static uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
    const uint32_t cls = utf8d[byte];

    if (*state == UTF8_ACCEPT) {
        /* Start of a new sequence: keep only the payload bits of the lead byte. */
        *codep = (0xff >> cls) & (byte);
    } else {
        /* Continuation byte: append its low six bits to the code point. */
        *codep = (byte & 0x3fu) | (*codep << 6);
    }

    *state = utf8d[256 + *state*16 + cls];
    return *state;
}
#endif
/*
*
* A simple validator/detector doesn't need the code point,
* so it could be written like this (Initial state is set to UTF8_ACCEPT):
*
*/
/*
 * Run `len` bytes starting at `str` through the UTF-8 DFA.
 *
 * state: in/out DFA state. Set it to UTF8_ACCEPT before the first chunk;
 *        it is updated in place, so a later call can continue where this
 *        one stopped.
 * str:   bytes to examine; only read, never modified.
 * len:   number of bytes at `str`. With len == 0 the state is returned
 *        unchanged.
 *
 * Returns the resulting state: UTF8_ACCEPT if the input so far ends on a
 * complete sequence, UTF8_REJECT as soon as an invalid byte is seen (the
 * scan stops there), or another value if the input ends mid-sequence.
 */
static uint32_t validate_utf8(uint32_t *state, const unsigned char *str, size_t len) {
    for (size_t i = 0; i < len; i++) {
        /* Only the byte's class is needed; the code point is never built. */
        const uint32_t type = utf8d[str[i]];

        *state = utf8d[256 + (*state) * 16 + type];
        if (*state == UTF8_REJECT) {
            break;
        }
    }
    return *state;
}
/*
* If the text is valid utf8 UTF8_ACCEPT is returned. If it's
* invalid UTF8_REJECT. If more data is needed, some other integer is returned.
*
*/
========== REMAINDER OF ARTICLE TRUNCATED ==========