Page 1 of 1

Decode encoded html text to UTF8

Posted: Sun Oct 02, 2011 9:12 pm
by Support
HTML does not allow Unicode characters; they must be encoded first. This script tries to decode them back to the original characters.

Example: convert %C3%A9 to é and also convert Ã© to é

Code: Select all

// Path of the file the decoded page is written to at the end of the script.
SaveAsFileName = 'c:\test.html';

// Presumably selects Perl-compatible regular-expression syntax for the
// RegEx object and the ~= operator used below - TODO confirm against the
// host application's documentation.
PerlRegEx = Yes;
// Presumably empties the script's output area before the run - TODO confirm.
Output.Clear;

// DecodeUTF8(txt)
// Returns a copy of txt in which two-byte UTF-8 sequences with lead byte
// C2 or C3 are collapsed into one character with code 128..255.
// Two passes are made:
//   1. percent-encoded pairs such as %C3%A9 are replaced,
//   2. raw character pairs Chr(194..195)+Chr(128..191) are replaced.
// Sequences with any other lead byte (code points above 255) are left as is.
function DecodeUTF8(txt);
begin
	Result = '';

	// This will convert %C3%A9 into é
	rx = New(RegEx);
	rx.Data = txt;
	// Group 1 = whole "%Cx%yz" match, group 2 = lead byte "c2"/"c3",
	// group 3 = continuation byte 80..bf.
	// NOTE(review): the mask uses lowercase hex digits while the example input
	// is uppercase; presumably matching is case-insensitive - confirm.
	rx.Mask = '(\%(c[2-3])\%([8-9a-b][0-9a-f]))';
	// p = position in txt of the first character not yet copied to Result.
	p = 1;
	while rx.Match do begin
		// Lead byte: take its second hex digit ('2' or '3') as a number...
		a = UpperCase(rx.Value[2]);
		a = Ord(a.SubStr[2,1])-Ord('0');
		// ...so that C2 contributes 0 and C3 contributes 64.
		a = (a - 2) * 64;
		// Continuation byte: convert its two hex digits to numbers by hand.
		b = UpperCase(rx.Value[3]);
		b1 = b.SubStr[1,1];
		// ~= presumably tests b1 against the regex; digits map to 0..9,
		// letters A..F map to 10..15.
		if b1 ~= '[0-9]' then b1 = Ord(b1)-Ord('0') else b1 = Ord(b1)-Ord('A')+10;
		b2 = b.SubStr[2,1];
		if b2 ~= '[0-9]' then b2 = Ord(b2)-Ord('0') else b2 = Ord(b2)-Ord('A')+10;
		// Byte value minus 128 leaves the offset 0..63 within the lead byte's block.
		b = b1*16+b2-128;
		// Copy the unmatched text before this match, then the decoded character
		// (code 128 + 0/64 + 0..63, i.e. 128..255).
		// Assumes rx.Pos[1] / rx.Len[1] are the start and length of group 1 in txt.
		Result = Result + txt.SubStr[p, rx.Pos[1]-p] + Chr(128+a+b);
		// Continue copying right after the match.
		p = rx.Pos[1] + rx.Len[1];
	end;
	// Append whatever follows the last match (all of txt if nothing matched).
	// Assumes SubStr with one argument returns the rest of the string.
	Result = Result + txt.SubStr[p];

	// This will convert Ã© into é
	// z walks through 128..255 in step with the (a, b) pairs, so the pair
	// Chr(a)+Chr(b) maps to code 128 + (a-194)*64 + (b-128).
	// NOTE(review): this pass also scans characters produced by the first pass,
	// so a decoded Chr(194)/Chr(195) followed by a literal Chr(128..191) would be
	// merged again; presumably Replace modifies Result in place - confirm both.
	z = 128;
	for a = 194 to 195 do begin
	  for b = 128 to 191 do begin
	    Result.Replace(Chr(a)+Chr(b), Chr(z));
	    z++;
	  end;
	end;

end;

// Download the page; the response body is read from Link.Data below.
Link = New(URL);
Link.Get('http://catalogue.proximus.be/PUB/gsmc_business/GSM_Catalog.jsp?language=fr');

// Write the decoded page to SaveAsFileName.
// NOTE(review): no explicit close/flush of f is visible here; presumably the
// host releases the file when the script ends - confirm.
f = New(File);
f.Open(SaveAsFileName);
f.Write(DecodeUTF8(Link.Data));
// Tell the user where the result was saved.
Display('Converted page save to...'+crlf+SaveAsFileName);