Mi blog lah! Το ιστολόγιό μου

20Jun/084

Converting between XKB and XML

I completed the stage that takes keyboard layout files from XKB (X.Org) and converts them to XML documents, based on a keyboard layout Relax NG schema. Then, these XML documents can also be converted back to keyboard layout files.

Here is an imaginary example of a keyboard layout file.

// Keyboard layout for the Zzurope country (code: zz).
// Yeah.

partial alphanumeric_keys alternate_group hidden
xkb_symbols "bare" {
   key <AE01> { [        1, exclam,      onesuperior,  exclamdown      ] };
};

partial alphanumeric_keys alternate_group
xkb_symbols "basic" {
   name[Group1] = "ZZurope";

   include "zz(bare)"

   key <AD04> { [        r, R,           ediaeresis,   Ediaeresis      ] };
   key <AC07> { [        j, J,           idiaeresis,   Idiaeresis      ] };
   key <AB02> { [        x, X,           oe,           OE              ] };
   key <AB04> { [        v, V,           registered,   registered      ] };
};

partial alphanumeric_keys alternate_group
xkb_symbols "extended" {
    include "zz(basic)"
    name[Group1] = "ZZurope Extended";
    key.type = "THREE_LEVEL"; // We use three levels.
    override key <AD01> {   type[Group1] = "SEPARATE_CAPS_AND_SHIFT_ALPHABETIC",
[ U1C9, U1C8], [  any,   U1C7 ]   }; // q
    override key <AD02> {   [ U1CC, U1CB, any,U1CA ],
type[Group1] = "SEPARATE_CAPS_AND_SHIFT_ALPHABETIC" }; // w
    key <BKSP> {
        type[Group1]="CTRL+ALT",
        symbols[Group1]= [ BackSpace,   Terminate_Server ]
    };
    key <BKSR> { virtualMods = AltGr, [ 1, 2 ] };
    modifier_map Control { Control_L };
    modifier_map Mod5   { <LVL3>, <MDSW> };
    key <BKST> { [1, 2,3, 4] };
};

When converted to an XML document, it looks like

<?xml version="1.0" encoding="UTF-8"?>
<layout layoutname="zz">
  <symbols>
    <mapoption>hidden</mapoption>
    <mapoption>xkb_symbols</mapoption>
    <mapname>bare</mapname>
    <mapmaterial>
      <tokenkey override="False">
        <keycodename>AE01</keycodename>
        <keysymgroup>
          <symbolsgroup>
            <symbol>1</symbol>
            <symbol>exclam</symbol>
            <symbol>onesuperior</symbol>
            <symbol>exclamdown</symbol>
          </symbolsgroup>
        </keysymgroup>
      </tokenkey>
    </mapmaterial>
  </symbols>
  <symbols>
    <mapoption>xkb_symbols</mapoption>
    <mapname>basic</mapname>
    <mapmaterial>
      <tokenname name="ZZurope"/>
      <tokeninclude>zz(bare)</tokeninclude>
      <tokenkey override="False">
        <keycodename>AD04</keycodename>
        <keysymgroup>
          <symbolsgroup>
            <symbol>r</symbol>
            <symbol>R</symbol>
            <symbol>ediaeresis</symbol>
            <symbol>Ediaeresis</symbol>
          </symbolsgroup>
        </keysymgroup>
      </tokenkey>
      <tokenkey override="False">
        <keycodename>AC07</keycodename>
        <keysymgroup>
          <symbolsgroup>
            <symbol>j</symbol>
            <symbol>J</symbol>
            <symbol>idiaeresis</symbol>
            <symbol>Idiaeresis</symbol>
          </symbolsgroup>
        </keysymgroup>
      </tokenkey>
      <tokenkey override="False">
        <keycodename>AB02</keycodename>
        <keysymgroup>
          <symbolsgroup>
            <symbol>x</symbol>
            <symbol>X</symbol>
            <symbol>oe</symbol>
            <symbol>OE</symbol>
          </symbolsgroup>
        </keysymgroup>
      </tokenkey>
      <tokenkey override="False">
        <keycodename>AB04</keycodename>
        <keysymgroup>
          <symbolsgroup>
            <symbol>v</symbol>
            <symbol>V</symbol>
            <symbol>registered</symbol>
            <symbol>registered</symbol>
          </symbolsgroup>
        </keysymgroup>
      </tokenkey>
    </mapmaterial>
  </symbols>
  <symbols>
    <mapoption>xkb_symbols</mapoption>
    <mapname>extended</mapname>
    <mapmaterial>
      <tokenname name="ZZurope Extended"/>
      <tokeninclude>zz(basic)</tokeninclude>
      <tokentype>THREE_LEVEL</tokentype>
      <tokenmodifiermap state="Control">
        <keycode value="Control_L"/>
      </tokenmodifiermap>
      <tokenmodifiermap state="Mod5">
        <keycodex value="LVL3"/>
        <keycodex value="MDSW"/>
      </tokenmodifiermap>
      <tokenkey override="True">
        <keycodename>AD01</keycodename>
        <keysymgroup>
          <symbolsgroup>
            <symbol>U1C9</symbol>
            <symbol>U1C8</symbol>
          </symbolsgroup>
          <symbolsgroup>
            <symbol>any</symbol>
            <symbol>U1C7</symbol>
          </symbolsgroup>
          <typegroup value="SEPARATE_CAPS_AND_SHIFT_ALPHABETIC"/>
        </keysymgroup>
      </tokenkey>
      <tokenkey override="True">
        <keycodename>AD02</keycodename>
        <keysymgroup>
          <symbolsgroup>
            <symbol>U1CC</symbol>
            <symbol>U1CB</symbol>
            <symbol>any</symbol>
            <symbol>U1CA</symbol>
          </symbolsgroup>
          <typegroup value="SEPARATE_CAPS_AND_SHIFT_ALPHABETIC"/>
        </keysymgroup>
      </tokenkey>
      <tokenkey override="False">
        <keycodename>BKSP</keycodename>
        <keysymgroup>
          <symbolsgroup>
            <symbol>BackSpace</symbol>
            <symbol>Terminate_Server</symbol>
          </symbolsgroup>
          <typegroup value="CTRL+ALT"/>
        </keysymgroup>
      </tokenkey>
      <tokenkey override="False">
        <keycodename>BKSR</keycodename>
        <keysymgroup>
          <symbolsgroup>
            <symbol>1</symbol>
            <symbol>2</symbol>
          </symbolsgroup>
          <tokenvirtualmodifiers value="AltGr"/>
        </keysymgroup>
      </tokenkey>
      <tokenkey override="False">
        <keycodename>BKST</keycodename>
        <keysymgroup>
          <symbolsgroup>
            <symbol>1</symbol>
            <symbol>2</symbol>
            <symbol>3</symbol>
            <symbol>4</symbol>
          </symbolsgroup>
        </keysymgroup>
      </tokenkey>
    </mapmaterial>
  </symbols>
</layout>

When we convert the XML document back to the XKB format, it looks like

hidden xkb_symbols "bare"
{
	key <AE01> { [ 1, exclam, onesuperior, exclamdown ] };
};

xkb_symbols "basic"
{
	name = "ZZurope";
	include "zz(bare)"
	key <AD04> { [ r, R, ediaeresis, Ediaeresis ] };
	key <AC07> { [ j, J, idiaeresis, Idiaeresis ] };
	key <AB02> { [ x, X, oe, OE ] };
	key <AB04> { [ v, V, registered, registered ] };
};

xkb_symbols "extended"
{
	name = "ZZurope Extended";
	include "zz(basic)"
	key.type = "THREE_LEVEL";
	modifier_map Control { Control_L };
	modifier_map Mod5 { <LVL3>, <MDSW> };
	override key <AD01> { [ U1C9, U1C8 ], [ any, U1C7 ], type = "SEPARATE_CAPS_AND_SHIFT_ALPHABETIC"  };
	override key <AD02> { [ U1CC, U1CB, any, U1CA ], type = "SEPARATE_CAPS_AND_SHIFT_ALPHABETIC"  };
	key <BKSP> { [ BackSpace, Terminate_Server ], type = "CTRL+ALT"  };
	key <BKSR> { [ 1, 2 ], virtualMods = AltGr  };
	key <BKST> { [ 1, 2, 3, 4 ] };
};

Some things are missing such as partial, alphanumeric_keys and alternate_group, which I discussed with Sergey and he said they should be ok to go away.

In addition, we simplify by keeping just Group1 (we do not specify it, as it is implied).

I performed the round-trip with all layout files, and all parsed and validated OK (there is some extra work with the level3 file remaining, though).

Some issues that are remaining, include

  • Figuring out how to use XLink to link to documents in the same folder (+providing a parameter; the name of the variant), and how to represent that in the Relax NG schema.
  • Sort the layout entries by keycode value.
14May/0816

Should UI strings in source code have non-ASCII characters?

There is a discussion going on at desktop-devel about whether the UI strings in the source code should also have non-ASCII characters. For example, should typical strings with double-quotes have those fancy Unicode double quotes?

printf(_("Could not find file “%s”\n"));

instead of

printf(_("Could not find file \"%s\"\n"));

The general view from the replies is to go ahead and add those nice Unicode characters.

Actually, there are UI messages already with non-ASCII characters (the ellipsis character, …) in GNOME 2.22:

  1. glade3
  2. epiphany

In GNOME 2.24, there are even more (with ellipsis):

  1. gucharmap
  2. epiphany
  3. gnome-terminal
  4. gedit
  5. glade3

Regarding the fancy Unicode double quotes, there are UI strings in GNOME 2.22 (same list for 2.24) in the following packages:

  1. evince
  2. cheese
  3. epiphany
  4. eog
  5. gnome-doc-utils

What are the arguments against having non-ASCII characters in UI strings?

  1. There might be systems that still use 8-bit legacy encodings. In this case, the UTF-8 encoded strings may not be displayed properly. However, when I tried to demonstrate this on my system (Ubuntu 8.04), I failed miserably. I downloaded a small GTK2 text editor (called tea), I changed a source UI string to include “” and ellipsis, compiled and installed. I then opened a shell, set LANG to POSIX (or C), and ran the text editor. The UI message was proper Unicode and I could even type non-ASCII in the text editor. I resorted to changing a system locale (I picked en_IN) to ISO-8859-1, then logged out. In the login screen it did not show the 8-bit encoding. If someone has a proper legacy 8-bit encoding system with GNOME (OpenBSD, FreeBSD, etc), could you please try it out?
  2. As Alan Cox mentioned in the thread, the canonical way to deal with UI strings in the source code should be to keep as ASCII, and put any fancy Unicode characters in the translation files (even for en_US, get an en_US translation file).

Is GNOME (or components) used in a legacy 7-bit/8-bit environment?

If there is any reason to keep UI strings in the source code as plain ASCII, speak now, or the Unicode flood gates are about to open.

Update 16 May 2008: There is a document at the ISO/IEC 9899 website (C programming language), that mentions the issue of character sets in C. It is http://www.open-std.org/jtc1/sc22/wg14/www/docs/C99RationaleV5.10.pdf.

On page 26, section 5.2.1, it says

The C89 Committee ultimately came to remarkable unanimity on the subject of character set requirements. There was strong sentiment that C should not be tied to ASCII, despite its heritage and despite the precedent of Ada being defined in terms of ASCII. Rather, an implementation is required to provide a unique character code for each of the printable graphics used by C, and for each of the control codes representable by an escape sequence. (No particular graphic representation for any character is prescribed; thus the common Japanese practice of using the glyph “¥” for the C character “\” is perfectly legitimate.) Translation and execution environments may have different character sets, but each must meet this requirement in its own way. The goal is to ensure that a conforming implementation can translate a C translator written in C.

For this reason, and for economy of description, source code is described as if it undergoes the same translation as text that is input by the standard library I/O routines: each line is terminated by some newline character regardless of its external representation.

With the concept of multibyte characters, “native” characters could be used in string literals and character constants, but this use was very dependent on the implementation and did not usually work in heterogenous environments. Also, this did not encompass identifiers.

It then goes on with an addition to C99:

A new feature of C99: C99 adds the concept of universal character name (UCN) (see §6.4.3) in order to allow the use of any character in a C source, not just English characters. The primary goal of the Committee was to enable the use of any “native” character in identifiers, string literals and character constants, while retaining the portability objective of C.

Both the C and C++ committees studied this situation, and the adopted solution was to introduce a new notation for UCNs. Its general forms are \unnnn and \Unnnnnnnn, to designate a given character according to its short name as described by ISO/IEC 10646. Thus, \unnnn can be used to designate a Unicode character. This way, programs that must be fully portable may use virtually any character from any script used in the world and still be portable, provided of course that if it prints the character, the execution character set has representation for it.

Of course the notation \unnnn, like trigraphs, is not very easy to use in everyday programming; so there is a mapping that links UCN and multibyte characters to enable source programs to stay readable by users while maintaining portability. Given the current state of multibyte encodings, this mapping is specified to be implementation-defined; but an implementation can provide the users with utility programs that do the conversion from UCNs to “native” multibytes or vice versa, thus providing a way to exchange source files between implementations using the UCN notation.

Update 7 Aug 2008: PEP 8, Style Guide for Python Code, under Encodings, says

    For Python 3.0 and beyond, the following policy is prescribed for
    the standard library (see PEP 3131): All identifiers in the Python
    standard library MUST use ASCII-only identifiers, and SHOULD use
    English words wherever feasible (in many cases, abbreviations and
    technical terms are used which aren't English). In addition,
    string literals and comments must also be in ASCII. The only
    exceptions are (a) test cases testing the non-ASCII features, and
    (b) names of authors. Authors whose names are not based on the
    latin alphabet MUST provide a latin transliteration of their
    names.

    Open source projects with a global audience are encouraged to
    adopt a similar policy.

(Emphasis mine)

11Nov/070

Localisation issues in home directory folders (xdg-user-dirs)

In new distributions such as Ubuntu 7.10 there is now support for folder names of personal data in your local language. What this means is that ~/Desktop can now be called ~/Επιφάνεια εργασίας. You also get a few more default folders, including ~/Music, ~/Documents, ~/Pictures and so on.

This functionality of localised home folders has become available thanks to a new FreeDesktop standard, XDG-USER-DIRS. xdg-user-dirs can be localised, and the current localisations are available at xdg-user-dirs/po.

A potential issue arises when a user logs in with different locales; how does the system switch between the localised versions of the folder names? For GNOME there is a migration tool; as soon as you login into your account with a different locale, the system will prompt whether you wish to switch the names from one language to another. This is available through the xdg-user-dirs-gtk application.

Another issue is with users who use the command line quite often; switching between two languages (for those languages that use a script other than Latin) tends to become cumbersome, especially if you have not set up your shell for intelligent completion. In addition, when you connect remotely using SSH, you may not be able to type in the local language at the initial computer which would make work very annoying.

Furthermore, there have been reports with KDE applications not working; if someone can bug report it and post the link it would be great. The impression I got was that some installations of KDE did not read off the filesystem in UTF-8 but in a legacy 8-bit encoding. This requires further investigation.

Moreover, OpenOffice.org requires some integration work to follow the xdg-user-dirs standard; apparently it has its own option as to which folder it will save into any newly created files. I believe this will be resolved in the near future.

Now, if we just installed Ubuntu 7.10 or Fedora 8, and we got, by default, localised subfolders in our home directory (which we may not prefer), what can we do to revert to non-localised folders?

The lazy way is to logout, choose an English locale as the default locale for the system and log in. You will be presented with the xdg-user-dirs-gtk migration tool (shown above) that will give you the option to switch to English folder names for those personal folders.

Clarification: It is implied for this workaround (logout and login thing), you then log out again, set the language to the localised one (i.e. Greek) and log in. This time, when the system asks to rename the personal folders, you simply answer no, and you end up with a localised desktop but personal folders in English. Mission really accomplished.

If you are of the tinkering type, the files to change manually are

$ cat ~/.config/user-dirs.locale

el_GR

$

and

$ cat ~/.config/user-dirs.dirs

# This file is written by xdg-user-dirs-update
# If you want to change or add directories, just edit the line you're
# interested in. All local changes will be retained on the next run
# Format is XDG_xxx_DIR="$HOME/yyy", where yyy is a shell-escaped
# homedir-relative path, or XDG_xxx_DIR="/yyy", where /yyy is an
# absolute path. No other format is supported.
#
XDG_DESKTOP_DIR="$HOME/Επιφάνεια εργασίας"
XDG_DOWNLOAD_DIR="$HOME/Επιφάνεια εργασίας"
XDG_TEMPLATES_DIR="$HOME/Πρότυπα"
XDG_PUBLICSHARE_DIR="$HOME/δημόσιο"
XDG_DOCUMENTS_DIR="$HOME/Έγγραφα"
XDG_MUSIC_DIR="$HOME/Μουσική"
XDG_PICTURES_DIR="$HOME/Εικόνες"
XDG_VIDEOS_DIR="$HOME/Βίντεο"

Personally I believe that having localised names appear under the home folder is good for the majority of users, as they will be able to match what is shown in Locations with the actual names on the filesystem.

There will be cases that software has to be updated and bugs fixed (such as in backup tools). As we proceed with more advanced internationalisation/localisation support in Linux, it is desirable to follow forward, and fix problematic software.

However, if enough popular support arises with clear arguments (I am referring to Greek-speaking users and a current discussion) for default folder names in the English language, we could follow the popular demand.

Also see the relevant blog post New Dirs in Gutsy: Documents, Music, Pictures, Blah, Blah by Moving to Freedom.

29Mar/070

Convert your legacy font to Unicode

There exist quite a few legacy fonts, from the time that 8-bit-style encodings were the norm. Nowadays, most (if not all) spoken and ancient scripts have been added to the Unicode standard.
Therefore, if you have a legacy font, you can convert to Unicode using a guide by William J Poser. The guide uses Linear B as an example.

The program mentioned in the guide is pfaedit, which is now known as FontForge. FontForge is available in your Ubuntu distribution; simply search using the package manager.

Once you have a Unicode font, the next step is to prepare an input method so that you can write in this script. But that's another blog post.

9Jan/070

Creating a new locale on the OLPC

When you run the OLPC software you currently have access only to the English locales.

If you want to enable Greek support, you need to run (as root)

localedef -v -c -i /usr/share/i18n/locales/el_GR -f UTF-8 /usr/lib/locale/el_GR/

localedef -v -c -i /usr/share/i18n/locales/el_GR -f UTF-8 /usr/lib/locale/el_GR.utf8/

You will get a bunch of warnings. You can ignore them for now.

The localedef command compiles the source locale information found at /usr/share/i18n/locales/el_GR and places the resulting files at
/usr/lib/locale/el_GR/ and /usr/lib/locale/el_GR.utf8/ (both directories contain the same files, so you can also make a link from one to another). The reason we make two versions is that we can use either el_GR or el_GR.utf8 in the applications. Both use UTF-8 as the base encoding which is always nice.
For other locales, replace el_GR with the locale name of your country.

To activate the Greek locale, you need to create a file /etc/sysconfig/i18n and add the text

LANG=el_GR.utf8

LANGUAGE=el:en

Now you need to place the translated applications (.mo format) into

/usr/share/locale/el/LC_MESSAGES/

and restart your virtual machine (or laptop (hint hint)).

6Jul/06Off

Multimedia support in Ubuntu Linux 6.06

With Ubuntu Linux 6.06, it is much clearer how to install those codecs in order to get broad multimedia file support.

In Ubuntu, the multimedia infrastructure is handled by GStreamer; you install GStreamer plugins and any application that uses GStreamer can immediately benefit from the new codec support.

A typical installation of Ubuntu will bring in the free and open-source codecs by default. This includes the base gstreamer plugins package, gstreamer0.10-plugins-base that covers

  1. /usr/lib/gstreamer-0.10/libgstadder.so
  2. /usr/lib/gstreamer-0.10/libgstaudioconvert.so
  3. /usr/lib/gstreamer-0.10/libgstaudiorate.so
  4. /usr/lib/gstreamer-0.10/libgstaudioresample.so
  5. /usr/lib/gstreamer-0.10/libgstaudiotestsrc.so
  6. /usr/lib/gstreamer-0.10/libgstcdparanoia.so
  7. /usr/lib/gstreamer-0.10/libgstdecodebin.so
  8. /usr/lib/gstreamer-0.10/libgstffmpegcolorspace.so
  9. /usr/lib/gstreamer-0.10/libgstogg.so
  10. /usr/lib/gstreamer-0.10/libgstplaybin.so
  11. /usr/lib/gstreamer-0.10/libgstsubparse.so
  12. /usr/lib/gstreamer-0.10/libgsttcp.so
  13. /usr/lib/gstreamer-0.10/libgsttheora.so
  14. /usr/lib/gstreamer-0.10/libgsttypefindfunctions.so
  15. /usr/lib/gstreamer-0.10/libgstvideo4linux.so
  16. /usr/lib/gstreamer-0.10/libgstvideorate.so
  17. /usr/lib/gstreamer-0.10/libgstvideoscale.so
  18. /usr/lib/gstreamer-0.10/libgstvideotestsrc.so
  19. /usr/lib/gstreamer-0.10/libgstvolume.so
  20. /usr/lib/gstreamer-0.10/libgstvorbis.so

With a properly encoded multimedia file, you can play music or video with subtitles. Such good codecs are Ogg, Vorbis and Theora. You can also rip CDs; cdparanoia is also there.
By default you also get the good package, gstreamer0.10-plugins-good
It contains

  1. /usr/lib/gstreamer-0.10/libgst1394.so
  2. /usr/lib/gstreamer-0.10/libgstaasink.so
  3. /usr/lib/gstreamer-0.10/libgstalaw.so
  4. /usr/lib/gstreamer-0.10/libgstalpha.so
  5. /usr/lib/gstreamer-0.10/libgstapetag.so
  6. /usr/lib/gstreamer-0.10/libgstavi.so
  7. /usr/lib/gstreamer-0.10/libgstautodetect.so
  8. /usr/lib/gstreamer-0.10/libgstcacasink.so
  9. /usr/lib/gstreamer-0.10/libgstcdio.so
  10. /usr/lib/gstreamer-0.10/libgsteffectv.so
  11. /usr/lib/gstreamer-0.10/libgstgoom.so
  12. /usr/lib/gstreamer-0.10/libgstid3demux.so
  13. /usr/lib/gstreamer-0.10/libgstlevel.so
  14. /usr/lib/gstreamer-0.10/libgstefence.so
  15. /usr/lib/gstreamer-0.10/libgstmulaw.so
  16. /usr/lib/gstreamer-0.10/libgstossaudio.so
  17. /usr/lib/gstreamer-0.10/libgstrtp.so
  18. /usr/lib/gstreamer-0.10/libgstrtsp.so
  19. /usr/lib/gstreamer-0.10/libgstsmpte.so
  20. /usr/lib/gstreamer-0.10/libgsttaglib.so
  21. /usr/lib/gstreamer-0.10/libgstudp.so
  22. /usr/lib/gstreamer-0.10/libgstvideobox.so
  23. /usr/lib/gstreamer-0.10/libgstvideoflip.so
  24. /usr/lib/gstreamer-0.10/libgstwavenc.so
  25. /usr/lib/gstreamer-0.10/libgstwavparse.so
  26. /usr/lib/gstreamer-0.10/libgstauparse.so
  27. /usr/lib/gstreamer-0.10/libgstdebug.so
  28. /usr/lib/gstreamer-0.10/libgstnavigationtest.so
  29. /usr/lib/gstreamer-0.10/libgstalphacolor.so
  30. /usr/lib/gstreamer-0.10/libgstcairo.so
  31. /usr/lib/gstreamer-0.10/libgstflxdec.so
  32. /usr/lib/gstreamer-0.10/libgstmatroska.so
  33. /usr/lib/gstreamer-0.10/libgstvideomixer.so
  34. /usr/lib/gstreamer-0.10/libgstcutter.so
  35. /usr/lib/gstreamer-0.10/libgstmultipart.so
  36. /usr/lib/gstreamer-0.10/libgstflac.so
  37. /usr/lib/gstreamer-0.10/libgstjpeg.so
  38. /usr/lib/gstreamer-0.10/libgstpng.so
  39. /usr/lib/gstreamer-0.10/libgstspeex.so
  40. /usr/lib/gstreamer-0.10/libgstgconfelements.so
  41. /usr/lib/gstreamer-0.10/libgstshout2.so
  42. /usr/lib/gstreamer-0.10/libgstvideobalance.so
  43. /usr/lib/gstreamer-0.10/libgsticydemux.so
  44. /usr/lib/gstreamer-0.10/libgstximagesrc.so
  45. /usr/lib/gstreamer-0.10/libgstannodex.so
  46. /usr/lib/gstreamer-0.10/libgstgdkpixbuf.so
  47. /usr/lib/gstreamer-0.10/libgsthalelements.so
  48. /usr/lib/gstreamer-0.10/libgstdv.so

This includes generic AVI support, access to digital video and Firewire devices, visualisers, the Matroska codec, access to shoutcast servers, the speex audio codec, the flac codec and many more.

At this point, you can install Pitivi, a gstreamer-enabled video editor written in Python that helps you create your own movie. Make sure you install gstreamer0.10-gnonlin which enables non-linear editing in gstreamer.

Up to here you got free and open-source software.

You can continue with more codecs by installing the package gstreamer0.10-plugins-ugly. This package is not part of the official Ubuntu distribution; you need to enable the Universe repository. Use System/Administration/Synaptic Package Manager to install these additional packages.
Ugly are the plugins and codecs that may have distribution problems in some countries.

Ugly includes

  1. /usr/lib/gstreamer-0.10/libgsta52dec.so
  2. /usr/lib/gstreamer-0.10/libgstasf.so
  3. /usr/lib/gstreamer-0.10/libgstdvdlpcmdec.so
  4. /usr/lib/gstreamer-0.10/libgstdvdread.so
  5. /usr/lib/gstreamer-0.10/libgstdvdsub.so
  6. /usr/lib/gstreamer-0.10/libgstiec958.so
  7. /usr/lib/gstreamer-0.10/libgstmad.so
  8. /usr/lib/gstreamer-0.10/libgstmpeg2dec.so
  9. /usr/lib/gstreamer-0.10/libgstmpegaudioparse.so
  10. /usr/lib/gstreamer-0.10/libgstmpegstream.so
  11. /usr/lib/gstreamer-0.10/libgstrmdemux.so
  12. /usr/lib/gstreamer-0.10/libgstsid.so

This package will bring in, among others, DVD playback and subtitle support, ASF file support, MP3 support (MAD package) and MPEG2 video playback.
You can also get MP3 support if you install the gstreamer0.10-fluendo-mp3 plugin which is available from Universe as well. This package is probably free to use in any country thanks to the efforts of the Fluendo team.

It appears that if you install ugly, it is good to install gstreamer0.10-ffmpeg so that you get support for

FFmpeg plugin for GStreamer

This GStreamer plugin supports a large number of audio and video compression
formats through the use of the FFmpeg library. The plugin contains GStreamer
elements for encoding 40+ formats (MPEG, DivX, MPEG4, AC3, DV, ...), decoding
90+ formats
(AVI, MPEG, OGG, Matroska, ASF, ...), demuxing 30+ formats, and
colorspace conversion.

Finally, there is a package gstreamer0.10-plugins-bad with plugins of potentially suboptimal quality. It includes

  1. /usr/lib/gstreamer-0.10/libgstbz2.so
  2. /usr/lib/gstreamer-0.10/libgstcdxaparse.so
  3. /usr/lib/gstreamer-0.10/libgstdtsdec.so
  4. /usr/lib/gstreamer-0.10/libgstfreeze.so
  5. /usr/lib/gstreamer-0.10/libgstgsm.so
  6. /usr/lib/gstreamer-0.10/libgstmms.so
  7. /usr/lib/gstreamer-0.10/libgstmodplug.so
  8. /usr/lib/gstreamer-0.10/libgstmusepack.so
  9. /usr/lib/gstreamer-0.10/libgstqtdemux.so
  10. /usr/lib/gstreamer-0.10/libgsttrm.so
  11. /usr/lib/gstreamer-0.10/libgstspeed.so
  12. /usr/lib/gstreamer-0.10/libgstswfdec.so
  13. /usr/lib/gstreamer-0.10/libgsttta.so
  14. /usr/lib/gstreamer-0.10/libgstvideo4linux2.so
  15. /usr/lib/gstreamer-0.10/libgstwavpack.so
  16. /usr/lib/gstreamer-0.10/libgstxingheader.so
  17. /usr/lib/gstreamer-0.10/libgstneonhttpsrc.so

With bad you get GSM audio codec support, MMS support, QT playback support for some formats, Flash (SWF) playing support, Video4Linux2 support, MUSEPACK support and a few more.

12Jun/060

Ⲙⲓⲁ ⲁⲕⲟ̀ⲙⲁ ⲉⲅⲅⲣⲁⲫⲏ̀ ⲓⲥⲧⲟⲗⲟⲅⲓ̀ⲟⲩ

Ⲁⲛ ⲙⲡⲟⲣⲉ̀ⲥⲉⲧⲉ ⲛⲁ ⲧⲟ ⲇⲓⲁⲃⲁ̀ⲥⲉⲧⲉ ⲁⲩⲧⲟ̀, ⲉ̀ⲭⲉⲧⲉ ⲥⲧⲟ ⲥⲩ̀ⲥⲧⲏⲙⲁ̀ ⲥⲁⲥ ⲅⲣⲁⲙⲙⲁⲧⲟⲥⲉⲓⲣⲉ̀ⲥ ⲅⲓⲟⲩ̀ⲛⲓⲕⲟⲛⲧ ⲙⲉ ⲩⲡⲟⲥⲧⲏ̀ⲣⲓⲝⲏ ⲕⲟⲡⲧⲓⲕⲱ̀ⲛ ⲕⲁⲓ ⲝⲉ̀ⲣⲉⲧⲉ ⲉⲗⲗⲏⲛⲓⲕⲁ̀.

Ⲅⲓⲁ ⲁⲡ̀ⲟⲇⲉⲓⲝⲏ ⲟ̀ⲧⲓ ⲕⲁⲧⲁⲫⲉ̀ⲣⲁⲧⲉ ⲛⲁ ⲇⲓⲁⲃⲁ̀ⲥⲉⲧⲉ ⲧⲟ ⲕⲉⲓ̀ⲙⲉⲛⲟ ⲁⲩⲧⲟ̀, ⲅⲣⲁ̀ⲯⲧⲉ ⲱⲋ ⲥⲭⲟ̀ⲗⲓⲟ ⲧⲏ ⲫⲣⲁ̀ⲥⲏ "Ⲡⲟⲩ ⲑⲁ ⲃⲣⲓ̀ⲥⲕⲉⲥⲉ ⲥⲧⲓⲥ ⲧⲣⲓⲁ̀ⲛⲧⲁ Ⲙⲁⲓ̈̀ⲟⲩ".

Update: The encoding of this post is UTF-8 (Unicode). You are required to have the appropriate font installed. It is left as an exercise to the reader to figure out which blog entry this post is about (hint: blog entry #553). The careful reader should be able to check the UTF-8 encoded bytes and discern which Unicode block the post is about. Reading the post requires the knowledge of basic Greek.

12Jun/060

Can you read Coptic?

Coptic is the most recent phase of ancient Egyptian. It is the direct descendant of the ancient language written in Egyptian hieroglyphic, hieratic, and demotic scripts. The Coptic alphabet is a slightly modified form of the Greek alphabet, with some letters (which vary from dialect to dialect) deriving from demotic. As a living language of daily conversation, Coptic flourished from ca. 200 to 1100. The last record of its being spoken was during the 17th century. Coptic survives today as the liturgical language of the Coptic Orthodox Church. Egyptian Arabic is the spoken and national language of Egypt today.

Source: Wikipedia on Coptic Language

Coptic, as used today, has signs of influence from the Greek language. If you speak Greek, you should be able to recognise every entry in the screenshot (it comes from the dictionary that is available from http://copticlang.bizhat.com/).

There is a Coptic Unicode block and there are at least three Unicode fonts available with Coptic glyphs.

I am not aware of a keyboard definition to write Unicode Coptic; Coptic uses several combining diacritical marks (accents) and appears to surpass even Ancient Greek/Polytonic in this respect. An easy way to create an input method (one that is easy to write with?) would be to start from the Greek keyboard layout and replace the codepoints with the Coptic ones. For the 9 combining diacritical marks, three keys should be dedicated, accessible through 1) pressing as is, 2) pressing with shift, 3) pressing with Alt. To avoid using dead keys, there would be a requirement to type first the letter and then the diacritical mark.

In modern Greek we use the ";:" key (on the right of L) to produce the acute and the diaeresis (with Shift) accents. The second suitable key could be the ' " key while the third the "/?" (debateable).

There are several efforts to convert non-Unicode fonts distributed by the Coptic Church website. Moheb added the Coptic glyphs to the Freefonts. There is more work required to get them added by default to Linux distros. There is a discussion forum on Coptic.

Therefore, the most important task is to create a keyboard layout so that one can write in Unicode Coptic.

Then, existing (non-Unicode) text should be converted to Unicode Coptic so that there is material available. Moheb created support for this in iconv (glibc). There should be a bug report at http://sources.redhat.com/bugzilla/ under product glibc, component libc.

Source: Wikipedia (Coptic script)

There exist free Unicode fonts already to have the text displayed. The conversion of the Coptic Church fonts to Unicode would be beneficial as well. To have them included in Linux distros, the distribution license should be set to one of the FLOSS licenses. An option could be to add to the DejaVu fonts (allowed by the license) so that there is a general purpose open font that is easy to work with.

I, for one, would love to write Greek using a Coptic keyboard layout and a Coptic Unicode font. :)

Update: Screenshot that demonstrates how well Unicode Coptic fonts behave when combining marks are used.

Update #2: You can test the above on your system by opening this OpenDocument file using OpenOffice.org or any other OpenDocument-compatible application. OpenOffice.org was verified that it can show combining marks. Your mileage may vary, your comments will be appreciated.

Get Unicode fonts with Coptic coverage.

9Mar/062

Taxis and security

It is quite encouraging that citizens taxed in Greece are able to file their tax reports through the Web, at the Taxis Website. Sadly, it has been reported that standard-compliant Web browsers are not supported by the Taxis Website. If you are affected, do complain about it! If you file taxes and you are affected, file a report.
Let's see some more issues.

A. The main login page is not configured properly with regards to the autocomplete feature found in modern browsers; as is, your username and password get saved by default in your browser. If your computer is stolen or a trojan horse gets installed on your computer, your tax details are gone! :(

The Web developer should modify the HTML code from

<span class="textblue2"><b>user name: </b></span>
<input type="text" name="username" maxlength="40" size="15" value="testing">
<P><span class="textblue2"><B>password:</B></span>
<input type="password" name="password" maxlength="40" size="17" value="testing">

to

<span class="textblue2"><b>user name: </b></span>
<input type="text" name="username" autocomplete="off" maxlength="40"...
<P><span class="textblue2"><B>password:</B></span>
<input type="password" name="password" autocomplete="off" maxlength="40"...

B. The page http://webtax.gsis.gr/taxisnet/login.do claims that users are protected by Verisign (SSL/TLS). Quite sadly, the intent has probably been that users will connect through the proper URL, at https://webtax.gsis.gr/taxisnet/login.do. Dear Taxis, you should place an HTTP redirection to move all users to the SSL/TLS-protected URL. You are in breach of your Verisign license!

The image “http://static.flickr.com/55/110197352_d60be48ab3_o.png” cannot be displayed, because it contains errors.

I will follow on the above report here.

Actually, it would be much better if the web server were SSL/TLS only (no plain HTTP version available). The web server should be configured so that any access to a URL under http://webtax.gsis.gr/... redirects to the corresponding URL under https://webtax.gsis.gr/.
C. What is worst of all, the website provides content in the 8859-7 8-bit legacy encoding. It is much better to convert to Unicode and UTF-8. I do not know if users have to write text in Greek for their tax forms...
I don't file taxes, so I am not sure if there are more issues once you log on.

Update: The http://webtax.gsis.gr/taxisnet/login.do URL does not work anymore (it forwards to another Website which is down). I did not hear back from Verisign; it's possible that the two events are linked together.

26Jul/050

░░░▒▒▒▓▓▓███

►☺◄ ۩ ۞

This is a demonstration of the use of Unicode on your Website. The symbols above are text characters rendered with fonts, not images. You should be able to view them if you have the appropriate fonts installed. Windows XP users should be able to view them by default if they use Mozilla Firefox.

When you build a Website, you have the option to choose the encoding of the text. Typically people choose iso-8859-1, windows-1251 or similar 8-bit encodings limited to at most 256 characters. If you choose utf-8, however, you can add any character from a repertoire of many thousands of characters, from any written language, including symbols and dingbats.

4Dec/04Off

Ελληνικά στην αλληλογραφία, μέρος πρώτο

Πρέπει να λαμβάνετε γράμματα / ανακοινώσεις από μερικούς δικτυακούς τόπους όπου η κωδικοποίηση για τα ελληνικά δεν είναι σωστή, είτε στο σώμα του μηνύματος, είτε στην κεφαλίδα (From: "Ανακοίνωση" ).

Συγκεκριμένα, δεν καθορίζεται η κωδικοποίηση οπότε είναι θέμα εξ ορισμού ρυθμίσεων του παραλήπτη για να δει το αποτέλεσμα.

Ας δούμε πως μπορείτε μέσα από μια εφαρμογή PHP να στείλετε αλληλογραφία με ελληνικά. Το ίδιο μπορεί να γίνει και από άλλες γλώσσες, όπως Perl και Python.

<?php
include('Mail.php');
include('Mail/mime.php');

$from = "From: \"" . mb_encode_mimeheader('Όνομα Αποστολέα') . "\" < αποστολέας στο gmail τελεία com>";
$to = mb_encode_mimeheader('Όνομα Παραλήπτη') . " < παραλήπτης στο gmail τελεία com>";
$subject = 'Θέμα γράμματος';
$body = 'Περιεχόμενο του γράμματος.';

mb_send_mail($to, $subject, $body, $from);
?>

Το γράμμα που θα παραχθεί θα μοιάζει με

Από: Όνομα Αποστολέα < αποστολέας στο gmail τελεία com>
Προς: Όνομα Παραλήπτη < παραλήπτης στο gmail τελεία com>
Θέμα: Θέμα γράμματος

Περιεχόμενο του γράμματος.

Απαιτεί την εγκατάσταση του πακέτου php-mbstring που το έχουν όλες οι καλές διανομές Linux. Διαφορετικά είναι δυνατόν
να έχετε το ίδιο αποτέλεσμα αλλά θα κάνετε τα παραπάνω χειρωνακτικά.

Ακόμα, πρέπει να ρυθμίσετε το /etc/php.ini με τα παρακάτω:

[mbstring]
; language for internal character representation.
; Neutral σημαίνει Unicode
mbstring.language = Neutral

; internal/script encoding.
; Some encoding cannot work as internal encoding.
; (e.g. SJIS, BIG5, ISO-2022-*)
mbstring.internal_encoding = UTF-8

; http input encoding.
mbstring.http_input = UTF-8

; http output encoding. mb_output_handler must be
; registered as output buffer to function
mbstring.http_output = UTF-8

; enable automatic encoding translation according to
; mbstring.internal_encoding setting. Input chars are
; converted to internal encoding by setting this to On.
; Note: Do _not_ use automatic encoding translation for
; portable libs/applications.
mbstring.encoding_translation = On

; substitute_character used when character cannot be converted
; one from another
; σημαίνει ότι στην μετατροπή αν κάτι πάει στραβά, θα εκτυπώσει τον κωδικό U+xxxx του χαρακτήρα.
mbstring.substitute_character = long;

Αν είστε χρήστης της εφαρμογής phplist, ενημερώστε τη σελίδα αυτή.

Σημείωση: Όλα τα παραπάνω είναι σε κωδικοποίηση utf-8 (Unicode).

13Sep/04Off

iso-8859-7 ή….. utf-8;

Ασφαλώς και utf-8!

Δυστυχώς δεν έχει περάσει ακόμα το μήνυμα για προτίμηση του Unicode (κωδικοποίηση utf-8) αντί του iso-8859-7. Πριν από μερικά χρόνια υπήρχαν σημαντικές εφαρμογές που δεν μπορούσαν να απεικονίσουν αλφαριθμητικά με κωδικοποίηση utf-8, ωστόσο αυτό έχει αλλάξει και δεν υπάρχει δικαιολογία μη-μετάβασης.

  1. Με την κωδικοποίηση iso-8859-7 μπορείς να απεικονίσεις μόνο
    αγγλικά και ελληνικά, με αποτέλεσμα τα διεθνή WebMail (Yahoo, Hotmail,
    κτλ) να μην είναι ποτέ σε θέση να δείξουν ελληνικά (διότι ως διεθνή δεν
    είναι σε θέση να θέσουν την ελληνική μόνο κωδικοποίηση, οπότε δεν
    θέτουν καμία!). Ίσως είναι καλύτερα κατανοητό ότι σε μια σελίδα HTML
    μπορεί να καθοριστεί μόνο μια (καθολική) κωδικοποίηση. Αυτή τη στιγμή
    το Yahoo Mail δεν καθορίζει κωδικοποίηση στις σελίδες με αποτέλεσμα να
    πρέπει να κάνετε εξωτερικές ρυθμίσεις για την εμφάνιση ελληνικών
    (View/Encoding/...). Προσπαθούν να το φτιάξουν αλλά φαίνεται ότι θα δυσκολευτούν λόγω της πολύ μεγάλης εγκατεστημένης βάσης. Η ίδια κατάσταση επικρατεί και στα Hotmail, GMX.net, κτλ. Σε
    αντίθεση, το "νέο" GMail χρησιμοποιεί utf-8 :) .
  2. Τα νέα πρότυπα που βασίζονται σε XML θεωρούν ως εξ ορισμού
    κωδικοποίηση το utf-8, εκτός και αν τους καθορίσεις iso-8859-7...
    Προτιμούν utf-8 για κάποιο λόγο.
  3. Όλες οι νέες διανομές υποστηρίζουν locales utf-8, όπως και τα
    γραφικά περιβάλλοντα GNOME (από 2.0+) και KDE (από ακόμα πιο παλιά).
  4. Είναι εύκολη η μετατροπή ενός αρχείου από iso-8859-7 σε utf-8 με την εντολή iconv. Απλά εκτελέστε iconv -f iso-8859-7 -t utf-8 < mygreek.txt > mygreekUTF.txt. Το ίδιο μπορεί να γίνει και σε ένα δικτυακό τόπο (iconv στα αρχεία και μετά προσθήκη <meta content="text/html; charset=UTF-8" http-equiv="content-type"> στην αρχή τους). Το ίδιο στο περιεχόμενο μιας βάσης (π.χ. CMS), αποτύπωση/dump της βάσης, iconv και τέλος εισαγωγή ξανά.

Αν ένα γράμμα φαίνεται σαν

  1. "...&#966;&#964;&#953;...". Γράφτηκε και στάλθηκε ως utf-8 χωρίς όμως το πρόγραμμα ηλεκτρονικής αλληλογραφίας να μπει στον κόπο να καθορίσει την κωδικοποίηση. Ο
    παραλήπτης (δηλ. το πρόγραμμά του) δεν είχε ιδέα πώς να το
    αποκωδικοποιήσει και εμφάνισε σε δεκαδική μορφή τους κωδικούς των
    ελληνικών χαρακτήρων. Με αναφορά στον πίνακα Unicode θα μπορούσε
    κάποιος να διαβάσει το γράμμα. Αν το πρόγραμμα του παραλήπτη έδειχνε το
    γράμμα σε "ωμή/raw" μορφή, θα μπορούσε κάποιος να αλλάξει την
    κωδικοποίηση σε utf-8 για να το δει.
  2. "..ΞΈΞ­ΞΌΞ±Ο�Ξ± Ξ±Ξ³...", (δηλαδή "πολλά Ξ"). Σημαίνει ότι το κείμενο είναι πραγματικά σε μορφή utf-8 αλλά το πρόγραμμα ηλεκ. αλληλογραφίας/φυλλομετρητής/κτλ το δείχνει ως iso-8859-7. Προσωρινή λύση: αλλάξτε την κωδικοποίηση στο πρόγραμμα σε utf-8.
  3. "...¼Î±Ï„α αγορά..", (δηλαδή "πολλά Ι και διαλυτικά/καπελάκι"). Όπως παραπάνω, αλλά το δείχνει ως iso-8859-1. Προσωρινή λύση: αλλάξτε την κωδικοποίηση στο πρόγραμμα σε utf-8.
  4. "...?????????....", (δηλαδή πολλά αγγλικά ερωτηματικά). Αυτή
    είναι η χειρότερη κατάσταση μιας και το κείμενο μετατράπηκε σε μορφή
    7-bit με αποτέλεσμα να χαθεί για πάντα σημαντική πληροφορία για την
    απεικόνισή του.

Τι να κάνουμε;

  1. Ρίχνουμε μια ματιά στο πρόγραμμα ηλεκ. αλληλογραφίας μας και το
    ρυθμίζουμε να στέλνει γράμματα με την κωδικοποίηση utf-8, ακόμα και αν
    ποτέ δεν γράψουμε ελληνικά (όταν απαντάμε σε γράμμα που έχει και
    ελληνικά, με αυτόν το τρόπο τα διατηρούμε. hint: lgu). Για παράδειγμα,
    στο Mozilla Thunderbird είναι
    Εργαλεία/Επιλογές/Γραμματοσειρές/Γλώσσες/Εισερχόμενα-Εξερχόμενα και
    επιλέγουμε "Unicode (UTF-8)" και στα δύο.
  2. Ρυθμίζουμε τον εξυπηρετητή Web μας να μην καθορίζει, σώνει και
    καλά, την κωδικοποίηση σε iso-8859-7 όταν επιστρέφει σελίδες στους
    πελάτες. Για την αποφυγή παρεξηγήσεων (σε άλλους νεώτερους δικτυακούς
    τόπους), δείτε στο http://nls.hellug.gr/. Η σελίδα δείχνει σωστά τα
    ελληνικά χωρίς να χρειαστεί να αλλάξουμε κωδικοποίηση (σε iso-8859-7).
    Πως το ξέρει; Διότι ο εξυπηρετητής Web χώνει την κωδικοποίηση όταν
    επιστρέφει την κεφαλίδα. Εκτελέστε telnet nls.hellug.gr 80, έπειτα γράψτε GET / HTTP/1.0, σε νέα γραμμή γράψτε Host: nls.hellug.gr
    και πατήστε Enter δύο φορές. Θα δείτε στη κεφαλίδα το άσχημο
    iso-8859-7... Τυπικό σφάλμα όταν γράφετε τη σελίδα σε utf-8, ρυθμίζετε
    την κωδικοποίηση μέσω HTML σε UTF-8 ενώ βλέπετε ακαταλαβίστικα με τα
    πολλά ΞΞΞ (δείτε παραπάνω).
  3. Στη μεταβλητή του Linux LANG βάζουμε el_GR.UTF-8 αντί του σκέτου el_GR. Το σκέτο σημαίνει el_GR.ISO-8859-7. Εξ ορισμού βάζει την κατάληξη .UTF-8, το κάνει για κάποιο λόγο.
  4. Σε κονσόλα (όχι xterm/konsole/gnome-terminal/...) μπορείτε να
    γράψετε/διαβάσετε ελληνικά ακολουθώντας τα του
    GreekWritingInLinux
    (http://www.ellak.gr/modules.php?op=modload&name=phpWiki&file=index&pagename=GreekWritingInLinux).
    (Αν δεν δουλέψει με την μία, βρείτε τη λύση και κάντε την γνωστή
    (νομίζω έχει αλλάξει κάτι μικρό στο πακέτο kbd από τότε)).
  5. Σε xterm/konsole/gnome-terminal ασφαλώς επιλέγουμε μια
    γραμματοσειρά fixed που περιλαμβάνει ελληνικά (η εξ ορισμού του
    X.org/XFree86 έχει).
  6. Στο Putty καθορίζουμε στις επιλογές Window/Translation την κωδικοποίηση UTF-8.
  7. Ξεκινάμε flame στη λίστα συνδρομητών μας για το πόσο καλό είναι το utf-8 και πόσο αναχρονιστικό το iso-8859-7.

Λοιπόν;

Switch to our mobile site