$node = stdClass Object (
[nid] => [807]
[type] => [blog]
[language] => []
[uid] => [4]
[status] => [1]
[created] => [1263231155]
[changed] => [1263231155]
[comment] => [2]
[promote] => [1]
[moderate] => [0]
[sticky] => [0]
[tnid] => [0]
[translate] => [0]
[vid] => [813]
[revision_uid] => [4]
[title] => [Unicode, UTF-8, and All That: A Short Note]
[body] => [<p>I don't know how many times I've heard people conflate UTF-8 with Unicode. I have come to believe that distinguishing these is the first essential step in becoming "internationalization-literate".</p>
<p>Perhaps the problem is that representing a character according to the Unicode standard involves two levels of indirection (or more--see <a href="http://unicode.org/reports/tr17/" title="http://unicode.org/reports/tr17/">http://unicode.org/reports/tr17/</a> for all the nitty-gritty details). The first is the assignment of a number to every character of each of the world's languages. To quote from the Unicode Consortium Web site: "Unicode provides a unique number for every character, no matter what the platform, no matter what the program, no matter what the language." Since there are more than 256 such characters, this isn't the end of the story. The second stage is the representation of this number as a byte sequence. This is the "encoding" phase. The Unicode standard lays out several encoding methods--UTF-8 is just one of many. (What distinguishes UTF-8 is that all ASCII characters require just one byte. Others, such as UCS-2, require multiple bytes for <em>every</em> character, and thus waste a lot of space when used for predominantly ASCII text.)</p>
]
[log] => []
[revision_timestamp] => [1263231155]
[format] => [2]
[name] => [Scott]
[picture] => [sites/default/files/users/avatars/picture-4.png]
[data] => [a:15:{s:13:"form_build_id";s:37:"form-dbd1fc4e7f874707ca71243bb47c1c6e";s:17:"fckeditor_default";s:1:"f";s:21:"fckeditor_show_toggle";s:1:"t";s:15:"fckeditor_popup";s:1:"f";s:14:"fckeditor_skin";s:7:"default";s:17:"fckeditor_toolbar";s:10:"DrupalFull";s:16:"fckeditor_expand";s:1:"t";s:15:"fckeditor_width";s:4:"100%";s:14:"fckeditor_lang";s:2:"en";s:19:"fckeditor_auto_lang";s:1:"t";s:7:"genpass";s:21:"Generate new password";s:28:"fckeditor_show_fieldnamehint";s:1:"t";s:14:"picture_delete";s:0:"";s:14:"picture_upload";s:0:"";s:7:"contact";i:0;}]
[path] => [blog/unicode-utf-8-and-all-short-note]
[tags] => []
[_workflow] => []
[simple_access_owner] => array (
[sa_view] => [0]
[sa_update] => [0]
[sa_delete] => [0]
)
[simple_access] => array (
)
[last_comment_timestamp] => [1263231155]
[last_comment_name] => []
[comment_count] => [0]
[taxonomy] => array (
)
[files] => array (
)
[build_mode] => [0]
[readmore] => [1]
[content] => array (
[body] => array (
[#weight] => [-4]
[#value] => [<p>I don't know how many times I've heard people conflate UTF-8 with Unicode. I have come to believe that distinguishing these is the first essential step in becoming "internationalization-literate".</p>
<p>Perhaps the problem is that representing a character according to the Unicode standard involves two levels of indirection (or more--see <a href="http://unicode.org/reports/tr17/" title="http://unicode.org/reports/tr17/">http://unicode.org/reports/tr17/</a> for all the nitty-gritty details). The first is the assignment of a number to every character of each of the world's languages. To quote from the Unicode Consortium Web site: "Unicode provides a unique number for every character, no matter what the platform, no matter what the program, no matter what the language." Since there are more than 256 such characters, this isn't the end of the story. The second stage is the representation of this number as a byte sequence. This is the "encoding" phase. The Unicode standard lays out several encoding methods--UTF-8 is just one of many. (What distinguishes UTF-8 is that all ASCII characters require just one byte. Others, such as UCS-2, require multiple bytes for <em>every</em> character, and thus waste a lot of space when used for predominantly ASCII text.)</p>
]
[#title] => []
[#description] => []
[#printed] => [1]
)
[#content_extra_fields] => array (
[title] => array (
[label] => [Title]
[description] => [Node module form.]
[weight] => [-5]
)
[body_field] => array (
[label] => [Body]
[description] => [Node module form.]
[weight] => [-4]
[view] => [body]
)
[revision_information] => array (
[label] => [Revision information]
[description] => [Node module form.]
[weight] => [-2]
)
[comment_settings] => array (
[label] => [Comment settings]
[description] => [Comment module form.]
[weight] => [5]
)
[menu] => array (
[label] => [Menu settings]
[description] => [Menu module form.]
[weight] => [-3]
)
[taxonomy] => array (
[label] => [Taxonomy]
[description] => [Taxonomy module form.]
[weight] => [4]
)
[book] => array (
[label] => [Book]
[description] => [Book module form.]
[weight] => [1]
)
[path] => array (
[label] => [Path settings]
[description] => [Path module form.]
[weight] => [0]
)
[attachments] => array (
[label] => [File attachments]
[description] => [Upload module form.]
[weight] => [-1]
[view] => [files]
)
[flag] => array (
[label] => [Flags]
[description] => [Flags fieldset.]
[weight] => [0]
)
[workflow] => array (
[label] => [Workflow]
[description] => [Workflow module form]
[weight] => [3]
)
[simple_access] => array (
[label] => [Simple Access]
[description] => [Simple Access module form.]
[weight] => [2]
)
)
[#pre_render] => array (
[0] => [content_alter_extra_weights]
)
[#title] => []
[#description] => []
[#children] => [<p>I don't know how many times I've heard people conflate UTF-8 with Unicode. I have come to believe that distinguishing these is the first essential step in becoming "internationalization-literate".</p>
<p>Perhaps the problem is that representing a character according to the Unicode standard involves two levels of indirection (or more--see <a href="http://unicode.org/reports/tr17/" title="http://unicode.org/reports/tr17/">http://unicode.org/reports/tr17/</a> for all the nitty-gritty details). The first is the assignment of a number to every character of each of the world's languages. To quote from the Unicode Consortium Web site: "Unicode provides a unique number for every character, no matter what the platform, no matter what the program, no matter what the language." Since there are more than 256 such characters, this isn't the end of the story. The second stage is the representation of this number as a byte sequence. This is the "encoding" phase. The Unicode standard lays out several encoding methods--UTF-8 is just one of many. (What distinguishes UTF-8 is that all ASCII characters require just one byte. Others, such as UCS-2, require multiple bytes for <em>every</em> character, and thus waste a lot of space when used for predominantly ASCII text.)</p>
]
[#printed] => [1]
)
[links] => array (
[blog_usernames_blog] => array (
[title] => [Scott's blog]
[href] => [blog/4]
[attributes] => array (
[title] => [Read Scott's latest blog entries.]
)
)
[comment_add] => array (
[title] => [Add new comment]
[href] => [comment/reply/807]
[attributes] => array (
[title] => [Share your thoughts and opinions related to this posting.]
)
[fragment] => [comment-form]
)
)
);