{"id":108,"date":"2026-03-27T07:36:50","date_gmt":"2026-03-27T07:36:50","guid":{"rendered":"https:\/\/www.charactercodes.net\/blog\/?p=108"},"modified":"2026-03-27T07:49:14","modified_gmt":"2026-03-27T07:49:14","slug":"how-unicode-actually-works","status":"publish","type":"post","link":"https:\/\/www.charactercodes.net\/blog\/how-unicode-actually-works\/","title":{"rendered":"How Unicode Actually Works"},"content":{"rendered":"<style>:root { --ink: #1a1410; --paper: #faf7f2; --cream: #f0ebe0; --accent: #c84b2f; --accent-soft: #f0e0db; --gold: #b8860b; --muted: #7a6f62; --border: #d8d0c4; --code-bg: #1a1410; --code-text: #e8e0d4; --highlight1: #ffe8a0; }\n.article-wrap { max-width: 1200px; margin: 0 auto; }\n.article-wrap .toc { background: var(--cream); border-left: 3px solid var(--accent); border-radius: 0 8px 8px 0; padding: 28px 32px; margin-bottom: 64px; }\n.article-wrap .toc-title { font-family: 'DM Mono', monospace; font-size: 11px; letter-spacing: 0.15em; text-transform: uppercase; color: var(--muted); margin-bottom: 16px; }\n.article-wrap .toc ol { list-style: none; counter-reset: toc; display: grid; grid-template-columns: 1fr 1fr; gap: 8px 24px; }\n@media (max-width: 600px) { .article-wrap .toc ol { grid-template-columns: 1fr; } }\n.article-wrap .toc li { counter-increment: toc; }\n.article-wrap .toc a { color: var(--ink); text-decoration: none; font-size: 0.9rem; display: flex; align-items: baseline; gap: 10px; transition: color 0.2s; }\n.article-wrap .toc a:hover { color: var(--accent); }\n.article-wrap .toc a::before { content: counter(toc, decimal-leading-zero); font-family: 'DM Mono', monospace; font-size: 10px; color: var(--accent); flex-shrink: 0; }\n.article-wrap .section { margin-bottom: 72px; }\n.article-wrap .section-label { font-family: 'DM Mono', monospace; font-size: 11px; letter-spacing: 0.18em; text-transform: uppercase; color: var(--accent); margin-bottom: 12px; display: flex; align-items: center; gap: 10px; }\n.article-wrap 
.section-label::after { content: ''; flex: 1; height: 1px; background: var(--border); }\n.article-wrap h2 { font-family: serif; font-size: clamp(1.6rem, 3.5vw, 2.2rem); font-weight: 700; line-height: 1.2; letter-spacing: -0.02em; margin-bottom: 20px; }\n.article-wrap h3 { font-family: serif; font-size: 1.3rem; font-weight: 300; font-style: italic; margin: 36px 0 12px; color: var(--ink); }\n.article-wrap p { margin-bottom: 20px; }\n.article-wrap strong { font-weight: 600; color: var(--ink); }\n.article-wrap code { font-family: 'DM Mono', monospace; font-size: 0.84em; background: var(--cream); border: 1px solid var(--border); padding: 2px 7px; border-radius: 4px; color: var(--accent); letter-spacing: 0; }\n.article-wrap .code-block { background: var(--code-bg); color: var(--code-text); border-radius: 10px; padding: 28px 32px; margin: 28px 0; font-family: 'DM Mono', monospace; font-size: 0.85rem; line-height: 1.8; overflow-x: auto; position: relative; }\n.article-wrap .code-block .cb-label { position: absolute; top: 12px; right: 16px; font-size: 10px; letter-spacing: 0.12em; text-transform: uppercase; color: #555; }\n.article-wrap .code-block .kw { color: #f08; }\n.article-wrap .code-block .cm { color: #666; font-style: italic; }\n.article-wrap .code-block .str { color: #9f9; }\n.article-wrap .code-block .num { color: #f90; }\n.article-wrap .code-block .op { color: #8cf; }\n.article-wrap .callout { border-radius: 10px; padding: 22px 28px; margin: 28px 0; display: flex; gap: 16px; align-items: flex-start; }\n.article-wrap .callout.info { background: #eef3fb; border-left: 3px solid #5080cc; }\n.article-wrap .callout.warn { background: #fdf4e3; border-left: 3px solid var(--gold); }\n.article-wrap .callout.key { background: var(--accent-soft); border-left: 3px solid var(--accent); }\n.article-wrap .callout-icon { font-size: 1.3rem; flex-shrink: 0; margin-top: 2px; }\n.article-wrap .callout p { margin: 0; font-size: 0.95rem; }\n.article-wrap .callout strong { display: 
block; margin-bottom: 4px; }\n.article-wrap .table-wrap { overflow-x: auto; margin: 28px 0; border-radius: 10px; border: 1px solid var(--border); }\n.article-wrap table { width: 100%; border-collapse: collapse; font-size: 0.9rem; }\n.article-wrap thead { background: var(--ink); color: var(--paper); }\n.article-wrap thead th { padding: 14px 18px; text-align: left; font-family: 'DM Mono', monospace; font-size: 11px; letter-spacing: 0.1em; text-transform: uppercase; font-weight: 400; }\n.article-wrap tbody tr:nth-child(even) { background: var(--cream); }\n.article-wrap tbody td { padding: 12px 18px; border-bottom: 1px solid var(--border); font-family: 'DM Mono', monospace; font-size: 0.84rem; vertical-align: top; }\n.article-wrap tbody td.label { font-family: 'Instrument Sans', sans-serif; font-size: 0.9rem; }\n.article-wrap tbody tr:last-child td { border-bottom: none; }\n.article-wrap .glyph-demo { background: var(--ink); color: var(--paper); border-radius: 12px; padding: 40px; margin: 28px 0; display: flex; align-items: center; gap: 40px; flex-wrap: wrap; }\n.article-wrap .big-glyph { font-family: 'Fraunces', serif; font-size: 5rem; line-height: 1; flex-shrink: 0; }\n.article-wrap .glyph-info { flex: 1; min-width: 200px; }\n.article-wrap .glyph-info dl { display: grid; grid-template-columns: auto 1fr; gap: 8px 20px; align-items: baseline; }\n.article-wrap .glyph-info dt { font-family: 'DM Mono', monospace; font-size: 10px; letter-spacing: 0.15em; text-transform: uppercase; color: #888; }\n.article-wrap .glyph-info dd { font-family: 'DM Mono', monospace; font-size: 0.85rem; color: #ccc; }\n.article-wrap .glyph-info dd.highlight1 { color: var(--accent); }\n.article-wrap .encoding-visual { margin: 32px 0; display: flex; flex-direction: column; gap: 4px; }\n.article-wrap .enc-row { display: flex; align-items: stretch; gap: 4px; }\n.article-wrap .enc-label { font-family: 'DM Mono', monospace; font-size: 11px; letter-spacing: 0.08em; width: 90px; flex-shrink: 0; display: 
flex; align-items: center; color: var(--muted); }\n.article-wrap .enc-bytes { display: flex; gap: 4px; flex-wrap: wrap; }\n.article-wrap .enc-byte { font-family: 'DM Mono', monospace; font-size: 0.78rem; padding: 8px 12px; border-radius: 6px; font-weight: 500; letter-spacing: 0.04em; }\n.article-wrap .byte-header { background: #c84b2f22; border: 1px solid var(--accent); color: var(--accent); }\n.article-wrap .byte-data { background: #1a7cc422; border: 1px solid #1a7cc4; color: #1a7cc4; }\n.article-wrap .byte-cont { background: #1d9e5622; border: 1px solid #1d9e56; color: #1d9e56; }\n.article-wrap .planes-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(130px, 1fr)); gap: 8px; margin: 28px 0; }\n.article-wrap .plane-card { border: 1px solid var(--border); border-radius: 8px; padding: 14px; transition: border-color 0.2s, background 0.2s; }\n.article-wrap .plane-card:hover { border-color: var(--accent); background: var(--accent-soft); }\n.article-wrap .plane-num { font-family: 'DM Mono', monospace; font-size: 10px; color: var(--accent); letter-spacing: 0.1em; margin-bottom: 6px; }\n.article-wrap .plane-name { font-weight: 600; font-size: 0.82rem; margin-bottom: 4px; }\n.article-wrap .plane-desc { font-size: 0.76rem; color: var(--muted); line-height: 1.4; }\n.article-wrap .pullquote { border-top: 2px solid var(--ink); border-bottom: 2px solid var(--ink); padding: 28px 0; margin: 48px 0; }\n.article-wrap .pullquote p { font-family: 'Fraunces', serif; font-size: clamp(1.2rem, 2.5vw, 1.6rem); font-style: italic; line-height: 1.4; color: var(--ink); margin: 0; }\n.article-wrap mark { background: var(--highlight1); padding: 1px 4px; border-radius: 3px; }\n@media (max-width: 600px) { .article-wrap .article-wrap { padding: 40px 20px 80px; } .article-wrap .hero { padding: 56px 24px 52px; } .article-wrap .site-header { padding: 12px 20px; } .article-wrap .glyph-demo { gap: 24px; padding: 28px; } }<\/style>\n<p><!-- ARTICLE --><br \/>\n<main 
class=\"article-wrap\"><\/p>\n<p>  <!-- TOC --><\/p>\n<nav class=\"toc\" aria-label=\"Table of contents\">\n<div class=\"toc-title\">In this article<\/div>\n<ol>\n<li><a href=\"#before-unicode\">Before Unicode: the chaos<\/a><\/li>\n<li><a href=\"#code-points\">What is a code point?<\/a><\/li>\n<li><a href=\"#blocks\">Blocks &amp; scripts<\/a><\/li>\n<li><a href=\"#planes\">The 17 planes<\/a><\/li>\n<li><a href=\"#encodings\">UTF-8, UTF-16, UTF-32<\/a><\/li>\n<li><a href=\"#surrogates\">Surrogates explained<\/a><\/li>\n<li><a href=\"#graphemes\">Code points &#x2260; characters<\/a><\/li>\n<li><a href=\"#tldr\">TL;DR cheat sheet<\/a><\/li>\n<\/ol>\n<\/nav>\n<p>  <!-- \u00a7 1 --><\/p>\n<section class=\"section\" id=\"before-unicode\">\n<div class=\"section-label\">Section 01<\/div>\n<h2>Before Unicode: Pure Chaos<\/h2>\n<p>Imagine you&rsquo;re writing a letter in 1989. You type on a French computer, your colleague opens the file on a Japanese machine, and half the letters turn into random symbols. This happened <em>all the time<\/em>.<\/p>\n<p>The root cause was simple: there was no agreement on which number represents which letter. <strong>ASCII<\/strong> covered 128 characters &#8211; enough for English, but it left the rest of the world out in the cold. Then came dozens of competing standards: <code>Latin-1<\/code>, <code>Shift-JIS<\/code>, <code>KOI8-R<\/code>, <code>Big5<\/code>&#8230; each one a different dialect of a language nobody could agree on.<\/p>\n<div class=\"callout warn\">\n<div class=\"callout-icon\">&#x26A0;&#xFE0F;<\/div>\n<p><strong>The Mojibake problem<\/strong> &ldquo;Mojibake&rdquo; (&#x6587;&#x5B57;&#x5316;&#x3051;) is Japanese for &ldquo;character transformation&rdquo; &mdash; it describes the garbled text you get when a file encoded in one system is decoded with another. 
If you&rsquo;ve ever opened a file and seen &ldquo;&#xEF;&#xBF;&#xBD;&#xEF;&#xBF;&#xBD;&rdquo; everywhere, you&rsquo;ve met mojibake.<\/p>\n<\/p><\/div>\n<p>In 1987, engineers at Apple and Xerox started designing a single universal standard. The <strong>Unicode Consortium<\/strong> was incorporated in 1991, and Unicode 1.0 shipped with 7,161 characters. Today, Unicode 17.0 defines over <strong>150,000 characters<\/strong> spanning 172 modern and historic scripts.<\/p>\n<\/section>\n<p>  <!-- \u00a7 2 --><\/p>\n<section class=\"section\" id=\"code-points\">\n<div class=\"section-label\">Section 02<\/div>\n<h2>What Is a Code Point?<\/h2>\n<p>Here&rsquo;s the core idea: Unicode is just a giant, agreed-upon <strong>lookup table<\/strong>. Every character in the universe gets a unique number. That number is called a <mark>code point<\/mark>.<\/p>\n<div class=\"glyph-demo\" aria-label=\"Anatomy of a code point\">\n<div class=\"big-glyph\">&#x1F600;<\/div>\n<div class=\"glyph-info\">\n<dl>\n<dt>Character<\/dt>\n<dd>GRINNING FACE<\/dd>\n<dt>Code Point<\/dt>\n<dd class=\"highlight1\">U+1F600<\/dd>\n<dt>Decimal<\/dt>\n<dd>128512<\/dd>\n<dt>Block<\/dt>\n<dd>Emoticons<\/dd>\n<dt>Plane<\/dt>\n<dd>Supplementary Multilingual (1)<\/dd>\n<dt>Category<\/dt>\n<dd>So (Other Symbol)<\/dd>\n<\/dl><\/div>\n<\/p><\/div>\n<p>The <code>U+<\/code> prefix is just notation &mdash; it means &ldquo;Unicode code point.&rdquo; The number that follows is in <strong>hexadecimal<\/strong> (base-16).
So <code>U+0041<\/code> is decimal 65, which is the letter <strong>A<\/strong>.<\/p>\n<div class=\"code-block\">\n      <span class=\"cb-label\">notation<\/span><br \/>\n<span class=\"cm\"># The letter &#8216;A&#8217;<\/span><br \/>\nU+0041  \u2190  hex 41  \u2190  decimal 65<\/p>\n<p><span class=\"cm\"># Greek lowercase pi (\u03c0)<\/span><br \/>\nU+03C0  \u2190  hex 3C0  \u2190  decimal 960<\/p>\n<p><span class=\"cm\"># The grinning face emoji \ud83d\ude00<\/span><br \/>\nU+1F600  \u2190  hex 1F600  \u2190  decimal 128512\n    <\/div>\n<p>Unicode can theoretically address up to <strong>1,114,112<\/strong> code points (written <code>U+0000<\/code> to <code>U+10FFFF<\/code>). Think of it as a post office that pre-assigned every possible address in a city, even ones that don&rsquo;t have buildings yet.<\/p>\n<div class=\"callout info\">\n<div class=\"callout-icon\">&#x1F4A1;<\/div>\n<p><strong>Code point &#x2260; byte<\/strong> This is the most important thing to keep in your head as you read on. A code point is just an abstract number &mdash; an ID. How that number actually gets stored on disk or sent over a network is a separate question, answered by <em>encodings<\/em> like UTF-8 (covered in Section 5).<\/p>\n<\/p><\/div>\n<\/section>\n<p>  <!-- \u00a7 3 --><\/p>\n<section class=\"section\" id=\"blocks\">\n<div class=\"section-label\">Section 03<\/div>\n<h2>Blocks &amp; Scripts<\/h2>\n<p>Code points aren&rsquo;t scattered randomly. Unicode organizes them into <mark>blocks<\/mark>: contiguous ranges of code points that belong to a related group. Think of a block as a zip code &mdash; it tells you roughly which neighborhood a character lives in.<\/p>\n<div class=\"table-wrap\">\n<table>\n<thead>\n<tr>\n<th>Block Name<\/th>\n<th>Range<\/th>\n<th>Sample<\/th>\n<th>Size<\/th>\n<\/tr>\n<\/thead>\n<tbody>\n<tr>\n<td class=\"label\">Basic Latin<\/td>\n<td>U+0000\u2013U+007F<\/td>\n<td>A B C 1 2 !
@<\/td>\n<td>128<\/td>\n<\/tr>\n<tr>\n<td class=\"label\">Latin-1 Supplement<\/td>\n<td>U+0080\u2013U+00FF<\/td>\n<td>&AElig; &Ntilde; &uuml; &ccedil;<\/td>\n<td>128<\/td>\n<\/tr>\n<tr>\n<td class=\"label\">Greek and Coptic<\/td>\n<td>U+0370\u2013U+03FF<\/td>\n<td>&#x03B1; &#x03B2; &#x03C0; &#x03A9;<\/td>\n<td>144<\/td>\n<\/tr>\n<tr>\n<td class=\"label\">Arabic<\/td>\n<td>U+0600\u2013U+06FF<\/td>\n<td>&#x0627; &#x0628; &#x062A; &#x0644;<\/td>\n<td>256<\/td>\n<\/tr>\n<tr>\n<td class=\"label\">CJK Unified Ideographs<\/td>\n<td>U+4E00\u2013U+9FFF<\/td>\n<td>&#x4E2D; &#x6587; &#x4E16; &#x754C;<\/td>\n<td>20,992<\/td>\n<\/tr>\n<tr>\n<td class=\"label\">Emoticons<\/td>\n<td>U+1F600\u2013U+1F64F<\/td>\n<td>&#x1F600; &#x1F604; &#x1F62D; &#x1F60D;<\/td>\n<td>80<\/td>\n<\/tr>\n<tr>\n<td class=\"label\">Musical Symbols<\/td>\n<td>U+1D100\u2013U+1D1FF<\/td>\n<td>&#x1D11E; &#x1D122; &#x1D12A;<\/td>\n<td>256<\/td>\n<\/tr>\n<\/tbody>\n<\/table><\/div>\n<p>A <strong>script<\/strong> is a slightly different concept &mdash; it&rsquo;s the writing system itself (Latin, Arabic, Hangul, Devanagari). One script can span multiple blocks, and one block can technically contain characters from multiple scripts, though Unicode tries hard to keep things tidy.<\/p>\n<div class=\"callout info\">\n<div class=\"callout-icon\">&#x1F4D6;<\/div>\n<p><strong>The CJK Unified Ideographs block is enormous<\/strong> Chinese, Japanese, and Korean share many characters that look identical or nearly identical. Rather than encoding them separately, Unicode &ldquo;unified&rdquo; them into one block.
This is sometimes controversial &mdash; purists argue they&rsquo;re distinct &mdash; but it saved tens of thousands of code points.<\/p>\n<\/p><\/div>\n<\/section>\n<p>  <!-- \u00a7 4 --><\/p>\n<section class=\"section\" id=\"planes\">\n<div class=\"section-label\">Section 04<\/div>\n<h2>The 17 Planes<\/h2>\n<p>Unicode&rsquo;s 1,114,112 code points are organized into <mark>17 planes<\/mark>, each containing 65,536 (= 2<sup>16<\/sup>) code points. Think of a plane as a floor in a very tall building, and each floor is a 256&times;256 grid of apartments.<\/p>\n<div class=\"planes-grid\">\n<div class=\"plane-card\">\n<div class=\"plane-num\">PLANE 0<\/div>\n<div class=\"plane-name\">BMP<\/div>\n<div class=\"plane-desc\">Basic Multilingual Plane. 99% of all text you&rsquo;ll ever encounter.<\/div>\n<\/p><\/div>\n<div class=\"plane-card\">\n<div class=\"plane-num\">PLANE 1<\/div>\n<div class=\"plane-name\">SMP<\/div>\n<div class=\"plane-desc\">Supplementary Multilingual. Historic scripts, math symbols, emoji.<\/div>\n<\/p><\/div>\n<div class=\"plane-card\">\n<div class=\"plane-num\">PLANE 2<\/div>\n<div class=\"plane-name\">SIP<\/div>\n<div class=\"plane-desc\">Supplementary Ideographic. Rare &amp; historic CJK ideographs.<\/div>\n<\/p><\/div>\n<div class=\"plane-card\">\n<div class=\"plane-num\">PLANE 3<\/div>\n<div class=\"plane-name\">TIP<\/div>\n<div class=\"plane-desc\">Tertiary Ideographic. Very rare CJK characters.<\/div>\n<\/p><\/div>\n<div class=\"plane-card\" style=\"border-style:dashed; opacity:0.5;\">\n<div class=\"plane-num\">PLANES 4\u201313<\/div>\n<div class=\"plane-name\">Unassigned<\/div>\n<div class=\"plane-desc\">Reserved for future use.<\/div>\n<\/p><\/div>\n<div class=\"plane-card\">\n<div class=\"plane-num\">PLANE 14<\/div>\n<div class=\"plane-name\">SSP<\/div>\n<div class=\"plane-desc\">Supplementary Special-purpose. 
Tag and selector characters.<\/div>\n<\/p><\/div>\n<div class=\"plane-card\">\n<div class=\"plane-num\">PLANES 15\u201316<\/div>\n<div class=\"plane-name\">Private Use<\/div>\n<div class=\"plane-desc\">Reserved for private agreements. Not standardized.<\/div>\n<\/p><\/div>\n<\/p><\/div>\n<div class=\"pullquote\">\n<p>&ldquo;The BMP is the ground floor of the Unicode building. Most of humanity lives here. The other 16 floors exist for historians, mathematicians, and emoji enthusiasts.&rdquo;<\/p>\n<\/p><\/div>\n<p>The <strong>Basic Multilingual Plane (BMP)<\/strong> spans <code>U+0000<\/code> to <code>U+FFFF<\/code>. It holds virtually every character used in modern writing: all Latin-script languages, Arabic, Hebrew, Devanagari, CJK basics, currency symbols, punctuation, and more.<\/p>\n<p>Anything above <code>U+FFFF<\/code> is called a <strong>supplementary character<\/strong>. Most emoji live in Plane 1. Ancient scripts like Linear B and Egyptian hieroglyphs also live there, along with specialized sets such as the Mathematical Alphanumeric Symbols (everyday math operators like &#x2211; and &#x221A; stay in the BMP).<\/p>\n<\/section>\n<p>  <!-- \u00a7 5 --><\/p>\n<section class=\"section\" id=\"encodings\">\n<div class=\"section-label\">Section 05<\/div>\n<h2>Encodings: UTF-8, UTF-16, UTF-32<\/h2>\n<p>A code point is an abstract idea. To actually store or transmit text, you need an <mark>encoding<\/mark> &mdash; a recipe that converts code point numbers into bytes. Unicode has three main encodings.<\/p>\n<h3>UTF-32: the simple one<\/h3>\n<p>UTF-32 uses exactly <strong>4 bytes per code point<\/strong>, always. Simple math, no tricks. The downside? English text is 4&times; larger than it needs to be, since every ASCII letter wastes 3 bytes of zeroes.<\/p>\n<h3>UTF-8: the clever one<\/h3>\n<p>UTF-8 is the encoding of the web. It uses <strong>1 to 4 bytes<\/strong> per code point, depending on the code point&rsquo;s value.
Crucially, the 128 ASCII characters take exactly 1 byte each &mdash; so a pure-ASCII file in UTF-8 is identical to an ASCII file.<\/p>\n<div class=\"encoding-visual\" aria-label=\"UTF-8 byte structure\">\n<div class=\"enc-row\">\n<div class=\"enc-label\">1 byte<\/div>\n<div class=\"enc-bytes\">\n          <span class=\"enc-byte byte-header\">0xxxxxxx<\/span>\n        <\/div>\n<\/p><\/div>\n<div class=\"enc-row\">\n<div class=\"enc-label\">2 bytes<\/div>\n<div class=\"enc-bytes\">\n          <span class=\"enc-byte byte-header\">110xxxxx<\/span><br \/>\n          <span class=\"enc-byte byte-cont\">10xxxxxx<\/span>\n        <\/div>\n<\/p><\/div>\n<div class=\"enc-row\">\n<div class=\"enc-label\">3 bytes<\/div>\n<div class=\"enc-bytes\">\n          <span class=\"enc-byte byte-header\">1110xxxx<\/span><br \/>\n          <span class=\"enc-byte byte-cont\">10xxxxxx<\/span><br \/>\n          <span class=\"enc-byte byte-cont\">10xxxxxx<\/span>\n        <\/div>\n<\/p><\/div>\n<div class=\"enc-row\">\n<div class=\"enc-label\">4 bytes<\/div>\n<div class=\"enc-bytes\">\n          <span class=\"enc-byte byte-header\">11110xxx<\/span><br \/>\n          <span class=\"enc-byte byte-cont\">10xxxxxx<\/span><br \/>\n          <span class=\"enc-byte byte-cont\">10xxxxxx<\/span><br \/>\n          <span class=\"enc-byte byte-cont\">10xxxxxx<\/span>\n        <\/div>\n<\/p><\/div>\n<\/p><\/div>\n<p>The leading bits of each byte act as a &ldquo;road sign&rdquo; telling a decoder: <em>how many bytes to read?<\/em> A byte starting with <code>0<\/code> is a single-byte character. A byte starting with <code>110<\/code> is the start of a 2-byte sequence. 
Bytes starting with <code>10<\/code> are continuation bytes &mdash; never the start of a character.<\/p>\n<div class=\"code-block\">\n      <span class=\"cb-label\">example<\/span><br \/>\n<span class=\"cm\"># &#8216;A&#8217; (U+0041) \u2192 1 byte in UTF-8<\/span><br \/>\n<span class=\"num\">01000001<\/span><br \/>\n \u2514\u2500 starts with 0: single byte<\/p>\n<p><span class=\"cm\"># &#8216;\u00a9&#8217; (U+00A9) \u2192 2 bytes in UTF-8<\/span><br \/>\n<span class=\"num\">11000010  10101001<\/span><br \/>\n \u2514\u2500 starts with 110: 2-byte sequence<br \/>\n                \u2514\u2500 starts with 10: continuation byte<\/p>\n<p><span class=\"cm\"># &#8216;\ud83d\ude00&#8217; (U+1F600) \u2192 4 bytes in UTF-8<\/span><br \/>\n<span class=\"num\">11110000  10011111  10011000  10000000<\/span><br \/>\n \u2514\u2500 starts with 11110: 4-byte sequence\n    <\/div>\n<h3>UTF-16: the legacy one<\/h3>\n<p>UTF-16 uses <strong>2 bytes<\/strong> for characters in the BMP and <strong>4 bytes<\/strong> for supplementary characters. It&rsquo;s used internally by JavaScript, Java, and Windows. 
Its complexity &mdash; especially the surrogate pair mechanism &mdash; is the price paid for keeping BMP characters at 2 bytes.<\/p>\n<div class=\"table-wrap\">\n<table>\n<thead>\n<tr>\n<th>Encoding<\/th>\n<th>Bytes per char<\/th>\n<th>ASCII-efficient?<\/th>\n<th>Used by<\/th>\n<\/tr>\n<\/thead>\n<tbody>\n<tr>\n<td class=\"label\">UTF-8<\/td>\n<td>1\u20134<\/td>\n<td>\u2705 Yes (1 byte)<\/td>\n<td>Web, Linux, macOS files<\/td>\n<\/tr>\n<tr>\n<td class=\"label\">UTF-16<\/td>\n<td>2 or 4<\/td>\n<td>\u274c No (2 bytes)<\/td>\n<td>JavaScript, Java, Windows APIs<\/td>\n<\/tr>\n<tr>\n<td class=\"label\">UTF-32<\/td>\n<td>4, always<\/td>\n<td>\u274c No (4 bytes)<\/td>\n<td>Internal processing, some databases<\/td>\n<\/tr>\n<\/tbody>\n<\/table><\/div>\n<\/section>\n<p>  <!-- \u00a7 6 --><\/p>\n<section class=\"section\" id=\"surrogates\">\n<div class=\"section-label\">Section 06<\/div>\n<h2>Surrogates: UTF-16&rsquo;s Clever Trick<\/h2>\n<p>Here&rsquo;s a puzzle: UTF-16 uses 2 bytes (16 bits) per character. That gives you 65,536 possible values. But Unicode has over a million code points. How do you squeeze a million into 65,536 slots?<\/p>\n<p>The answer is <mark>surrogate pairs<\/mark>. Unicode deliberately reserved two blocks in the BMP for this purpose:<\/p>\n<div class=\"table-wrap\">\n<table>\n<thead>\n<tr>\n<th>Block<\/th>\n<th>Range<\/th>\n<th>Role<\/th>\n<\/tr>\n<\/thead>\n<tbody>\n<tr>\n<td class=\"label\">High Surrogates<\/td>\n<td>U+D800\u2013U+DBFF<\/td>\n<td>First half of a surrogate pair<\/td>\n<\/tr>\n<tr>\n<td class=\"label\">Low Surrogates<\/td>\n<td>U+DC00\u2013U+DFFF<\/td>\n<td>Second half of a surrogate pair<\/td>\n<\/tr>\n<\/tbody>\n<\/table><\/div>\n<p>When UTF-16 needs to encode a supplementary character (above U+FFFF), it breaks it into two 16-bit values: a <strong>high surrogate<\/strong> followed by a <strong>low surrogate<\/strong>. Together, the pair encodes the original code point. 
Neither half on its own is a valid character &mdash; they only work as a team.<\/p>\n<div class=\"code-block\">\n      <span class=\"cb-label\">surrogate pair math<\/span><br \/>\n<span class=\"cm\"># Encode U+1F600 (\ud83d\ude00) in UTF-16<\/span><\/p>\n<p><span class=\"kw\">Step 1:<\/span> Subtract 0x10000 from the code point<br \/>\n  0x1F600 &#8211; 0x10000 = 0xF600<\/p>\n<p><span class=\"kw\">Step 2:<\/span> Express as 20-bit binary<br \/>\n  0xF600 = 0000 1111 0110 0000 0000<\/p>\n<p><span class=\"kw\">Step 3:<\/span> Split into two 10-bit halves<br \/>\n  High 10 bits: 00 0011 1101  = 0x03D<br \/>\n  Low  10 bits: 10 0000 0000  = 0x200<\/p>\n<p><span class=\"kw\">Step 4:<\/span> Add surrogate bases<br \/>\n  High surrogate: 0xD800 + 0x03D = <span class=\"str\">0xD83D<\/span><br \/>\n  Low  surrogate: 0xDC00 + 0x200 = <span class=\"str\">0xDE00<\/span><\/p>\n<p><span class=\"cm\"># Result: \ud83d\ude00 = D83D DE00 in UTF-16<\/span>\n    <\/div>\n<div class=\"callout key\">\n<div class=\"callout-icon\">&#x26A1;<\/div>\n<p><strong>Why do surrogates matter to you?<\/strong> JavaScript strings are UTF-16 under the hood. This means <code>emoji.length<\/code> often returns 2 for a single emoji &mdash; because it&rsquo;s a surrogate pair. The string <code>\"\ud83d\ude00\"<\/code> has a <code>length<\/code> of 2 in JavaScript, not 1. 
This surprises developers constantly.<\/p>\n<\/p><\/div>\n<div class=\"code-block\">\n      <span class=\"cb-label\">javascript gotcha<\/span><br \/>\n<span class=\"str\">&quot;\ud83d\ude00&quot;<\/span>.length          <span class=\"cm\">\/\/ \u2192 2  (surprise!)<\/span><br \/>\n[...<span class=\"str\">&quot;\ud83d\ude00&quot;<\/span>].length      <span class=\"cm\">\/\/ \u2192 1  (spread uses code points)<\/span><br \/>\n<span class=\"str\">&quot;\ud83d\ude00&quot;<\/span>.codePointAt(<span class=\"num\">0<\/span>) <span class=\"cm\">\/\/ \u2192 128512  (correct code point)<\/span><br \/>\n<span class=\"str\">&quot;\ud83d\ude00&quot;<\/span>.charCodeAt(<span class=\"num\">0<\/span>)  <span class=\"cm\">\/\/ \u2192 55357   (high surrogate only!)<\/span>\n    <\/div>\n<\/section>\n<p>  <!-- \u00a7 7 --><\/p>\n<section class=\"section\" id=\"graphemes\">\n<div class=\"section-label\">Section 07<\/div>\n<h2>Code Points \u2260 Characters (Grapheme Clusters)<\/h2>\n<p>You might think: one code point = one visible character. Close, but not always true. Some of what we <em>see<\/em> as a single character is actually composed of <strong>multiple code points working together<\/strong>.<\/p>\n<div class=\"glyph-demo\">\n<div class=\"big-glyph\">&#x1F469;&#x200D;&#x1F4BB;<\/div>\n<div class=\"glyph-info\">\n<dl>\n<dt>Visible<\/dt>\n<dd class=\"highlight1\">1 character<\/dd>\n<dt>Code Points<\/dt>\n<dd>3 code points<\/dd>\n<dt>UTF-8 bytes<\/dt>\n<dd>11 bytes<\/dd>\n<dt>Breakdown<\/dt>\n<dd>\ud83d\udc69 + ZWJ + \ud83d\udcbb<\/dd>\n<\/dl><\/div>\n<\/p><\/div>\n<p>The &ldquo;woman technologist&rdquo; emoji is actually three code points: two emoji joined by an invisible glue character called a <strong>Zero Width Joiner (ZWJ, U+200D)<\/strong>. The rendering engine sees the ZWJ and knows to merge the adjacent emoji into one image.<\/p>\n<p>These visual units are called <mark>grapheme clusters<\/mark>.
A grapheme cluster can contain:<\/p>\n<ul style=\"padding-left: 24px; margin-bottom: 20px; line-height: 2;\">\n<li>A base character + combining diacritics (e.g., <code>e&#x0301;<\/code> = \u00e9, two code points)<\/li>\n<li>An emoji + skin tone modifier (e.g., &#x1F44D;&#x1F3FD; = two code points)<\/li>\n<li>A sequence of emoji joined by ZWJ (family emoji can be 7+ code points)<\/li>\n<li>A flag emoji (two Regional Indicator letters, e.g., &#x1F1FA;&#x1F1F8; = US flag)<\/li>\n<\/ul>\n<div class=\"callout warn\">\n<div class=\"callout-icon\">&#x26A0;&#xFE0F;<\/div>\n<p><strong>String length is meaningless without context<\/strong> If you need to count &ldquo;characters&rdquo; that a user sees, you need a grapheme cluster algorithm (like <code>Intl.Segmenter<\/code> in modern JavaScript). Counting bytes, code units, or even raw code points will give you the wrong answer for complex emoji.<\/p>\n<\/p><\/div>\n<\/section>\n<p>  <!-- \u00a7 8 --><\/p>\n<section class=\"section\" id=\"tldr\">\n<div class=\"section-label\">Section 08<\/div>\n<h2>TL;DR Cheat Sheet<\/h2>\n<div class=\"table-wrap\">\n<table>\n<thead>\n<tr>\n<th>Term<\/th>\n<th>Plain English<\/th>\n<th>Example<\/th>\n<\/tr>\n<\/thead>\n<tbody>\n<tr>\n<td><code>Code Point<\/code><\/td>\n<td class=\"label\">A unique number assigned to every character<\/td>\n<td><code>U+1F600<\/code><\/td>\n<\/tr>\n<tr>\n<td><code>Block<\/code><\/td>\n<td class=\"label\">A named range of related code points<\/td>\n<td>Emoticons block<\/td>\n<\/tr>\n<tr>\n<td><code>Plane<\/code><\/td>\n<td class=\"label\">One of 17 groups of 65,536 code points each<\/td>\n<td>BMP = Plane 0<\/td>\n<\/tr>\n<tr>\n<td><code>BMP<\/code><\/td>\n<td class=\"label\">The first plane; holds almost all modern text<\/td>\n<td>U+0000 \u2013 U+FFFF<\/td>\n<\/tr>\n<tr>\n<td><code>Encoding<\/code><\/td>\n<td class=\"label\">Rules for turning code points into bytes<\/td>\n<td>UTF-8, UTF-16<\/td>\n<\/tr>\n<tr>\n<td><code>UTF-8<\/code><\/td>\n<td 
class=\"label\">Variable-width encoding; 1\u20134 bytes; default for web<\/td>\n<td>\ud83d\ude00 = F0 9F 98 80<\/td>\n<\/tr>\n<tr>\n<td><code>Surrogate Pair<\/code><\/td>\n<td class=\"label\">Two UTF-16 values that together encode one supplementary code point<\/td>\n<td>D83D + DE00 = \ud83d\ude00<\/td>\n<\/tr>\n<tr>\n<td><code>Grapheme Cluster<\/code><\/td>\n<td class=\"label\">One visible character as perceived by a human<\/td>\n<td>\ud83d\udc69\u200d\ud83d\udcbb = 3 code points<\/td>\n<\/tr>\n<tr>\n<td><code>ZWJ<\/code><\/td>\n<td class=\"label\">Zero Width Joiner &#8211; invisible glue between emoji<\/td>\n<td>U+200D<\/td>\n<\/tr>\n<\/tbody>\n<\/table><\/div>\n<div class=\"callout key\">\n<div class=\"callout-icon\">&#x1F3AF;<\/div>\n<p><strong>The one-sentence summary:<\/strong> Unicode gives every character a unique number (code point). Encodings like UTF-8 decide how to store those numbers as bytes. The BMP holds most everyday text; emoji and historic scripts live in supplementary planes. UTF-16 uses surrogate pairs to reach those higher planes. And what you see as one character might be several code points fused together.<\/p>\n<\/p><\/div>\n<\/section>\n<p><\/main><\/p>\n","protected":false},"excerpt":{"rendered":"<p>Before Unicode: Pure Chaos, Imagine you\u2019re writing a letter in 1989. You type on a French computer, your colleague opens the file on a Japanese machine, and half the letters turn into random symbols. This happened all the time. The root cause was simple: there was no agreement on which number represents which letter. ASCII covered 128 characters \u2014 enough for English, but it left the rest of the world out in the cold. 
<\/p>\n","protected":false},"author":1,"featured_media":84,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[2,3],"tags":[],"class_list":["post-108","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-characters","category-emojis","wpautop"],"yoast_head":"<!-- This site is optimized with the Yoast SEO plugin v27.2 - https:\/\/yoast.com\/product\/yoast-seo-wordpress\/ -->\n<title>How Unicode Actually Works - CharacterCodes Blog<\/title>\n<meta name=\"description\" content=\"A beginner-friendly guide to Unicode: what code points are, how UTF-8 and UTF-16 encoding works, what surrogate pairs do, and why emoji break string length.\" \/>\n<meta name=\"robots\" content=\"index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1\" \/>\n<link rel=\"canonical\" href=\"https:\/\/www.charactercodes.net\/blog\/how-unicode-actually-works\/\" \/>\n<meta property=\"og:locale\" content=\"en_US\" \/>\n<meta property=\"og:type\" content=\"article\" \/>\n<meta property=\"og:title\" content=\"How Unicode Actually Works - CharacterCodes Blog\" \/>\n<meta property=\"og:description\" content=\"A beginner-friendly guide to Unicode: what code points are, how UTF-8 and UTF-16 encoding works, what surrogate pairs do, and why emoji break string length.\" \/>\n<meta property=\"og:url\" content=\"https:\/\/www.charactercodes.net\/blog\/how-unicode-actually-works\/\" \/>\n<meta property=\"og:site_name\" content=\"CharacterCodes Blog\" \/>\n<meta property=\"article:published_time\" content=\"2026-03-27T07:36:50+00:00\" \/>\n<meta property=\"article:modified_time\" content=\"2026-03-27T07:49:14+00:00\" \/>\n<meta property=\"og:image\" content=\"https:\/\/www.charactercodes.net\/blog\/wp-content\/uploads\/2024\/09\/how-unicode-works.png\" \/>\n\t<meta property=\"og:image:width\" content=\"840\" \/>\n\t<meta property=\"og:image:height\" content=\"480\" 
\/>\n\t<meta property=\"og:image:type\" content=\"image\/png\" \/>\n<meta name=\"author\" content=\"admin\" \/>\n<meta name=\"twitter:card\" content=\"summary_large_image\" \/>\n<meta name=\"twitter:label1\" content=\"Written by\" \/>\n\t<meta name=\"twitter:data1\" content=\"admin\" \/>\n\t<meta name=\"twitter:label2\" content=\"Est. reading time\" \/>\n\t<meta name=\"twitter:data2\" content=\"9 minutes\" \/>\n<script type=\"application\/ld+json\" class=\"yoast-schema-graph\">{\"@context\":\"https:\/\/schema.org\",\"@graph\":[{\"@type\":\"Article\",\"@id\":\"https:\/\/www.charactercodes.net\/blog\/how-unicode-actually-works\/#article\",\"isPartOf\":{\"@id\":\"https:\/\/www.charactercodes.net\/blog\/how-unicode-actually-works\/\"},\"author\":{\"name\":\"admin\",\"@id\":\"https:\/\/www.charactercodes.net\/blog\/#\/schema\/person\/756aa07230428706f227ac4e178a7977\"},\"headline\":\"How Unicode Actually Works\",\"datePublished\":\"2026-03-27T07:36:50+00:00\",\"dateModified\":\"2026-03-27T07:49:14+00:00\",\"mainEntityOfPage\":{\"@id\":\"https:\/\/www.charactercodes.net\/blog\/how-unicode-actually-works\/\"},\"wordCount\":1767,\"commentCount\":0,\"image\":{\"@id\":\"https:\/\/www.charactercodes.net\/blog\/how-unicode-actually-works\/#primaryimage\"},\"thumbnailUrl\":\"https:\/\/www.charactercodes.net\/blog\/wp-content\/uploads\/2024\/09\/how-unicode-works.png\",\"articleSection\":[\"Characters\",\"Emojis\"],\"inLanguage\":\"en-US\",\"potentialAction\":[{\"@type\":\"CommentAction\",\"name\":\"Comment\",\"target\":[\"https:\/\/www.charactercodes.net\/blog\/how-unicode-actually-works\/#respond\"]}]},{\"@type\":\"WebPage\",\"@id\":\"https:\/\/www.charactercodes.net\/blog\/how-unicode-actually-works\/\",\"url\":\"https:\/\/www.charactercodes.net\/blog\/how-unicode-actually-works\/\",\"name\":\"How Unicode Actually Works - CharacterCodes 
Blog\",\"isPartOf\":{\"@id\":\"https:\/\/www.charactercodes.net\/blog\/#website\"},\"primaryImageOfPage\":{\"@id\":\"https:\/\/www.charactercodes.net\/blog\/how-unicode-actually-works\/#primaryimage\"},\"image\":{\"@id\":\"https:\/\/www.charactercodes.net\/blog\/how-unicode-actually-works\/#primaryimage\"},\"thumbnailUrl\":\"https:\/\/www.charactercodes.net\/blog\/wp-content\/uploads\/2024\/09\/how-unicode-works.png\",\"datePublished\":\"2026-03-27T07:36:50+00:00\",\"dateModified\":\"2026-03-27T07:49:14+00:00\",\"author\":{\"@id\":\"https:\/\/www.charactercodes.net\/blog\/#\/schema\/person\/756aa07230428706f227ac4e178a7977\"},\"description\":\"A beginner-friendly guide to Unicode: what code points are, how UTF-8 and UTF-16 encoding works, what surrogate pairs do, and why emoji break string length.\",\"breadcrumb\":{\"@id\":\"https:\/\/www.charactercodes.net\/blog\/how-unicode-actually-works\/#breadcrumb\"},\"inLanguage\":\"en-US\",\"potentialAction\":[{\"@type\":\"ReadAction\",\"target\":[\"https:\/\/www.charactercodes.net\/blog\/how-unicode-actually-works\/\"]}]},{\"@type\":\"ImageObject\",\"inLanguage\":\"en-US\",\"@id\":\"https:\/\/www.charactercodes.net\/blog\/how-unicode-actually-works\/#primaryimage\",\"url\":\"https:\/\/www.charactercodes.net\/blog\/wp-content\/uploads\/2024\/09\/how-unicode-works.png\",\"contentUrl\":\"https:\/\/www.charactercodes.net\/blog\/wp-content\/uploads\/2024\/09\/how-unicode-works.png\",\"width\":840,\"height\":480,\"caption\":\"how unicode works\"},{\"@type\":\"BreadcrumbList\",\"@id\":\"https:\/\/www.charactercodes.net\/blog\/how-unicode-actually-works\/#breadcrumb\",\"itemListElement\":[{\"@type\":\"ListItem\",\"position\":1,\"name\":\"Home\",\"item\":\"https:\/\/www.charactercodes.net\/blog\/\"},{\"@type\":\"ListItem\",\"position\":2,\"name\":\"How Unicode Actually 
Works\"}]},{\"@type\":\"WebSite\",\"@id\":\"https:\/\/www.charactercodes.net\/blog\/#website\",\"url\":\"https:\/\/www.charactercodes.net\/blog\/\",\"name\":\"CharacterCodes Blog\",\"description\":\"Characters &amp; Emojis\",\"potentialAction\":[{\"@type\":\"SearchAction\",\"target\":{\"@type\":\"EntryPoint\",\"urlTemplate\":\"https:\/\/www.charactercodes.net\/blog\/?s={search_term_string}\"},\"query-input\":{\"@type\":\"PropertyValueSpecification\",\"valueRequired\":true,\"valueName\":\"search_term_string\"}}],\"inLanguage\":\"en-US\"},{\"@type\":\"Person\",\"@id\":\"https:\/\/www.charactercodes.net\/blog\/#\/schema\/person\/756aa07230428706f227ac4e178a7977\",\"name\":\"admin\",\"image\":{\"@type\":\"ImageObject\",\"inLanguage\":\"en-US\",\"@id\":\"https:\/\/secure.gravatar.com\/avatar\/e188a164246c600f0cec7b14aec001469d0529676269f7d0fc364bfe2da4d7c2?s=96&d=mm&r=g\",\"url\":\"https:\/\/secure.gravatar.com\/avatar\/e188a164246c600f0cec7b14aec001469d0529676269f7d0fc364bfe2da4d7c2?s=96&d=mm&r=g\",\"contentUrl\":\"https:\/\/secure.gravatar.com\/avatar\/e188a164246c600f0cec7b14aec001469d0529676269f7d0fc364bfe2da4d7c2?s=96&d=mm&r=g\",\"caption\":\"admin\"},\"sameAs\":[\"https:\/\/charactercodes.net\/blog1\"],\"url\":\"https:\/\/www.charactercodes.net\/blog\/author\/rspronk\/\"}]}<\/script>\n<!-- \/ Yoast SEO plugin. 
-->","yoast_head_json":{"title":"How Unicode Actually Works - CharacterCodes Blog","description":"A beginner-friendly guide to Unicode: what code points are, how UTF-8 and UTF-16 encoding works, what surrogate pairs do, and why emoji break string length.","robots":{"index":"index","follow":"follow","max-snippet":"max-snippet:-1","max-image-preview":"max-image-preview:large","max-video-preview":"max-video-preview:-1"},"canonical":"https:\/\/www.charactercodes.net\/blog\/how-unicode-actually-works\/","og_locale":"en_US","og_type":"article","og_title":"How Unicode Actually Works - CharacterCodes Blog","og_description":"A beginner-friendly guide to Unicode: what code points are, how UTF-8 and UTF-16 encoding works, what surrogate pairs do, and why emoji break string length.","og_url":"https:\/\/www.charactercodes.net\/blog\/how-unicode-actually-works\/","og_site_name":"CharacterCodes Blog","article_published_time":"2026-03-27T07:36:50+00:00","article_modified_time":"2026-03-27T07:49:14+00:00","og_image":[{"width":840,"height":480,"url":"https:\/\/www.charactercodes.net\/blog\/wp-content\/uploads\/2024\/09\/how-unicode-works.png","type":"image\/png"}],"author":"admin","twitter_card":"summary_large_image","twitter_misc":{"Written by":"admin","Est. 
reading time":"9 minutes"},"schema":{"@context":"https:\/\/schema.org","@graph":[{"@type":"Article","@id":"https:\/\/www.charactercodes.net\/blog\/how-unicode-actually-works\/#article","isPartOf":{"@id":"https:\/\/www.charactercodes.net\/blog\/how-unicode-actually-works\/"},"author":{"name":"admin","@id":"https:\/\/www.charactercodes.net\/blog\/#\/schema\/person\/756aa07230428706f227ac4e178a7977"},"headline":"How Unicode Actually Works","datePublished":"2026-03-27T07:36:50+00:00","dateModified":"2026-03-27T07:49:14+00:00","mainEntityOfPage":{"@id":"https:\/\/www.charactercodes.net\/blog\/how-unicode-actually-works\/"},"wordCount":1767,"commentCount":0,"image":{"@id":"https:\/\/www.charactercodes.net\/blog\/how-unicode-actually-works\/#primaryimage"},"thumbnailUrl":"https:\/\/www.charactercodes.net\/blog\/wp-content\/uploads\/2024\/09\/how-unicode-works.png","articleSection":["Characters","Emojis"],"inLanguage":"en-US","potentialAction":[{"@type":"CommentAction","name":"Comment","target":["https:\/\/www.charactercodes.net\/blog\/how-unicode-actually-works\/#respond"]}]},{"@type":"WebPage","@id":"https:\/\/www.charactercodes.net\/blog\/how-unicode-actually-works\/","url":"https:\/\/www.charactercodes.net\/blog\/how-unicode-actually-works\/","name":"How Unicode Actually Works - CharacterCodes Blog","isPartOf":{"@id":"https:\/\/www.charactercodes.net\/blog\/#website"},"primaryImageOfPage":{"@id":"https:\/\/www.charactercodes.net\/blog\/how-unicode-actually-works\/#primaryimage"},"image":{"@id":"https:\/\/www.charactercodes.net\/blog\/how-unicode-actually-works\/#primaryimage"},"thumbnailUrl":"https:\/\/www.charactercodes.net\/blog\/wp-content\/uploads\/2024\/09\/how-unicode-works.png","datePublished":"2026-03-27T07:36:50+00:00","dateModified":"2026-03-27T07:49:14+00:00","author":{"@id":"https:\/\/www.charactercodes.net\/blog\/#\/schema\/person\/756aa07230428706f227ac4e178a7977"},"description":"A beginner-friendly guide to Unicode: what code points are, how UTF-8 and 
UTF-16 encoding works, what surrogate pairs do, and why emoji break string length.","breadcrumb":{"@id":"https:\/\/www.charactercodes.net\/blog\/how-unicode-actually-works\/#breadcrumb"},"inLanguage":"en-US","potentialAction":[{"@type":"ReadAction","target":["https:\/\/www.charactercodes.net\/blog\/how-unicode-actually-works\/"]}]},{"@type":"ImageObject","inLanguage":"en-US","@id":"https:\/\/www.charactercodes.net\/blog\/how-unicode-actually-works\/#primaryimage","url":"https:\/\/www.charactercodes.net\/blog\/wp-content\/uploads\/2024\/09\/how-unicode-works.png","contentUrl":"https:\/\/www.charactercodes.net\/blog\/wp-content\/uploads\/2024\/09\/how-unicode-works.png","width":840,"height":480,"caption":"how unicode works"},{"@type":"BreadcrumbList","@id":"https:\/\/www.charactercodes.net\/blog\/how-unicode-actually-works\/#breadcrumb","itemListElement":[{"@type":"ListItem","position":1,"name":"Home","item":"https:\/\/www.charactercodes.net\/blog\/"},{"@type":"ListItem","position":2,"name":"How Unicode Actually Works"}]},{"@type":"WebSite","@id":"https:\/\/www.charactercodes.net\/blog\/#website","url":"https:\/\/www.charactercodes.net\/blog\/","name":"CharacterCodes Blog","description":"Characters &amp; 
Emojis","potentialAction":[{"@type":"SearchAction","target":{"@type":"EntryPoint","urlTemplate":"https:\/\/www.charactercodes.net\/blog\/?s={search_term_string}"},"query-input":{"@type":"PropertyValueSpecification","valueRequired":true,"valueName":"search_term_string"}}],"inLanguage":"en-US"},{"@type":"Person","@id":"https:\/\/www.charactercodes.net\/blog\/#\/schema\/person\/756aa07230428706f227ac4e178a7977","name":"admin","image":{"@type":"ImageObject","inLanguage":"en-US","@id":"https:\/\/secure.gravatar.com\/avatar\/e188a164246c600f0cec7b14aec001469d0529676269f7d0fc364bfe2da4d7c2?s=96&d=mm&r=g","url":"https:\/\/secure.gravatar.com\/avatar\/e188a164246c600f0cec7b14aec001469d0529676269f7d0fc364bfe2da4d7c2?s=96&d=mm&r=g","contentUrl":"https:\/\/secure.gravatar.com\/avatar\/e188a164246c600f0cec7b14aec001469d0529676269f7d0fc364bfe2da4d7c2?s=96&d=mm&r=g","caption":"admin"},"sameAs":["https:\/\/charactercodes.net\/blog1"],"url":"https:\/\/www.charactercodes.net\/blog\/author\/rspronk\/"}]}},"_links":{"self":[{"href":"https:\/\/www.charactercodes.net\/blog\/wp-json\/wp\/v2\/posts\/108","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.charactercodes.net\/blog\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.charactercodes.net\/blog\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.charactercodes.net\/blog\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.charactercodes.net\/blog\/wp-json\/wp\/v2\/comments?post=108"}],"version-history":[{"count":2,"href":"https:\/\/www.charactercodes.net\/blog\/wp-json\/wp\/v2\/posts\/108\/revisions"}],"predecessor-version":[{"id":110,"href":"https:\/\/www.charactercodes.net\/blog\/wp-json\/wp\/v2\/posts\/108\/revisions\/110"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/www.charactercodes.net\/blog\/wp-json\/wp\/v2\/media\/84"}],"wp:attachment":[{"href":"https:\/\/www.charactercodes.net\/blog\/wp-json\/wp\/v2\/media?parent=108"}]
,"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.charactercodes.net\/blog\/wp-json\/wp\/v2\/categories?post=108"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.charactercodes.net\/blog\/wp-json\/wp\/v2\/tags?post=108"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}