experimental

2025-12-18 18:25:28 +01:00
parent 05e94ac655
commit d47ee0d1ef
9 changed files with 5025 additions and 1982 deletions
@@ -42,6 +42,16 @@ ttf-parser = "0.25"
 fontconfig = "0.10"
 fontconfig-sys = { package = "yeslogic-fontconfig-sys", version = "6.0" }

+# Color emoji support (FreeType + Cairo for color font rendering)
+freetype-rs = "0.38"
+cairo-rs = { version = "0.21", features = ["freetype"] }
+
+# Emoji detection (Unicode emoji database for O(1) lookup)
+emojis = "0.8"
+
+# Unicode character width (UAX#11 East Asian Width)
+unicode-width = "0.2"
+
 # Configuration
 serde = { version = "1", features = ["derive"] }
 serde_json = "1"
@@ -102,8 +102,20 @@ fn fs_main(in: VertexOutput) -> @location(0) vec4<f32> {
        return in.bg_color;
    }
    
-    // Sample the glyph alpha from the atlas
-    let glyph_alpha = textureSample(atlas_texture, atlas_sampler, in.uv).r;
+    // Sample from RGBA atlas
+    let glyph_sample = textureSample(atlas_texture, atlas_sampler, in.uv);
+    
+    // Detect color glyphs: regular glyphs are stored as white (1,1,1) with alpha
+    // Color glyphs have actual RGB colors. Check if any RGB channel is not white.
+    let is_color_glyph = glyph_sample.r < 0.99 || glyph_sample.g < 0.99 || glyph_sample.b < 0.99;
+    
+    if is_color_glyph {
+        // Color glyph (emoji) - use atlas color directly
+        return glyph_sample;
+    }
+    
+    // Regular glyph - use alpha with foreground color
+    let glyph_alpha = glyph_sample.a;
    
    // Apply legacy gamma-incorrect blending for crisp text
    let adjusted_alpha = foreground_contrast_legacy(in.color.rgb, glyph_alpha, in.bg_color.rgb);
@@ -133,15 +145,22 @@ struct GridParams {
    // Screen dimensions in pixels
    screen_width: f32,
    screen_height: f32,
-    // Y offset for tab bar
+    // X offset for pane position
+    x_offset: f32,
+    // Y offset for tab bar + pane position
    y_offset: f32,
    // Cursor position (-1 if hidden)
    cursor_col: i32,
    cursor_row: i32,
    // Cursor style: 0=block, 1=underline, 2=bar
    cursor_style: u32,
-    // Padding
-    _padding: vec2<u32>,
+    // Background opacity for transparency (0.0 = transparent, 1.0 = opaque)
+    background_opacity: f32,
+    // Selection range (-1 values mean no selection)
+    selection_start_col: i32,
+    selection_start_row: i32,
+    selection_end_col: i32,
+    selection_end_row: i32,
 }

 // GPUCell instance data (matches Rust GPUCell struct)
@@ -154,12 +173,14 @@ struct GPUCell {
 }

 // Sprite info for glyph positioning
+// In Kitty's model, sprites are always cell-sized and glyphs are pre-positioned
+// within the sprite at the correct baseline. No offset math needed.
 struct SpriteInfo {
    // UV coordinates in atlas (x, y, width, height) - normalized 0-1
    uv: vec4<f32>,
-    // Offset from cell origin (x, y) in pixels
-    offset: vec2<f32>,
-    // Size in pixels
+    // Padding (previously offset, now unused)
+    _padding: vec2<f32>,
+    // Size in pixels (width, height) - always matches cell dimensions
    size: vec2<f32>,
 }

@@ -188,10 +209,16 @@ const ATTR_ITALIC_BIT: u32 = 0x10u;
 const ATTR_REVERSE_BIT: u32 = 0x20u;
 const ATTR_STRIKE_BIT: u32 = 0x40u;
 const ATTR_DIM_BIT: u32 = 0x80u;
+const ATTR_SELECTED_BIT: u32 = 0x100u;

 // Colored glyph flag
 const COLORED_GLYPH_FLAG: u32 = 0x80000000u;

+// Cursor shape constants
+const CURSOR_BLOCK: u32 = 0u;
+const CURSOR_UNDERLINE: u32 = 1u;
+const CURSOR_BAR: u32 = 2u;
+
 // Vertex output for instanced cell rendering
 struct CellVertexOutput {
    @builtin(position) clip_position: vec4<f32>,
@@ -200,14 +227,20 @@ struct CellVertexOutput {
    @location(2) bg_color: vec4<f32>,
    @location(3) @interpolate(flat) is_background: u32,
    @location(4) @interpolate(flat) is_colored_glyph: u32,
+    @location(5) @interpolate(flat) is_cursor: u32,
+    @location(6) @interpolate(flat) cursor_shape: u32,
+    @location(7) cursor_color: vec4<f32>,
+    @location(8) cell_pos: vec2<f32>,      // Cell top-left position in pixels
+    @location(9) @interpolate(flat) cell_size: vec2<f32>,  // Cell width/height in pixels
 }

-// Resolve a packed color to RGBA
+// Resolve a packed color to RGBA (in linear space for GPU rendering)
 fn resolve_color(packed: u32, is_foreground: bool) -> vec4<f32> {
    let color_type = packed & 0xFFu;
    
    if color_type == COLOR_TYPE_DEFAULT {
        // Default color - use color table entry 256 (fg) or 257 (bg)
+        // Color table is already in linear space
        if is_foreground {
            return color_table.colors[256];
        } else {
@@ -215,14 +248,15 @@ fn resolve_color(packed: u32, is_foreground: bool) -> vec4<f32> {
        }
    } else if color_type == COLOR_TYPE_INDEXED {
        // Indexed color - look up in color table
+        // Color table is already in linear space
        let index = (packed >> 8u) & 0xFFu;
        return color_table.colors[index];
    } else {
-        // RGB color - extract components
+        // RGB color - extract components and convert sRGB to linear
        let r = f32((packed >> 8u) & 0xFFu) / 255.0;
        let g = f32((packed >> 16u) & 0xFFu) / 255.0;
        let b = f32((packed >> 24u) & 0xFFu) / 255.0;
-        return vec4<f32>(r, g, b, 1.0);
+        return vec4<f32>(srgb_to_linear(r), srgb_to_linear(g), srgb_to_linear(b), 1.0);
    }
 }

@@ -265,15 +299,16 @@ fn vs_cell_bg(
    let cell = cells[instance_index];
    
    // Calculate cell pixel position
-    let cell_x = f32(col) * grid_params.cell_width;
+    let cell_x = grid_params.x_offset + f32(col) * grid_params.cell_width;
    let cell_y = grid_params.y_offset + f32(row) * grid_params.cell_height;
    
-    // Quad vertex positions (0=top-left, 1=top-right, 2=bottom-right, 3=bottom-left)
+    // Quad vertex positions for TriangleStrip (0=top-left, 1=top-right, 2=bottom-left, 3=bottom-right)
+    // TriangleStrip produces triangles: (0,1,2) and (1,2,3)
    var positions: array<vec2<f32>, 4>;
-    positions[0] = vec2<f32>(cell_x, cell_y);
-    positions[1] = vec2<f32>(cell_x + grid_params.cell_width, cell_y);
-    positions[2] = vec2<f32>(cell_x + grid_params.cell_width, cell_y + grid_params.cell_height);
-    positions[3] = vec2<f32>(cell_x, cell_y + grid_params.cell_height);
+    positions[0] = vec2<f32>(cell_x, cell_y);                                              // top-left
+    positions[1] = vec2<f32>(cell_x + grid_params.cell_width, cell_y);                     // top-right
+    positions[2] = vec2<f32>(cell_x, cell_y + grid_params.cell_height);                    // bottom-left
+    positions[3] = vec2<f32>(cell_x + grid_params.cell_width, cell_y + grid_params.cell_height); // bottom-right
    
    let screen_size = vec2<f32>(grid_params.screen_width, grid_params.screen_height);
    let ndc_pos = pixel_to_ndc(positions[vertex_index], screen_size);
@@ -292,7 +327,29 @@ fn vs_cell_bg(
        bg = tmp;
    }
    
-    // Keep colors in sRGB space for legacy blending
+    // Check if this cell is selected (per-cell flag set by CPU, respects xlimit)
+    let is_selected = (attrs & ATTR_SELECTED_BIT) != 0u;
+    if is_selected {
+        fg = vec4<f32>(0.0, 0.0, 0.0, 1.0);  // Black foreground
+        bg = vec4<f32>(1.0, 1.0, 1.0, 1.0);  // White background
+    }
+    
+    // Check if this cell is the cursor
+    let is_cursor_cell = (i32(col) == grid_params.cursor_col) && (i32(row) == grid_params.cursor_row);
+    
+    // For default background (type 0), use fully transparent so the window's
+    // clear color (which has background_opacity applied) shows through.
+    // Only non-default backgrounds should be opaque.
+    // But NOT if the cell is selected (selection always has white bg)
+    let bg_type = cell.bg & 0xFFu;
+    if bg_type == COLOR_TYPE_DEFAULT && !is_reverse && !is_selected {
+        bg.a = 0.0;
+    }
+    
+    // Calculate cursor color - use fg color (inverted from bg) for visibility
+    // For block cursor, we'll use fg as the cursor background
+    var cursor_color = fg;
+    cursor_color.a = 1.0;
    
    var out: CellVertexOutput;
    out.clip_position = vec4<f32>(ndc_pos, 0.0, 1.0);
@@ -301,6 +358,11 @@ fn vs_cell_bg(
    out.bg_color = bg;
    out.is_background = 1u;
    out.is_colored_glyph = 0u;
+    out.is_cursor = select(0u, 1u, is_cursor_cell);
+    out.cursor_shape = grid_params.cursor_style;
+    out.cursor_color = cursor_color;
+    out.cell_pos = vec2<f32>(cell_x, cell_y);
+    out.cell_size = vec2<f32>(grid_params.cell_width, grid_params.cell_height);
    
    return out;
 }
@@ -344,27 +406,27 @@ fn vs_cell_glyph(
    }
    
    // Calculate cell pixel position
-    let cell_x = f32(col) * grid_params.cell_width;
+    let cell_x = grid_params.x_offset + f32(col) * grid_params.cell_width;
    let cell_y = grid_params.y_offset + f32(row) * grid_params.cell_height;
    
-    // Calculate glyph position (baseline-relative)
-    let baseline_y = cell_y + grid_params.cell_height * 0.8;
-    let glyph_x = cell_x + sprite.offset.x;
-    let glyph_y = baseline_y - sprite.offset.y - sprite.size.y;
+    // Kitty model: sprites are cell-sized with glyphs pre-positioned at baseline.
+    // Just map the sprite directly to the cell.
+    let glyph_x = cell_x;
+    let glyph_y = cell_y;
    
-    // Quad vertex positions
+    // Quad vertex positions for TriangleStrip (0=top-left, 1=top-right, 2=bottom-left, 3=bottom-right)
    var positions: array<vec2<f32>, 4>;
-    positions[0] = vec2<f32>(glyph_x, glyph_y);
-    positions[1] = vec2<f32>(glyph_x + sprite.size.x, glyph_y);
-    positions[2] = vec2<f32>(glyph_x + sprite.size.x, glyph_y + sprite.size.y);
-    positions[3] = vec2<f32>(glyph_x, glyph_y + sprite.size.y);
+    positions[0] = vec2<f32>(glyph_x, glyph_y);                                  // top-left
+    positions[1] = vec2<f32>(glyph_x + sprite.size.x, glyph_y);                  // top-right
+    positions[2] = vec2<f32>(glyph_x, glyph_y + sprite.size.y);                  // bottom-left
+    positions[3] = vec2<f32>(glyph_x + sprite.size.x, glyph_y + sprite.size.y);  // bottom-right
    
-    // UV coordinates
+    // UV coordinates (matching vertex positions)
    var uvs: array<vec2<f32>, 4>;
-    uvs[0] = vec2<f32>(sprite.uv.x, sprite.uv.y);
-    uvs[1] = vec2<f32>(sprite.uv.x + sprite.uv.z, sprite.uv.y);
-    uvs[2] = vec2<f32>(sprite.uv.x + sprite.uv.z, sprite.uv.y + sprite.uv.w);
-    uvs[3] = vec2<f32>(sprite.uv.x, sprite.uv.y + sprite.uv.w);
+    uvs[0] = vec2<f32>(sprite.uv.x, sprite.uv.y);                                // top-left
+    uvs[1] = vec2<f32>(sprite.uv.x + sprite.uv.z, sprite.uv.y);                  // top-right
+    uvs[2] = vec2<f32>(sprite.uv.x, sprite.uv.y + sprite.uv.w);                  // bottom-left
+    uvs[3] = vec2<f32>(sprite.uv.x + sprite.uv.z, sprite.uv.y + sprite.uv.w);    // bottom-right
    
    let screen_size = vec2<f32>(grid_params.screen_width, grid_params.screen_height);
    let ndc_pos = pixel_to_ndc(positions[vertex_index], screen_size);
@@ -382,7 +444,22 @@ fn vs_cell_glyph(
        bg = tmp;
    }
    
-    // Keep colors in sRGB space for legacy blending (conversion happens in fragment shader)
+    // Check if this cell is selected (per-cell flag set by CPU, respects xlimit)
+    let is_selected = (attrs & ATTR_SELECTED_BIT) != 0u;
+    if is_selected {
+        fg = vec4<f32>(0.0, 0.0, 0.0, 1.0);  // Black foreground
+        bg = vec4<f32>(1.0, 1.0, 1.0, 1.0);  // White background
+    }
+    
+    // Check if this cell is the cursor
+    let is_cursor_cell = (i32(col) == grid_params.cursor_col) && (i32(row) == grid_params.cursor_row);
+    
+    // For block cursor, invert text color (use bg as fg)
+    var cursor_text_color = bg;
+    cursor_text_color.a = 1.0;
+    if is_cursor_cell && grid_params.cursor_style == CURSOR_BLOCK {
+        fg = cursor_text_color;
+    }
    
    var out: CellVertexOutput;
    out.clip_position = vec4<f32>(ndc_pos, 0.0, 1.0);
@@ -391,6 +468,11 @@ fn vs_cell_glyph(
    out.bg_color = bg;  // Pass background for legacy gamma blending
    out.is_background = 0u;
    out.is_colored_glyph = select(0u, 1u, is_colored);
+    out.is_cursor = select(0u, 1u, is_cursor_cell);
+    out.cursor_shape = grid_params.cursor_style;
+    out.cursor_color = cursor_text_color;
+    out.cell_pos = vec2<f32>(cell_x, cell_y);
+    out.cell_size = vec2<f32>(grid_params.cell_width, grid_params.cell_height);
    
    return out;
 }
@@ -399,20 +481,47 @@ fn vs_cell_glyph(
@fragment
 fn fs_cell(in: CellVertexOutput) -> @location(0) vec4<f32> {
    if in.is_background == 1u {
-        // Background - just output the bg color
+        // Check if this is a cursor cell
+        if in.is_cursor == 1u {
+            // Calculate fragment position relative to cell
+            let frag_pos = in.clip_position.xy;
+            let cell_local = frag_pos - in.cell_pos;
+            
+            if in.cursor_shape == CURSOR_BLOCK {
+                // Block cursor - fill entire cell with cursor color
+                return in.cursor_color;
+            } else if in.cursor_shape == CURSOR_UNDERLINE {
+                // Underline cursor - bottom 10% or at least 2 pixels
+                let underline_height = max(2.0, in.cell_size.y * 0.1);
+                if cell_local.y >= in.cell_size.y - underline_height {
+                    return in.cursor_color;
+                }
+            } else if in.cursor_shape == CURSOR_BAR {
+                // Bar cursor - left 10% or at least 2 pixels
+                let bar_width = max(2.0, in.cell_size.x * 0.1);
+                if cell_local.x < bar_width {
+                    return in.cursor_color;
+                }
+            }
+        }
+        
+        // Normal background - just output the bg color
        return in.bg_color;
    }
    
-    // Glyph - sample from atlas
-    let glyph_alpha = textureSample(atlas_texture, atlas_sampler, in.uv).r;
+    // Glyph - sample from RGBA atlas
+    let glyph_sample = textureSample(atlas_texture, atlas_sampler, in.uv);
    
    if in.is_colored_glyph == 1u {
-        // Colored glyph (emoji) - use atlas color directly
-        // Note: For now we just use alpha since our atlas is single-channel
-        // Full emoji support would need an RGBA atlas
-        return vec4<f32>(in.fg_color.rgb, glyph_alpha);
+        // Colored glyph (emoji) - use atlas color directly with premultiplied alpha blending
+        // The atlas stores RGBA color from the emoji font
+        return glyph_sample;
    }
    
+    // Regular glyph - atlas stores white (1,1,1) with alpha in A channel
+    // Use the alpha channel for text rendering
+    let glyph_alpha = glyph_sample.a;
+    
    // Apply legacy gamma-incorrect blending for crisp text
    let adjusted_alpha = foreground_contrast_legacy(in.fg_color.rgb, glyph_alpha, in.bg_color.rgb);
    
@@ -9,6 +9,7 @@ use std::collections::HashMap;
 use std::io::{Cursor, Read};
 use std::time::Instant;

+use base64::Engine;
 use flate2::read::ZlibDecoder;
 use image::{codecs::gif::GifDecoder, AnimationDecoder, ImageFormat};

@@ -374,14 +375,14 @@ impl GraphicsCommand {

    /// Convert RGB payload to RGBA.
    pub fn rgb_to_rgba(&self) -> Vec<u8> {
-        let mut rgba = Vec::with_capacity(self.payload.len() * 4 / 3);
-        for chunk in self.payload.chunks(3) {
-            if chunk.len() == 3 {
-                rgba.push(chunk[0]);
-                rgba.push(chunk[1]);
-                rgba.push(chunk[2]);
-                rgba.push(255);
-            }
+        // One opaque RGBA pixel is emitted per complete RGB triple.
+        let pixel_count = self.payload.len() / 3;
+        let mut rgba = Vec::with_capacity(pixel_count * 4);
+        // `chunks_exact` never yields a short chunk, so a trailing partial triple is ignored.
+        for rgb in self.payload.chunks_exact(3) {
+            rgba.extend_from_slice(rgb);
+            rgba.push(255);
        }
        rgba
    }
@@ -680,12 +681,29 @@ pub struct ImageData {
    pub width: u32,
    /// Image height in pixels.
    pub height: u32,
-    /// RGBA pixel data (current frame for animated images).
+    /// RGBA pixel data (base frame for static images, or root frame for animations).
+    /// For animated images, use `current_frame_data()` to get the current frame.
    pub data: Vec<u8>,
    /// Animation data if this is an animated image.
    pub animation: Option<AnimationData>,
 }

+impl ImageData {
+    /// Borrow the pixel buffer that should be displayed right now.
+    ///
+    /// When animation data is present and its `current_frame` index points at an
+    /// existing frame, that frame's pixels are returned. In every other case
+    /// (no animation, or an out-of-range index) the base `data` buffer is returned.
+    /// Nothing is copied; the result borrows from `self`.
+    #[inline]
+    pub fn current_frame_data(&self) -> &[u8] {
+        self.animation
+            .as_ref()
+            .and_then(|anim| anim.frames.get(anim.current_frame))
+            .map_or(&self.data[..], |frame| &frame.data[..])
+    }
+}
+
 /// Animation state for playback control.
 #[derive(Clone, Debug, PartialEq, Eq, Default)]
 pub enum AnimationState {
@@ -779,6 +797,8 @@ pub struct ImageStorage {
    placements: Vec<ImagePlacement>,
    /// Buffer for chunked transmissions (image_id -> accumulated data).
    chunk_buffer: HashMap<u32, ChunkBuffer>,
+    /// Current image ID for ongoing chunked transfer (subsequent chunks may omit the ID).
+    current_chunked_id: Option<u32>,
    /// Next auto-generated image ID.
    next_id: u32,
    /// Flag indicating images have changed and need re-upload to GPU.
@@ -799,6 +819,7 @@ impl ImageStorage {
            images: HashMap::new(),
            placements: Vec::new(),
            chunk_buffer: HashMap::new(),
+            current_chunked_id: None,
            next_id: 1,
            dirty: false,
        }
@@ -816,7 +837,14 @@ impl ImageStorage {
    ) -> (Option<String>, Option<PlacementResult>) {
        // Handle chunked transfer
        if cmd.more_chunks {
-            let id = cmd.image_id.unwrap_or(0);
+            // Use explicit image_id if provided, otherwise use the current chunked transfer ID
+            let id = cmd.image_id.or(self.current_chunked_id).unwrap_or(0);
+            
+            // If this chunk has an explicit ID, it starts a new chunked transfer
+            if cmd.image_id.is_some() {
+                self.current_chunked_id = cmd.image_id;
+            }
+            
            let buffer = self.chunk_buffer.entry(id).or_default();
            buffer.data.extend_from_slice(&cmd.payload);
            if buffer.command.is_none() {
@@ -826,7 +854,12 @@ impl ImageStorage {
        }

        // Check if this completes a chunked transfer
-        let id = cmd.image_id.unwrap_or(0);
+        // Use explicit image_id if provided, otherwise use the current chunked transfer ID
+        let id = cmd.image_id.or(self.current_chunked_id).unwrap_or(0);
+        
+        // Clear the current chunked transfer ID since we're completing it
+        self.current_chunked_id = None;
+        
        if let Some(mut buffer) = self.chunk_buffer.remove(&id) {
            buffer.data.extend_from_slice(&cmd.payload);
            if let Some(mut buffered_cmd) = buffer.command {
@@ -1336,8 +1369,7 @@ impl ImageStorage {
            if let Some(frame_num) = cmd.base_frame {
                if frame_num > 0 && (frame_num as usize) <= anim.frames.len() {
                    anim.current_frame = frame_num as usize - 1; // 1-indexed to 0-indexed
-                    // Update image data to show this frame
-                    image.data = anim.frames[anim.current_frame].data.clone();
+                    // No need to clone - renderer uses current_frame_data()
                    anim.frame_start = None; // Reset timing
                    log::debug!("Animation {} jumped to frame {}", id, frame_num);
                }
@@ -1482,11 +1514,27 @@ impl ImageStorage {
                Format::Rgba => {
                    let w = cmd.width.ok_or(GraphicsError::MissingDimensions)?;
                    let h = cmd.height.ok_or(GraphicsError::MissingDimensions)?;
+                    // Widen before multiplying: `w * h * 4` evaluated in the dimensions' own
+                    // integer type can overflow for hostile sizes (panic in debug builds, wrap
+                    // in release) and make a too-short payload look valid. A saturated product
+                    // can never equal a real payload length, so it is rejected below.
+                    let expected_size = (w as u64).saturating_mul(h as u64).saturating_mul(4);
+                    if cmd.payload.len() as u64 != expected_size {
+                        log::warn!(
+                            "RGBA image size mismatch: declared {}x{} ({} bytes expected), got {} bytes",
+                            w, h, expected_size, cmd.payload.len()
+                        );
+                        return Err(GraphicsError::InvalidData);
+                    }
                    (w, h, cmd.payload.clone(), None)
                }
                Format::Rgb => {
                    let w = cmd.width.ok_or(GraphicsError::MissingDimensions)?;
                    let h = cmd.height.ok_or(GraphicsError::MissingDimensions)?;
+                    // Same overflow-safe size computation as the RGBA arm, 3 bytes per pixel.
+                    let expected_size = (w as u64).saturating_mul(h as u64).saturating_mul(3);
+                    if cmd.payload.len() as u64 != expected_size {
+                        log::warn!(
+                            "RGB image size mismatch: declared {}x{} ({} bytes expected), got {} bytes",
+                            w, h, expected_size, cmd.payload.len()
+                        );
+                        return Err(GraphicsError::InvalidData);
+                    }
                    (w, h, cmd.rgb_to_rgba(), None)
                }
                Format::Gif => decode_gif(&cmd.payload)?,
@@ -1727,8 +1775,8 @@ impl ImageStorage {
                    log::debug!("Animation {} frame {} -> {} (elapsed {}ms >= {}ms)", 
                        id, old_frame, anim.current_frame, elapsed, current_frame_duration);

-                    // Update the image data with the new frame
-                    image.data = anim.frames[anim.current_frame].data.clone();
+                    // Just update frame index - no data clone needed!
+                    // The renderer will use current_frame_data() to get the right frame.
                    anim.frame_start = Some(now);
                    changed.push(*id);
                }
@@ -1758,43 +1806,14 @@ impl ImageStorage {
    }
 }

-/// Simple base64 decoder.
+/// Decode a base64 payload with the `base64` crate.
+///
+/// Keeps the leniency of the hand-rolled decoder this replaces: spaces, `\r` and `\n`
+/// are skipped, and trailing `=` padding may be present or missing. The crate's stock
+/// `STANDARD` engine accepts neither (it rejects whitespace and requires canonical
+/// padding), so a purpose-built engine is used instead. Unlike the old decoder, `=`
+/// in the middle of the input is an error.
+///
+/// Returns `GraphicsError::Base64DecodeFailed` for any input that still fails to decode.
 fn base64_decode(input: &str) -> Result<Vec<u8>, GraphicsError> {
-    const DECODE_TABLE: [i8; 256] = {
-        let mut table = [-1i8; 256];
-        let chars =
-            b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
-        let mut i = 0;
-        while i < 64 {
-            table[chars[i] as usize] = i as i8;
-            i += 1;
-        }
-        table
-    };
-
-    let input = input.as_bytes();
-    let mut output = Vec::with_capacity(input.len() * 3 / 4);
-    let mut buffer = 0u32;
-    let mut bits = 0;
-
-    for &byte in input {
-        if byte == b'=' || byte == b'\n' || byte == b'\r' || byte == b' ' {
-            continue;
-        }
-        let value = DECODE_TABLE[byte as usize];
-        if value < 0 {
-            return Err(GraphicsError::Base64DecodeFailed);
-        }
-        buffer = (buffer << 6) | (value as u32);
-        bits += 6;
-        if bits >= 8 {
-            bits -= 8;
-            output.push((buffer >> bits) as u8);
-            buffer &= (1 << bits) - 1;
-        }
-    }
-
-    Ok(output)
+    use base64::engine::{DecodePaddingMode, GeneralPurpose, GeneralPurposeConfig};
+
+    // Standard alphabet; padding optional; non-zero trailing bits tolerated
+    // (the previous decoder silently discarded them).
+    const LENIENT: GeneralPurpose = GeneralPurpose::new(
+        &base64::alphabet::STANDARD,
+        GeneralPurposeConfig::new()
+            .with_decode_padding_mode(DecodePaddingMode::Indifferent)
+            .with_decode_allow_trailing_bits(true),
+    );
+
+    /// Bytes the previous decoder skipped (other than `=`).
+    fn is_ignored(byte: u8) -> bool {
+        matches!(byte, b'\n' | b'\r' | b' ')
+    }
+
+    let bytes = input.as_bytes();
+    // Only pay for a copy when there is actually whitespace to strip.
+    let decoded = if bytes.iter().any(|&b| is_ignored(b)) {
+        let compact: Vec<u8> = bytes.iter().copied().filter(|&b| !is_ignored(b)).collect();
+        LENIENT.decode(compact)
+    } else {
+        LENIENT.decode(bytes)
+    };
+    decoded.map_err(|_| GraphicsError::Base64DecodeFailed)
 }

 #[cfg(test)]
@@ -27,102 +27,202 @@ use winit::keyboard::{Key, NamedKey};
 use winit::platform::wayland::EventLoopBuilderExtWayland;
 use winit::window::{Window, WindowId};

-/// Kitty-style shared buffer for PTY I/O using double-buffering.
+/// Kitty-style single-buffer for PTY I/O with zero-copy reads and writes.
 /// 
-/// Uses two buffers that swap roles:
-/// - I/O thread writes to the "write" buffer
-/// - Main thread parses from the "read" buffer  
-/// - On `swap()`, the buffers exchange roles
+/// Uses a single buffer with separate read/write regions:
+/// - I/O thread writes to `buf[write_offset..]` 
+/// - Main thread reads from `buf[0..read_len]`
+/// - After main thread consumes data, buffer compacts via memmove
+/// 
+/// When buffer is full, I/O thread waits on an eventfd. Main thread signals
+/// the eventfd after consuming data to wake up the I/O thread.
 /// 
 /// This gives us:
-/// - Zero-copy parsing (main thread reads directly from buffer)
-/// - No lock contention during parsing (each thread has its own buffer)
-/// - No memmove needed
-const PTY_BUF_SIZE: usize = 4 * 1024 * 1024; // 4MB like Kitty
+/// - Zero-copy writes (I/O reads directly into buffer)
+/// - Zero-copy reads (main thread gets slice, no allocation)
+/// - Single 1MB buffer (vs 8MB for double-buffering)
+/// - No busy-waiting when buffer is full
+const PTY_BUF_SIZE: usize = 1024 * 1024; // 1MB like Kitty

 struct SharedPtyBuffer {
-    inner: Mutex<DoubleBuffer>,
+    /// The actual buffer. UnsafeCell because we need disjoint mutable access:
+    /// I/O thread writes to [write_pending..], main thread reads [0..read_available]
+    buf: std::cell::UnsafeCell<Box<[u8; PTY_BUF_SIZE]>>,
+    /// Metadata protected by mutex - offsets into the buffer
+    state: Mutex<BufferState>,
+    /// Eventfd to wake up I/O thread when space becomes available
+    wakeup_fd: i32,
 }

-struct DoubleBuffer {
-    /// Two buffers that swap roles
-    bufs: [Vec<u8>; 2],
-    /// Which buffer the I/O thread writes to (0 or 1)
-    write_idx: usize,
-    /// How many bytes are pending in the write buffer
-    write_len: usize,
+// SAFETY: Access to the buffer is kept disjoint by the offsets in `BufferState`:
+// - the main thread only reads `buf[..read_available]`;
+// - the I/O thread only writes at or past the offset handed out by
+//   `create_write_buffer()` (never below read_available + write_pending);
+// - bytes are only moved inside the buffer while the state mutex is held
+//   (`consume_all()` and `commit_write()`).
+// Callers must not keep a slice from `get_read_slice()` alive across `consume_all()`.
+unsafe impl Sync for SharedPtyBuffer {}
+unsafe impl Send for SharedPtyBuffer {}
+
+/// Offsets describing how the shared buffer is currently partitioned.
+///
+/// Layout: `[0..read_available)` is readable by the main thread, the next
+/// `write_pending` bytes are committed but not yet published, the rest is free.
+struct BufferState {
+    /// Bytes available for main thread to read (I/O has written, main hasn't consumed)
+    read_available: usize,
+    /// Bytes written by I/O thread but not yet made available to main thread
+    write_pending: usize,
+    /// Offset handed out by `create_write_buffer()` whose `commit_write()` is still
+    /// outstanding. `consume_all()` may compact the buffer in between, in which case
+    /// `commit_write()` has to relocate the freshly read bytes.
+    in_flight_offset: Option<usize>,
+    /// Whether the I/O thread is waiting for space
+    waiting_for_space: bool,
 }

 impl SharedPtyBuffer {
    fn new() -> Self {
-        // Use with_capacity to avoid zeroing memory - we only need the allocation
-        let mut buf1 = Vec::with_capacity(PTY_BUF_SIZE);
-        let mut buf2 = Vec::with_capacity(PTY_BUF_SIZE);
-        // SAFETY: We're setting length to capacity. The data is uninitialized but
-        // we only read from portions that have been written to (tracked by write_len).
-        unsafe {
-            buf1.set_len(PTY_BUF_SIZE);
-            buf2.set_len(PTY_BUF_SIZE);
+        // Create eventfd for wakeup signaling
+        let wakeup_fd = unsafe { libc::eventfd(0, libc::EFD_NONBLOCK | libc::EFD_CLOEXEC) };
+        if wakeup_fd < 0 {
+            panic!("Failed to create eventfd: {}", std::io::Error::last_os_error());
        }
+        
        Self {
-            inner: Mutex::new(DoubleBuffer {
-                bufs: [buf1, buf2],
-                write_idx: 0,
-                write_len: 0,
+            buf: std::cell::UnsafeCell::new(Box::new([0u8; PTY_BUF_SIZE])),
+            state: Mutex::new(BufferState {
+                read_available: 0,
+                write_pending: 0,
+                in_flight_offset: None,
+                waiting_for_space: false,
            }),
+            wakeup_fd,
        }
    }
    
-    /// Read from PTY fd into the write buffer. Called by I/O thread.
-    /// Returns number of bytes read, 0 if no space/would block, -1 on error.
-    fn read_from_fd(&self, fd: i32) -> isize {
-        let mut inner = self.inner.lock().unwrap();
+    /// Get the wakeup fd for the I/O thread to poll on.
+    fn wakeup_fd(&self) -> i32 {
+        self.wakeup_fd
+    }
+    
+    /// Check if there's space and mark as waiting if not.
+    /// Returns true if there's space, false if waiting.
+    fn check_space_or_wait(&self) -> bool {
+        let mut state = self.state.lock().unwrap();
+        let has_space = state.read_available + state.write_pending < PTY_BUF_SIZE;
+        if !has_space {
+            state.waiting_for_space = true;
+        }
+        has_space
+    }
+    
+    /// Get a write buffer for the I/O thread to read PTY data into.
+    /// Returns (pointer, available_space); (null, 0) when the buffer is full.
+    /// Caller must call commit_write() after writing.
+    /// 
+    /// The handed-out offset is remembered so that commit_write() can relocate the
+    /// new bytes if consume_all() compacts the buffer while the write is in flight.
+    /// 
+    /// SAFETY: The returned pointer is valid until commit_write() is called.
+    /// Only one thread should call this at a time (the I/O thread).
+    fn create_write_buffer(&self) -> (*mut u8, usize) {
+        let mut state = self.state.lock().unwrap();
+        let write_offset = state.read_available + state.write_pending;
+        let available = PTY_BUF_SIZE.saturating_sub(write_offset);
+        state.in_flight_offset = if available > 0 { Some(write_offset) } else { None };
        
-        let available = PTY_BUF_SIZE.saturating_sub(inner.write_len);
        if available == 0 {
-            return 0; // Buffer full, need swap
+            return (std::ptr::null_mut(), 0);
        }
        
-        let write_idx = inner.write_idx;
-        let write_len = inner.write_len;
-        let buf_ptr = unsafe { inner.bufs[write_idx].as_mut_ptr().add(write_len) };
+        // SAFETY: We have exclusive write access to buf[write_offset..] because:
+        // - Main thread only reads [0..read_available]
+        // - We're the only writer past read_available + write_pending
+        // - write_offset < PTY_BUF_SIZE, so the pointer stays inside the allocation
+        let ptr = unsafe { (*self.buf.get()).as_mut_ptr().add(write_offset) };
+        (ptr, available)
+    }
+    
+    /// Commit `len` bytes that the I/O thread wrote through the pointer returned by
+    /// the preceding create_write_buffer() call.
+    /// 
+    /// If consume_all() ran in between, the logical end of the data has moved towards
+    /// the front of the buffer while the new bytes still sit at the old offset; they
+    /// are moved down first so the pending region stays contiguous.
+    fn commit_write(&self, len: usize) {
+        let mut state = self.state.lock().unwrap();
+        let handed_out = state.in_flight_offset.take();
+        if len == 0 {
+            return;
+        }
+        
+        let dest = state.read_available + state.write_pending;
+        if let Some(src) = handed_out {
+            debug_assert!(src + len <= PTY_BUF_SIZE);
+            if dest < src {
+                // SAFETY: Both ranges lie inside the buffer (dest < src and
+                // src + len <= PTY_BUF_SIZE). dest >= read_available, so the region
+                // readable by the main thread is not touched. ptr::copy is a memmove
+                // and handles the overlap; the state lock is held.
+                unsafe {
+                    let base = (*self.buf.get()).as_mut_ptr();
+                    std::ptr::copy(base.add(src), base.add(dest), len);
+                }
+            }
+        }
+        state.write_pending += len;
+    }
+    
+    /// Read from PTY fd into the buffer. Called by I/O thread.
+    /// Returns number of bytes read, 0 if no space/would block, -1 on error.
+    fn read_from_fd(&self, fd: i32) -> isize {
+        let (ptr, available) = self.create_write_buffer();
+        if available == 0 {
+            return 0; // Buffer full
+        }
        
        let result = unsafe { 
-            libc::read(fd, buf_ptr as *mut libc::c_void, available) 
+            libc::read(fd, ptr as *mut libc::c_void, available) 
        };
        
        if result > 0 {
-            inner.write_len += result as usize;
+            self.commit_write(result as usize);
        }
        result
    }
    
-    /// Check if there's space in the write buffer.
-    fn has_space(&self) -> bool {
-        let inner = self.inner.lock().unwrap();
-        inner.write_len < PTY_BUF_SIZE
+    /// Drain the wakeup eventfd. Called by I/O thread after waking up.
+    fn drain_wakeup(&self) {
+        let mut buf = 0u64;
+        unsafe {
+            libc::read(self.wakeup_fd, &mut buf as *mut u64 as *mut libc::c_void, 8);
+        }
    }
    
-    /// Swap buffers and return data to parse. Called by main thread.
-    /// The I/O thread will start writing to the other buffer.
-    fn take_pending(&self) -> Vec<u8> {
-        let mut inner = self.inner.lock().unwrap();
+    /// Make pending writes available for reading, get slice to read.
+    /// Returns None if no data available.
+    /// 
+    /// SAFETY: The returned slice is valid until consume_all() is called; the caller
+    /// must stop using it before then. Only the main thread should call this.
+    fn get_read_slice(&self) -> Option<&[u8]> {
+        let mut state = self.state.lock().unwrap();
        
-        if inner.write_len == 0 {
-            return Vec::new(); // Nothing new to parse
+        // Move pending writes to readable
+        state.read_available += state.write_pending;
+        state.write_pending = 0;
+        
+        if state.read_available == 0 {
+            return None;
        }
        
-        // Swap: the write buffer becomes the read buffer
-        let read_idx = inner.write_idx;
-        let read_len = inner.write_len;
+        // SAFETY: We have exclusive read access to [0..read_available] because:
+        // - I/O thread only writes past read_available
+        // - We're the only reader
+        let slice = unsafe { 
+            std::slice::from_raw_parts((*self.buf.get()).as_ptr(), state.read_available) 
+        };
+        Some(slice)
+    }
    
-        // Switch I/O thread to the other buffer
-        inner.write_idx = 1 - inner.write_idx;
-        inner.write_len = 0;
+    /// Consume all read data, making space for new writes.
+    /// Called after parsing is complete. Wakes up I/O thread if it was waiting.
+    fn consume_all(&self) {
+        let should_wakeup;
+        {
+            let mut state = self.state.lock().unwrap();
            
-        // Return a copy of the data to parse
-        // (We have to copy because we can't return a reference with the mutex)
-        inner.bufs[read_idx][..read_len].to_vec()
+            // If there's pending write data, we need to move it to the front
+            if state.write_pending > 0 {
+                // SAFETY: Source and destination both lie inside the buffer and end at
+                // or before any offset handed out to the I/O thread, so an in-flight
+                // write is not touched. ptr::copy is a memmove and handles the overlap.
+                unsafe {
+                    let base = (*self.buf.get()).as_mut_ptr();
+                    std::ptr::copy(
+                        base.add(state.read_available),
+                        base,
+                        state.write_pending,
+                    );
+                }
+            }
+            
+            state.read_available = 0;
+            // write_pending stays the same but is now at offset 0
+            
+            should_wakeup = state.waiting_for_space;
+            state.waiting_for_space = false;
+        }
+        
+        // Wake up I/O thread if it was waiting for space
+        if should_wakeup {
+            let val = 1u64;
+            unsafe {
+                libc::write(self.wakeup_fd, &val as *const u64 as *const libc::c_void, 8);
+            }
+        }
+    }
+}
+
+/// Releases the eventfd created in `SharedPtyBuffer::new()`.
+impl Drop for SharedPtyBuffer {
+    fn drop(&mut self) {
+        // SAFETY: `wakeup_fd` came from a successful `eventfd()` call (`new()` panics
+        // otherwise) and is closed only here. Raw copies obtained through
+        // `wakeup_fd()` must not be used once the buffer has been dropped.
+        unsafe {
+            libc::close(self.wakeup_fd);
+        }
    }
 }

@@ -331,33 +431,66 @@ impl SplitNode {
    }
    
    /// Calculate layout for all nodes given the available space.
-    fn layout(&mut self, x: f32, y: f32, width: f32, height: f32, cell_width: f32, cell_height: f32, border_width: f32) {
+    /// Returns the actual used (width, height) after cell alignment.
+    /// Note: border_width is kept for API compatibility but borders are now overlaid on panes.
+    fn layout(&mut self, x: f32, y: f32, width: f32, height: f32, cell_width: f32, cell_height: f32, _border_width: f32) -> (f32, f32) {
        match self {
            SplitNode::Leaf { geometry, .. } => {
-                let cols = ((width - border_width) / cell_width).floor() as usize;
-                let rows = ((height - border_width) / cell_height).floor() as usize;
+                // Calculate how many cells fit
+                let cols = (width / cell_width).floor() as usize;
+                let rows = (height / cell_height).floor() as usize;
+                // Store actual cell-aligned dimensions (not allocated space)
+                let actual_width = cols.max(1) as f32 * cell_width;
+                let actual_height = rows.max(1) as f32 * cell_height;
                *geometry = PaneGeometry {
                    x,
                    y,
-                    width,
-                    height,
+                    width: actual_width,
+                    height: actual_height,
                    cols: cols.max(1),
                    rows: rows.max(1),
                };
+                (actual_width, actual_height)
            }
            SplitNode::Split { horizontal, ratio, first, second } => {
                if *horizontal {
-                    // Side-by-side split
-                    let first_width = (width * *ratio) - border_width / 2.0;
-                    let second_width = width - first_width - border_width;
-                    first.layout(x, y, first_width, height, cell_width, cell_height, border_width);
-                    second.layout(x + first_width + border_width, y, second_width, height, cell_width, cell_height, border_width);
+                    // Side-by-side split (horizontal means panes are side-by-side)
+                    // No border space reserved - border will be overlaid on pane edges
+                    let total_cols = (width / cell_width).floor() as usize;
+                    
+                    // Distribute columns by ratio
+                    let first_cols = ((total_cols as f32) * *ratio).round() as usize;
+                    let second_cols = total_cols.saturating_sub(first_cols);
+                    
+                    // Convert back to pixel widths
+                    let first_alloc_width = first_cols.max(1) as f32 * cell_width;
+                    let second_alloc_width = second_cols.max(1) as f32 * cell_width;
+                    
+                    // Layout panes flush against each other (border overlays the edge)
+                    let (first_actual_w, first_actual_h) = first.layout(x, y, first_alloc_width, height, cell_width, cell_height, _border_width);
+                    let (second_actual_w, second_actual_h) = second.layout(x + first_actual_w, y, second_alloc_width, height, cell_width, cell_height, _border_width);
+                    
+                    // Total used size: both panes (no border gap)
+                    (first_actual_w + second_actual_w, first_actual_h.max(second_actual_h))
                } else {
-                    // Stacked split
-                    let first_height = (height * *ratio) - border_width / 2.0;
-                    let second_height = height - first_height - border_width;
-                    first.layout(x, y, width, first_height, cell_width, cell_height, border_width);
-                    second.layout(x, y + first_height + border_width, width, second_height, cell_width, cell_height, border_width);
+                    // Stacked split (vertical means panes are stacked)
+                    // No border space reserved - border will be overlaid on pane edges
+                    let total_rows = (height / cell_height).floor() as usize;
+                    
+                    // Distribute rows by ratio
+                    let first_rows = ((total_rows as f32) * *ratio).round() as usize;
+                    let second_rows = total_rows.saturating_sub(first_rows);
+                    
+                    // Convert back to pixel heights
+                    let first_alloc_height = first_rows.max(1) as f32 * cell_height;
+                    let second_alloc_height = second_rows.max(1) as f32 * cell_height;
+                    
+                    // Layout panes flush against each other (border overlays the edge)
+                    let (first_actual_w, first_actual_h) = first.layout(x, y, width, first_alloc_height, cell_width, cell_height, _border_width);
+                    let (second_actual_w, second_actual_h) = second.layout(x, y + first_actual_h, width, second_alloc_height, cell_width, cell_height, _border_width);
+                    
+                    // Total used size: both panes (no border gap)
+                    (first_actual_w.max(second_actual_w), first_actual_h + second_actual_h)
                }
            }
        }
@@ -569,6 +702,9 @@ struct Tab {
    /// Tab title (from OSC or shell).
    #[allow(dead_code)]
    title: String,
+    /// Actual used grid dimensions (width, height) after cell alignment.
+    /// Used for centering the grid in the window.
+    grid_used_dimensions: (f32, f32),
 }

 impl Tab {
@@ -586,6 +722,7 @@ impl Tab {
            split_root: SplitNode::leaf(pane_id),
            active_pane: pane_id,
            title: String::from("zsh"),
+            grid_used_dimensions: (0.0, 0.0), // Will be set on first resize
        })
    }
    
@@ -601,8 +738,11 @@ impl Tab {
    
    /// Resize all panes based on new window dimensions.
    fn resize(&mut self, width: f32, height: f32, cell_width: f32, cell_height: f32, border_width: f32) {
-        // Recalculate layout
-        self.split_root.layout(0.0, 0.0, width, height, cell_width, cell_height, border_width);
+        // Recalculate layout - returns actual used dimensions for centering
+        let used_dims = self.split_root.layout(0.0, 0.0, width, height, cell_width, cell_height, border_width);
+        
+        // Store the used dimensions for this tab
+        self.grid_used_dimensions = used_dims;
        
        // Resize each pane's terminal based on its geometry
        let mut geometries = Vec::new();
@@ -1149,8 +1289,6 @@ struct App {
    edge_glows: Vec<EdgeGlow>,
 }

-const PTY_KEY: usize = 1;
-
 impl App {
    fn new() -> Self {
        let config = Config::load();
@@ -1265,11 +1403,14 @@ impl App {
    fn start_pane_io_thread_with_info(&self, pane_id: PaneId, pty_fd: i32, pty_buffer: Arc<SharedPtyBuffer>) {
        let Some(proxy) = self.event_loop_proxy.clone() else { return };
        let shutdown = self.shutdown.clone();
+        let wakeup_fd = pty_buffer.wakeup_fd();
        
        std::thread::Builder::new()
            .name(format!("pty-io-{}", pane_id.0))
            .spawn(move || {
                const INPUT_DELAY: Duration = Duration::from_millis(3);
+                const PTY_KEY: usize = 0;
+                const WAKEUP_KEY: usize = 1;
                
                let poller = match Poller::new() {
                    Ok(p) => p,
@@ -1279,11 +1420,17 @@ impl App {
                    }
                };
                
+                // Add PTY fd
                unsafe {
                    if let Err(e) = poller.add(pty_fd, Event::readable(PTY_KEY)) {
                        log::error!("Failed to add PTY to poller: {}", e);
                        return;
                    }
+                    // Add wakeup fd - used to wake us when buffer space becomes available
+                    if let Err(e) = poller.add(wakeup_fd, Event::readable(WAKEUP_KEY)) {
+                        log::error!("Failed to add wakeup fd to poller: {}", e);
+                        return;
+                    }
                }
                
                let mut events = Events::new();
@@ -1293,52 +1440,75 @@ impl App {
                while !shutdown.load(Ordering::Relaxed) {
                    events.clear();
                    
-                    let has_space = pty_buffer.has_space();
+                    // Check if we have space - if not, disable PTY polling until woken
+                    let has_space = pty_buffer.check_space_or_wait();
+                    
+                    // Set up poll events: always listen on wakeup_fd, only listen on pty_fd if we have space
+                    unsafe {
+                        let pty_event = if has_space { Event::readable(PTY_KEY) } else { Event::none(PTY_KEY) };
+                        let _ = poller.modify(std::os::fd::BorrowedFd::borrow_raw(pty_fd), pty_event);
+                    }
                    
                    let timeout = if has_pending_wakeup {
                        let elapsed = last_wakeup_at.elapsed();
                        Some(INPUT_DELAY.saturating_sub(elapsed))
                    } else {
-                        Some(Duration::from_millis(100))
+                        None // Block indefinitely until data or wakeup
                    };
                    
                    match poller.wait(&mut events, timeout) {
-                        Ok(_) if !events.is_empty() && has_space => {
-                            loop {
-                                let result = pty_buffer.read_from_fd(pty_fd);
-                                if result < 0 {
-                                    let err = std::io::Error::last_os_error();
-                                    if err.kind() == std::io::ErrorKind::Interrupted {
-                                        continue;
-                                    }
-                                    if err.kind() == std::io::ErrorKind::WouldBlock {
-                                        break;
-                                    }
-                                    log::debug!("PTY read error: {}", err);
-                                    break;
-                                } else if result == 0 {
-                                    break;
-                                } else {
-                                    has_pending_wakeup = true;
-                                    continue;
+                        Ok(_) => {
+                            let mut got_wakeup = false;
+                            let mut got_pty_data = false;
+                            
+                            for ev in events.iter() {
+                                if ev.key == WAKEUP_KEY {
+                                    got_wakeup = true;
+                                }
+                                if ev.key == PTY_KEY && ev.readable {
+                                    got_pty_data = true;
                                }
                            }
                            
-                            let now = std::time::Instant::now();
-                            if now.duration_since(last_wakeup_at) >= INPUT_DELAY {
-                                let _ = proxy.send_event(UserEvent::PtyReadable(pane_id));
-                                last_wakeup_at = now;
-                                has_pending_wakeup = false;
+                            // Drain wakeup fd if signaled
+                            if got_wakeup {
+                                pty_buffer.drain_wakeup();
+                                // Re-arm wakeup fd
+                                unsafe {
+                                    let _ = poller.modify(
+                                        std::os::fd::BorrowedFd::borrow_raw(wakeup_fd),
+                                        Event::readable(WAKEUP_KEY),
+                                    );
+                                }
                            }
                            
-                            unsafe {
-                                let _ = poller.modify(
-                                    std::os::fd::BorrowedFd::borrow_raw(pty_fd),
-                                    Event::readable(PTY_KEY),
-                                );
+                            // Read PTY data if available and we have space
+                            if got_pty_data && has_space {
+                                loop {
+                                    let result = pty_buffer.read_from_fd(pty_fd);
+                                    if result < 0 {
+                                        let err = std::io::Error::last_os_error();
+                                        if err.kind() == std::io::ErrorKind::Interrupted {
+                                            continue;
+                                        }
+                                        if err.kind() == std::io::ErrorKind::WouldBlock {
+                                            break;
+                                        }
+                                        log::debug!("PTY read error: {}", err);
+                                        break;
+                                    } else if result == 0 {
+                                        break;
+                                    } else {
+                                        has_pending_wakeup = true;
+                                        // Check if buffer became full
+                                        if !pty_buffer.check_space_or_wait() {
+                                            break;
+                                        }
+                                    }
+                                }
                            }
-                        }
-                        Ok(_) => {
+                            
+                            // Send wakeup to main thread if we have pending data and enough time passed
                            if has_pending_wakeup {
                                let now = std::time::Instant::now();
                                if now.duration_since(last_wakeup_at) >= INPUT_DELAY {
@@ -1349,8 +1519,10 @@ impl App {
                            }
                        }
                        Err(e) => {
-                            log::error!("PTY poll error: {}", e);
-                            break;
+                            if e.kind() != std::io::ErrorKind::Interrupted {
+                                log::error!("PTY poll error: {}", e);
+                                break;
+                            }
                        }
                    }
                }
@@ -1409,22 +1581,34 @@ impl App {
    
    /// Resize all panes in all tabs based on renderer dimensions.
    fn resize_all_panes(&mut self) {
-        let Some(renderer) = &self.renderer else { return };
+        // Extract values we need from renderer first
+        // Use raw available pixel space so layout can handle cell alignment properly
+        let (cell_width, cell_height, available_width, available_height) = {
+            let Some(renderer) = &self.renderer else { return };
+            let cell_width = renderer.cell_width;
+            let cell_height = renderer.cell_height;
+            let (available_width, available_height) = renderer.available_grid_space();
+            (cell_width, cell_height, available_width, available_height)
+        };
        
-        let cell_width = renderer.cell_width;
-        let cell_height = renderer.cell_height;
-        let width = renderer.width as f32;
-        let height = renderer.height as f32 - renderer.tab_bar_height() - renderer.statusline_height();
        let border_width = 2.0; // Border width in pixels
        
-        for tab in &mut self.tabs {
-            tab.resize(width, height, cell_width, cell_height, border_width);
+        for tab in self.tabs.iter_mut() {
+            tab.resize(available_width, available_height, cell_width, cell_height, border_width);
            
            // Update cell size on all terminals (needed for Kitty graphics protocol)
            for pane in tab.panes.values_mut() {
                pane.terminal.set_cell_size(cell_width, cell_height);
            }
        }
+        
+        // Update the renderer with the active tab's used dimensions for proper centering
+        if let Some(tab) = self.tabs.get(self.active_tab) {
+            let used_dims = tab.grid_used_dimensions;
+            if let Some(renderer) = &mut self.renderer {
+                renderer.set_grid_used_dimensions(used_dims.0, used_dims.1);
+            }
+        }
    }
    
    /// Process PTY data for a specific pane.
@@ -1436,18 +1620,19 @@ impl App {
        
        for tab in &mut self.tabs {
            if let Some(pane) = tab.get_pane_mut(pane_id) {
-                // Take all pending data atomically
-                let data = pane.pty_buffer.take_pending();
+                // Get slice of pending data - zero copy!
+                let Some(data) = pane.pty_buffer.get_read_slice() else {
+                    return false;
+                };
                let len = data.len();
                
-                if len == 0 {
-                    return false;
-                }
-                
                let process_start = std::time::Instant::now();
-                pane.terminal.process(&data);
+                pane.terminal.process(data);
                let process_time_ns = process_start.elapsed().as_nanos() as u64;
                
+                // Consume the data now that we're done parsing
+                pane.pty_buffer.consume_all();
+                
                if process_time_ns > 5_000_000 {
                    log::info!("PTY: process={:.2}ms bytes={}",
                        process_time_ns as f64 / 1_000_000.0,
@@ -1642,22 +1827,18 @@ impl App {
            }
            Action::NextTab => {
                if !self.tabs.is_empty() {
-                    self.active_tab = (self.active_tab + 1) % self.tabs.len();
-                    if let Some(window) = &self.window {
-                        window.request_redraw();
-                    }
+                    let next_tab = (self.active_tab + 1) % self.tabs.len();
+                    self.switch_to_tab(next_tab);
                }
            }
            Action::PrevTab => {
                if !self.tabs.is_empty() {
-                    self.active_tab = if self.active_tab == 0 {
+                    let prev_tab = if self.active_tab == 0 {
                        self.tabs.len() - 1
                    } else {
                        self.active_tab - 1
                    };
-                    if let Some(window) = &self.window {
-                        window.request_redraw();
-                    }
+                    self.switch_to_tab(prev_tab);
                }
            }
            Action::Tab1 => self.switch_to_tab(0),
@@ -1816,12 +1997,24 @@ impl App {
+    /// Make the tab at `idx` the active tab and request a redraw.
+    /// Out-of-range indices are ignored.
     fn switch_to_tab(&mut self, idx: usize) {
         if idx < self.tabs.len() {
             self.active_tab = idx;
+            // Update grid dimensions for proper centering of the new active tab
+            self.update_active_tab_grid_dimensions();
             if let Some(window) = &self.window {
                 window.request_redraw();
             }
         }
     }
    
+    /// Push the active tab's stored used-grid dimensions to the renderer.
+    /// Does nothing when there is no active tab or no renderer.
+    fn update_active_tab_grid_dimensions(&mut self) {
+        let Some((used_width, used_height)) = self
+            .tabs
+            .get(self.active_tab)
+            .map(|tab| tab.grid_used_dimensions)
+        else {
+            return;
+        };
+        let Some(renderer) = self.renderer.as_mut() else {
+            return;
+        };
+        renderer.set_grid_used_dimensions(used_width, used_height);
+    }
+    
    fn paste_from_clipboard(&mut self) {
        let output = match Command::new("wl-paste")
            .arg("--no-newline")
@@ -2030,9 +2223,19 @@ impl ApplicationHandler<UserEvent> for App {
                self.poll_pane(pane_id);
                let process_time = start.elapsed();
                
-                // Request redraw to display the new content
-                if let Some(window) = &self.window {
-                    window.request_redraw();
+                // Check if terminal is in synchronized output mode (DCS pending mode or CSI 2026)
+                // If so, skip the redraw - rendering will happen when sync mode ends
+                let synchronized = self.tabs.iter()
+                    .flat_map(|tab| tab.panes.values())
+                    .find(|pane| pane.id == pane_id)
+                    .map(|pane| pane.terminal.is_synchronized())
+                    .unwrap_or(false);
+                
+                // Request redraw to display the new content (unless in sync mode)
+                if !synchronized {
+                    if let Some(window) = &self.window {
+                        window.request_redraw();
+                    }
                }
                
                if process_time.as_millis() > 5 {
@@ -2309,6 +2512,7 @@ impl ApplicationHandler<UserEvent> for App {
                                };
                                
                                let render_info = PaneRenderInfo {
+                                    pane_id: pane_id.0,
                                    x: geom.x,
                                    y: geom.y,
                                    width: geom.width,
@@ -2416,10 +2620,12 @@ impl ApplicationHandler<UserEvent> for App {
        
        // Check for exited tabs and remove them
        let mut i = 0;
+        let mut tabs_removed = false;
        while i < self.tabs.len() {
            if self.tabs[i].child_exited() {
                log::info!("Tab {} shell exited", i);
                self.tabs.remove(i);
+                tabs_removed = true;
                if self.active_tab >= self.tabs.len() && !self.tabs.is_empty() {
                    self.active_tab = self.tabs.len() - 1;
                }
@@ -2428,6 +2634,11 @@ impl ApplicationHandler<UserEvent> for App {
            }
        }
        
+        // Update grid dimensions if tabs were removed
+        if tabs_removed && !self.tabs.is_empty() {
+            self.update_active_tab_grid_dimensions();
+        }
+        
        if self.tabs.is_empty() {
            log::info!("All tabs closed, exiting");
            event_loop.exit();
@@ -0,0 +1,95 @@
+// Instanced quad rendering shader for rectangles, borders, overlays, and tab bar
+// Simple shader that renders colored rectangles using instancing
+
+// ═══════════════════════════════════════════════════════════════════════════════
+// DATA STRUCTURES
+// ═══════════════════════════════════════════════════════════════════════════════
+
+// Quad instance data - stored in a storage buffer
+// Each quad has position, size, and color.
+// vs_quad reads one entry per instance via quads[instance_index].
+struct Quad {
+    // Position of the top-left corner in pixels (x, y)
+    x: f32,
+    y: f32,
+    // Size in pixels (width, height); extends right and down from (x, y)
+    width: f32,
+    height: f32,
+    // Color (linear RGBA), passed through unchanged to the fragment stage
+    color: vec4<f32>,
+}
+
+// Quad rendering uniforms (screen dimensions)
+// Used to convert pixel positions to normalized device coordinates.
+struct QuadParams {
+    // Screen size in pixels
+    screen_width: f32,
+    screen_height: f32,
+    // Not read by this shader; pads the struct out with two extra floats
+    _padding: vec2<f32>,
+}
+
+// ═══════════════════════════════════════════════════════════════════════════════
+// BINDINGS
+// ═══════════════════════════════════════════════════════════════════════════════
+
+// Uniform for screen dimensions (group 0, binding 0)
+@group(0) @binding(0)
+var<uniform> quad_params: QuadParams;
+
+// Storage buffer for quad instances (group 0, binding 1), read-only,
+// indexed by instance_index in vs_quad
+@group(0) @binding(1)
+var<storage, read> quads: array<Quad>;
+
+// ═══════════════════════════════════════════════════════════════════════════════
+// VERTEX OUTPUT
+// ═══════════════════════════════════════════════════════════════════════════════
+
+// Data passed from vs_quad to fs_quad
+struct QuadVertexOutput {
+    // Clip-space position of the vertex
+    @builtin(position) clip_position: vec4<f32>,
+    // Quad color; identical for all four vertices of a quad
+    @location(0) color: vec4<f32>,
+}
+
+// ═══════════════════════════════════════════════════════════════════════════════
+// HELPER FUNCTIONS
+// ═══════════════════════════════════════════════════════════════════════════════
+
+// Map a pixel position (origin at the top-left, Y growing downwards) to
+// normalized device coordinates (X: -1 left to +1 right, Y: +1 top to -1 bottom).
+fn pixel_to_ndc(pixel: vec2<f32>, screen: vec2<f32>) -> vec2<f32> {
+    let ndc_x = (pixel.x / screen.x) * 2.0 - 1.0;
+    let ndc_y = 1.0 - (pixel.y / screen.y) * 2.0;
+    return vec2<f32>(ndc_x, ndc_y);
+}
+
+// ═══════════════════════════════════════════════════════════════════════════════
+// VERTEX SHADER
+// ═══════════════════════════════════════════════════════════════════════════════
+
+@vertex
+fn vs_quad(
+    @builtin(vertex_index) vertex_index: u32,
+    @builtin(instance_index) instance_index: u32
+) -> QuadVertexOutput {
+    let quad = quads[instance_index];
+    
+    // Edges of the rectangle in pixel space
+    let left = quad.x;
+    let top = quad.y;
+    let right = quad.x + quad.width;
+    let bottom = quad.y + quad.height;
+    
+    // Corner order for TriangleStrip: top-left, top-right, bottom-left, bottom-right
+    var corners = array<vec2<f32>, 4>(
+        vec2<f32>(left, top),
+        vec2<f32>(right, top),
+        vec2<f32>(left, bottom),
+        vec2<f32>(right, bottom)
+    );
+    
+    let screen_size = vec2<f32>(quad_params.screen_width, quad_params.screen_height);
+    let ndc_pos = pixel_to_ndc(corners[vertex_index], screen_size);
+    
+    var result: QuadVertexOutput;
+    result.clip_position = vec4<f32>(ndc_pos, 0.0, 1.0);
+    result.color = quad.color;
+    return result;
+}
+
+// ═══════════════════════════════════════════════════════════════════════════════
+// FRAGMENT SHADER
+// ═══════════════════════════════════════════════════════════════════════════════
+
+// Outputs the quad's color unchanged: no texturing and no further shading.
+@fragment
+fn fs_quad(in: QuadVertexOutput) -> @location(0) vec4<f32> {
+    return in.color;
+}
@@ -0,0 +1,323 @@
+// Statusline shader - optimized for single-row text rendering
+// Simpler than the full terminal cell shader, focused on text with colors
+
+// ═══════════════════════════════════════════════════════════════════════════════
+// GAMMA CONVERSION FUNCTIONS
+// ═══════════════════════════════════════════════════════════════════════════════
+
+// Luminance weights for perceived brightness (ITU-R BT.709).
+// Dotted with a linear RGB color in foreground_contrast_legacy.
+const Y: vec3<f32> = vec3<f32>(0.2126, 0.7152, 0.0722);
+
+// Decode one sRGB-encoded channel to linear light.
+// Values at or below 0.04045 use the linear segment, the rest the power curve.
+fn srgb2linear(x: f32) -> f32 {
+    if x <= 0.04045 {
+        return x / 12.92;
+    }
+    let shifted = (x + 0.055) / 1.055;
+    return pow(shifted, 2.4);
+}
+
+// Encode one linear-light channel as sRGB (inverse of srgb2linear).
+// Values at or below 0.0031308 use the linear segment, the rest the power curve.
+fn linear2srgb(x: f32) -> f32 {
+    if x <= 0.0031308 {
+        return 12.92 * x;
+    }
+    let curved = pow(x, 1.0 / 2.4);
+    return 1.055 * curved - 0.055;
+}
+
+// Kitty's legacy gamma-incorrect text blending for crisp rendering.
+// Blends the text and background luminances in sRGB space, then returns the
+// alpha that gives the same luminance under a linear-space blend, clamped to
+// [0, 1]. When both luminances are nearly equal the input alpha is returned.
+fn foreground_contrast_legacy(over_srgb: vec3<f32>, over_alpha: f32, under_srgb: vec3<f32>) -> f32 {
+    let text_linear = vec3<f32>(
+        srgb2linear(over_srgb.r),
+        srgb2linear(over_srgb.g),
+        srgb2linear(over_srgb.b)
+    );
+    let backdrop_linear = vec3<f32>(
+        srgb2linear(under_srgb.r),
+        srgb2linear(under_srgb.g),
+        srgb2linear(under_srgb.b)
+    );
+    
+    let backdrop_lum = dot(backdrop_linear, Y);
+    let text_lum = dot(text_linear, Y);
+    let contrast = text_lum - backdrop_lum;
+    
+    // Too little contrast to correct for (also avoids dividing by ~0 below)
+    if abs(contrast) < 0.001 {
+        return over_alpha;
+    }
+    
+    // Blend the luminances in sRGB space, then convert the result back to linear
+    let mixed_srgb = linear2srgb(text_lum) * over_alpha + linear2srgb(backdrop_lum) * (1.0 - over_alpha);
+    let mixed_linear = srgb2linear(mixed_srgb);
+    
+    return clamp((mixed_linear - backdrop_lum) / contrast, 0.0, 1.0);
+}
+
+// ═══════════════════════════════════════════════════════════════════════════════
+// STATUSLINE DATA STRUCTURES
+// ═══════════════════════════════════════════════════════════════════════════════
+
+// Per-cell data for statusline rendering
+// Matches GPUCell struct in renderer.rs exactly for buffer compatibility.
+// Field order and count must therefore not change.
+struct StatuslineCell {
+    // Foreground color (packed: type in low byte, color data in upper bytes);
+    // decoded by resolve_color
+    fg: u32,
+    // Background color (packed same way)
+    bg: u32,
+    // Decoration foreground color (unused in statusline, but needed for struct alignment)
+    decoration_fg: u32,
+    // Sprite index in atlas (0 = no glyph/space). High bit = colored glyph
+    // (see COLORED_GLYPH_FLAG).
+    sprite_idx: u32,
+    // Cell attributes (unused in statusline, but needed for struct alignment)
+    attrs: u32,
+}
+
+// Sprite info for glyph positioning; one entry per sprite in the `sprites` buffer
+struct SpriteInfo {
+    // UV coordinates in atlas (x, y, width, height) - normalized 0-1
+    uv: vec4<f32>,
+    // Padding (carries no data)
+    _padding: vec2<f32>,
+    // Size in pixels (width, height)
+    size: vec2<f32>,
+}
+
+// Statusline parameters uniform
+struct StatuslineParams {
+    // Number of characters in statusline; instances at or beyond this index
+    // are discarded by the vertex shader
+    char_count: u32,
+    // Cell dimensions in pixels
+    cell_width: f32,
+    cell_height: f32,
+    // Screen dimensions in pixels
+    screen_width: f32,
+    screen_height: f32,
+    // Y position of statusline (in pixels from top)
+    y_offset: f32,
+    // Padding for alignment (carries no data)
+    _padding: vec2<f32>,
+}
+
+// Color table for indexed colors.
+// Entries 0-255 are looked up by palette index; resolve_color uses entry 256
+// as the default foreground and entry 257 as the default background.
+struct ColorTable {
+    colors: array<vec4<f32>, 258>,
+}
+
+// ═══════════════════════════════════════════════════════════════════════════════
+// BINDINGS
+// ═══════════════════════════════════════════════════════════════════════════════
+
+// Group 0: glyph atlas texture and its sampler
+@group(0) @binding(0)
+var atlas_texture: texture_2d<f32>;
+@group(0) @binding(1)
+var atlas_sampler: sampler;
+
+// Group 1: per-frame statusline data
+// Color table used by resolve_color
+@group(1) @binding(0)
+var<uniform> color_table: ColorTable;
+
+// Cell size, screen size and statusline position
+@group(1) @binding(1)
+var<uniform> params: StatuslineParams;
+
+// One StatuslineCell per character, indexed by instance_index
+@group(1) @binding(2)
+var<storage, read> cells: array<StatuslineCell>;
+
+// Sprite table holding atlas UVs and pixel sizes
+@group(1) @binding(3)
+var<storage, read> sprites: array<SpriteInfo>;
+
+// ═══════════════════════════════════════════════════════════════════════════════
+// CONSTANTS
+// ═══════════════════════════════════════════════════════════════════════════════
+
+// Color type tags stored in the low byte of a packed color (see resolve_color)
+const COLOR_TYPE_DEFAULT: u32 = 0u;
+const COLOR_TYPE_INDEXED: u32 = 1u;
+const COLOR_TYPE_RGB: u32 = 2u;
+
+// High bit of StatuslineCell.sprite_idx: marks a colored glyph
+const COLORED_GLYPH_FLAG: u32 = 0x80000000u;
+
+// ═══════════════════════════════════════════════════════════════════════════════
+// VERTEX OUTPUT
+// ═══════════════════════════════════════════════════════════════════════════════
+
+// Data passed from the statusline vertex shaders to the fragment stage
+struct VertexOutput {
+    // Clip-space position of the vertex
+    @builtin(position) clip_position: vec4<f32>,
+    // Atlas texture coordinate (set to (0, 0) by the background pass)
+    @location(0) uv: vec2<f32>,
+    // Resolved foreground and background colors of the cell
+    @location(1) fg_color: vec4<f32>,
+    @location(2) bg_color: vec4<f32>,
+    // 1 when emitted by the background pass (vs_statusline_bg)
+    @location(3) @interpolate(flat) is_background: u32,
+    // Colored-glyph marker; the background pass always writes 0
+    @location(4) @interpolate(flat) is_colored_glyph: u32,
+}
+
+// ═══════════════════════════════════════════════════════════════════════════════
+// HELPER FUNCTIONS
+// ═══════════════════════════════════════════════════════════════════════════════
+
+// Resolve a packed color to RGBA.
+// The low byte selects the color type:
+//   - default: color table entry 256 (foreground) or 257 (background)
+//   - indexed: color table entry given by bits 8-15
+//   - anything else: 8-bit sRGB channels in bits 8-15 (R), 16-23 (G), 24-31 (B),
+//     converted to linear with alpha 1.0
+fn resolve_color(packed: u32, is_foreground: bool) -> vec4<f32> {
+    let kind = packed & 0xFFu;
+    
+    if kind == COLOR_TYPE_DEFAULT {
+        let slot = select(257u, 256u, is_foreground);
+        return color_table.colors[slot];
+    }
+    
+    if kind == COLOR_TYPE_INDEXED {
+        return color_table.colors[(packed >> 8u) & 0xFFu];
+    }
+    
+    // RGB color
+    let red = f32((packed >> 8u) & 0xFFu) / 255.0;
+    let green = f32((packed >> 16u) & 0xFFu) / 255.0;
+    let blue = f32((packed >> 24u) & 0xFFu) / 255.0;
+    return vec4<f32>(srgb2linear(red), srgb2linear(green), srgb2linear(blue), 1.0);
+}
+
+// Map a pixel position (origin at the top-left, Y growing downwards) to
+// normalized device coordinates (X: -1 left to +1 right, Y: +1 top to -1 bottom).
+fn pixel_to_ndc(pixel: vec2<f32>, screen: vec2<f32>) -> vec2<f32> {
+    let doubled = (pixel / screen) * 2.0;
+    return vec2<f32>(doubled.x - 1.0, 1.0 - doubled.y);
+}
+
+// ═══════════════════════════════════════════════════════════════════════════════
+// BACKGROUND VERTEX SHADER
+// ═══════════════════════════════════════════════════════════════════════════════
+
+@vertex
+fn vs_statusline_bg(
+    @builtin(vertex_index) vertex_index: u32,
+    @builtin(instance_index) instance_index: u32
+) -> VertexOutput {
+    // Start from a zeroed output; it is returned as-is (all-zero clip position)
+    // for instances that fall outside the status line.
+    var out: VertexOutput;
+    out.clip_position = vec4<f32>(0.0, 0.0, 0.0, 0.0);
+
+    if instance_index >= params.char_count {
+        return out;
+    }
+
+    let cell = cells[instance_index];
+
+    // The status line is a single row: instance N starts N cell widths from the
+    // left edge, at the vertical offset given in `params`.
+    let origin = vec2<f32>(f32(instance_index) * params.cell_width, params.y_offset);
+    let extent = vec2<f32>(params.cell_width, params.cell_height);
+
+    // Triangle-strip corner for vertices 0..3: bit 0 selects the right edge,
+    // bit 1 selects the bottom edge.
+    let corner = vec2<f32>(f32(vertex_index & 1u), f32((vertex_index >> 1u) & 1u));
+    let pixel_pos = origin + corner * extent;
+    let ndc_pos = pixel_to_ndc(pixel_pos, vec2<f32>(params.screen_width, params.screen_height));
+
+    // A default background is emitted fully transparent
+    var bg = resolve_color(cell.bg, false);
+    if (cell.bg & 0xFFu) == COLOR_TYPE_DEFAULT {
+        bg.a = 0.0;
+    }
+
+    out.clip_position = vec4<f32>(ndc_pos, 0.0, 1.0);
+    out.uv = vec2<f32>(0.0, 0.0);
+    out.fg_color = resolve_color(cell.fg, true);
+    out.bg_color = bg;
+    out.is_background = 1u;
+    out.is_colored_glyph = 0u;
+    return out;
+}
+
+// ═══════════════════════════════════════════════════════════════════════════════
+// GLYPH VERTEX SHADER
+// ═══════════════════════════════════════════════════════════════════════════════
+
+@vertex
+fn vs_statusline_glyph(
+    @builtin(vertex_index) vertex_index: u32,
+    @builtin(instance_index) instance_index: u32
+) -> VertexOutput {
+    // Start from a zeroed output; it is returned as-is (all-zero clip position)
+    // whenever this instance has nothing to draw.
+    var out: VertexOutput;
+    out.clip_position = vec4<f32>(0.0, 0.0, 0.0, 0.0);
+
+    // Instance lies past the end of the status line
+    if instance_index >= params.char_count {
+        return out;
+    }
+
+    // The top bit of sprite_idx flags a colored glyph; the rest is the sprite index
+    let cell = cells[instance_index];
+    let is_colored = (cell.sprite_idx & COLORED_GLYPH_FLAG) != 0u;
+    let sprite_idx = cell.sprite_idx & ~COLORED_GLYPH_FLAG;
+
+    // Sprite index 0 means the cell has no glyph
+    if sprite_idx == 0u {
+        return out;
+    }
+
+    // Sprites without a positive size are not drawn
+    let sprite = sprites[sprite_idx];
+    if sprite.size.x <= 0.0 || sprite.size.y <= 0.0 {
+        return out;
+    }
+
+    // Triangle-strip corner for vertices 0..3: bit 0 selects the right edge,
+    // bit 1 selects the bottom edge.
+    let corner = vec2<f32>(f32(vertex_index & 1u), f32((vertex_index >> 1u) & 1u));
+
+    // The glyph quad starts at the cell's top-left corner and spans the sprite size
+    let origin = vec2<f32>(f32(instance_index) * params.cell_width, params.y_offset);
+    let pixel_pos = origin + corner * vec2<f32>(sprite.size.x, sprite.size.y);
+    let ndc_pos = pixel_to_ndc(pixel_pos, vec2<f32>(params.screen_width, params.screen_height));
+
+    // sprite.uv packs the atlas rectangle as (x, y, width, height)
+    let atlas_uv = vec2<f32>(sprite.uv.x, sprite.uv.y) + corner * vec2<f32>(sprite.uv.z, sprite.uv.w);
+
+    out.clip_position = vec4<f32>(ndc_pos, 0.0, 1.0);
+    out.uv = atlas_uv;
+    out.fg_color = resolve_color(cell.fg, true);
+    out.bg_color = resolve_color(cell.bg, false);
+    out.is_background = 0u;
+    out.is_colored_glyph = u32(is_colored);
+    return out;
+}
+
+// ═══════════════════════════════════════════════════════════════════════════════
+// FRAGMENT SHADER
+// ═══════════════════════════════════════════════════════════════════════════════
+
+@fragment
+fn fs_statusline(in: VertexOutput) -> @location(0) vec4<f32> {
+    // Sample the atlas before any branching. textureSample uses implicit
+    // derivatives and therefore has to run in uniform control flow; the flags
+    // tested below are per-fragment inputs, so returning early on them first
+    // would put the sample in non-uniform control flow. Background quads simply
+    // discard the sampled value.
+    let glyph_sample = textureSample(atlas_texture, atlas_sampler, in.uv);
+
+    // Background quad - output the cell background as-is
+    if in.is_background == 1u {
+        return in.bg_color;
+    }
+
+    // Colored glyph (emoji) - use atlas color directly
+    if in.is_colored_glyph == 1u {
+        return glyph_sample;
+    }
+
+    // Regular glyph - the atlas alpha is the coverage; tint with the foreground
+    // color using the legacy gamma blending helper
+    let glyph_alpha = glyph_sample.a;
+    let adjusted_alpha = foreground_contrast_legacy(in.fg_color.rgb, glyph_alpha, in.bg_color.rgb);
+
+    return vec4<f32>(in.fg_color.rgb, in.fg_color.a * adjusted_alpha);
+}
@@ -3,6 +3,7 @@
 use crate::graphics::{GraphicsCommand, ImageStorage};
 use crate::keyboard::{query_response, KeyboardState};
 use crate::vt_parser::{CsiParams, Handler, Parser};
+use unicode_width::UnicodeWidthChar;

 /// Commands that the terminal can send to the application.
 /// These are triggered by special escape sequences from programs like Neovim.
@@ -35,6 +36,9 @@ pub struct Cell {
    pub bold: bool,
    pub italic: bool,
    pub underline: bool,
+    /// If true, this cell is the continuation of a wide (double-width) character.
+    /// The actual character is stored in the previous cell.
+    pub wide_continuation: bool,
 }

 impl Default for Cell {
@@ -46,6 +50,7 @@ impl Default for Cell {
            bold: false,
            italic: false,
            underline: false,
+            wide_continuation: false,
        }
    }
 }
@@ -621,6 +626,13 @@ impl Terminal {
        self.dirty_lines[0]
    }
    
+    /// Returns `true` while synchronized output mode is active, i.e. while rendering should be suppressed.
+    /// The flag is driven by CSI 2026 and by DCS pending mode (`=1s` sets it, `=2s` clears it).
+    #[inline]
+    pub fn is_synchronized(&self) -> bool {
+        self.synchronized_output
+    }
+    
    /// Get the actual grid row index for a visual row.
    #[inline]
    pub fn grid_row(&self, visual_row: usize) -> usize {
@@ -660,6 +672,7 @@ impl Terminal {
            bold: false,
            italic: false,
            underline: false,
+            wide_continuation: false,
        }
    }

@@ -1225,6 +1238,9 @@ impl Handler for Terminal {
        let mut cached_row = self.cursor_row;
        let mut grid_row = self.line_map[cached_row];
        
+        // Mark the initial line as dirty (like Kitty's init_text_loop_line)
+        self.mark_line_dirty(cached_row);
+        
        for &c in chars {
            match c {
                // Bell
@@ -1252,13 +1268,56 @@ impl Handler for Terminal {
                    // Update cache after line change
                    cached_row = self.cursor_row;
                    grid_row = self.line_map[cached_row];
+                    // Mark the new line as dirty
+                    self.mark_line_dirty(cached_row);
                }
                // Carriage return
                '\x0D' => {
                    self.cursor_col = 0;
                }
-                // Printable characters (including all Unicode)
-                c if c >= ' ' => {
+                // Fast path for printable ASCII (0x20-0x7E) - like Kitty
+                // ASCII is always width 1, never zero-width, never wide
+                c if c >= ' ' && c <= '~' => {
+                    // Handle wrap
+                    if self.cursor_col >= self.cols {
+                        if self.auto_wrap {
+                            self.cursor_col = 0;
+                            self.cursor_row += 1;
+                            if self.cursor_row > self.scroll_bottom {
+                                self.scroll_up(1);
+                                self.cursor_row = self.scroll_bottom;
+                            }
+                            cached_row = self.cursor_row;
+                            grid_row = self.line_map[cached_row];
+                            self.mark_line_dirty(cached_row);
+                        } else {
+                            self.cursor_col = self.cols - 1;
+                        }
+                    }
+                    
+                    // Write character directly - no wide char handling needed for ASCII
+                    self.grid[grid_row][self.cursor_col] = Cell {
+                        character: c,
+                        fg_color: self.current_fg,
+                        bg_color: self.current_bg,
+                        bold: self.current_bold,
+                        italic: self.current_italic,
+                        underline: self.current_underline,
+                        wide_continuation: false,
+                    };
+                    self.cursor_col += 1;
+                }
+                // Slow path for non-ASCII printable characters (including all Unicode)
+                c if c > '~' => {
+                    // Determine character width using Unicode Standard Annex #11
+                    let char_width = c.width().unwrap_or(1);
+                    
+                    // Skip zero-width characters (combining marks, etc.)
+                    if char_width == 0 {
+                        // TODO: Handle combining characters
+                        continue;
+                    }
+                    
                    // Handle wrap
                    if self.cursor_col >= self.cols {
                        if self.auto_wrap {
@@ -1271,16 +1330,47 @@ impl Handler for Terminal {
                            // Update cache after line change
                            cached_row = self.cursor_row;
                            grid_row = self.line_map[cached_row];
+                            // Mark the new line as dirty
+                            self.mark_line_dirty(cached_row);
                        } else {
                            self.cursor_col = self.cols - 1;
                        }
                    }
                    
+                    // For double-width characters at end of line, wrap first
+                    if char_width == 2 && self.cursor_col == self.cols - 1 {
+                        if self.auto_wrap {
+                            self.grid[grid_row][self.cursor_col] = Cell::default();
+                            self.cursor_col = 0;
+                            self.cursor_row += 1;
+                            if self.cursor_row > self.scroll_bottom {
+                                self.scroll_up(1);
+                                self.cursor_row = self.scroll_bottom;
+                            }
+                            cached_row = self.cursor_row;
+                            grid_row = self.line_map[cached_row];
+                            // Mark the new line as dirty
+                            self.mark_line_dirty(cached_row);
+                        } else {
+                            continue; // Can't fit
+                        }
+                    }
+                    
                    // Write character directly using cached grid_row
                    // Safety: ensure grid row has correct width (may differ after scrollback swap)
                    if self.grid[grid_row].len() != self.cols {
                        self.grid[grid_row].resize(self.cols, Cell::default());
                    }
+                    
+                    // Handle overwriting wide character cells
+                    if self.grid[grid_row][self.cursor_col].wide_continuation && self.cursor_col > 0 {
+                        self.grid[grid_row][self.cursor_col - 1] = Cell::default();
+                    }
+                    if char_width == 1 && self.cursor_col + 1 < self.cols 
+                       && self.grid[grid_row][self.cursor_col + 1].wide_continuation {
+                        self.grid[grid_row][self.cursor_col + 1] = Cell::default();
+                    }
+                    
                    self.grid[grid_row][self.cursor_col] = Cell {
                        character: c,
                        fg_color: self.current_fg,
@@ -1288,16 +1378,33 @@ impl Handler for Terminal {
                        bold: self.current_bold,
                        italic: self.current_italic,
                        underline: self.current_underline,
+                        wide_continuation: false,
                    };
                    self.cursor_col += 1;
+                    
+                    // For double-width, write continuation cell
+                    if char_width == 2 && self.cursor_col < self.cols {
+                        if self.cursor_col + 1 < self.cols 
+                           && self.grid[grid_row][self.cursor_col + 1].wide_continuation {
+                            self.grid[grid_row][self.cursor_col + 1] = Cell::default();
+                        }
+                        self.grid[grid_row][self.cursor_col] = Cell {
+                            character: ' ',
+                            fg_color: self.current_fg,
+                            bg_color: self.current_bg,
+                            bold: self.current_bold,
+                            italic: self.current_italic,
+                            underline: self.current_underline,
+                            wide_continuation: true,
+                        };
+                        self.cursor_col += 1;
+                    }
                }
                // Other control chars - ignore
                _ => {}
            }
        }
-        
-        // Mark all lines dirty at the end (we touched many lines)
-        self.mark_all_lines_dirty();
+        // Dirty lines are marked incrementally above - no need for mark_all_lines_dirty()
    }

    /// Handle control characters embedded in escape sequences.
@@ -1458,6 +1565,40 @@ impl Handler for Terminal {
        self.handle_apc(data);
    }

+    /// Handle a DCS (Device Control String) payload.
+    ///
+    /// Only the pending-mode commands are recognised: a payload that begins with
+    /// `=1s` turns synchronized output on and one that begins with `=2s` turns it
+    /// off again (an alternative to CSI 2026). Everything else is only logged.
+    fn dcs(&mut self, data: &[u8]) {
+        match data {
+            // Start pending mode (pause rendering)
+            [b'=', b'1', b's', ..] => {
+                if self.synchronized_output {
+                    log::warn!("Pending mode start requested while already in pending mode");
+                }
+                self.synchronized_output = true;
+                log::trace!("DCS pending mode started (=1s)");
+            }
+            // Stop pending mode (resume rendering)
+            [b'=', b'2', b's', ..] => {
+                if !self.synchronized_output {
+                    log::warn!("Pending mode stop requested while not in pending mode");
+                }
+                self.synchronized_output = false;
+                self.dirty = true; // Force a redraw
+                log::trace!("DCS pending mode stopped (=2s)");
+            }
+            // Pending-mode shaped payload with an unrecognised command byte
+            [b'=', _, b's', ..] => {
+                log::debug!("Unknown DCS pending mode command: {:?}", data);
+            }
+            // Any other DCS payload is not interpreted
+            _ => {
+                log::debug!("Unhandled DCS sequence: {:?}",
+                    std::str::from_utf8(data).unwrap_or("<invalid utf8>"));
+            }
+        }
+    }
+
    /// Handle a complete CSI sequence.
    fn csi(&mut self, params: &CsiParams) {
        let action = params.final_char as char;
@@ -1836,6 +1977,7 @@ impl Handler for Terminal {
                    bold: false,
                    italic: false,
                    underline: false,
+                    wide_continuation: false,
                };
            }
            self.mark_line_dirty(visual_row);
@@ -1845,8 +1987,22 @@ impl Handler for Terminal {

 impl Terminal {
    /// Print a single character at the cursor position.
+    /// Handles double-width characters (emoji, CJK) by occupying two cells.
    #[inline]
    fn print_char(&mut self, c: char) {
+        // Determine character width using Unicode Standard Annex #11
+        // Width 2 = double-width (emoji, CJK, etc.)
+        // Width 1 = normal width
+        // Width 0 = combining/non-spacing marks (handled separately)
+        let char_width = c.width().unwrap_or(1);
+        
+        // Skip zero-width characters (combining marks, etc.)
+        if char_width == 0 {
+            // TODO: Handle combining characters by attaching to previous cell
+            return;
+        }
+        
+        // Check if we need to wrap before printing
        if self.cursor_col >= self.cols {
            if self.auto_wrap {
                self.cursor_col = 0;
@@ -1860,7 +2016,41 @@ impl Terminal {
            }
        }
        
+        // For double-width characters, check if there's room
+        // If at the last column, we need to wrap first
+        if char_width == 2 && self.cursor_col == self.cols - 1 {
+            if self.auto_wrap {
+                // Write a space in the last column and wrap
+                let grid_row = self.line_map[self.cursor_row];
+                self.grid[grid_row][self.cursor_col] = Cell::default();
+                self.cursor_col = 0;
+                self.cursor_row += 1;
+                if self.cursor_row > self.scroll_bottom {
+                    self.scroll_up(1);
+                    self.cursor_row = self.scroll_bottom;
+                }
+            } else {
+                // Can't fit, don't print
+                return;
+            }
+        }
+
        let grid_row = self.line_map[self.cursor_row];
+        
+        // If we're overwriting a wide character's continuation cell,
+        // we need to clear the first cell of that wide character
+        if self.grid[grid_row][self.cursor_col].wide_continuation && self.cursor_col > 0 {
+            self.grid[grid_row][self.cursor_col - 1] = Cell::default();
+        }
+        
+        // If we're overwriting the first cell of a wide character,
+        // we need to clear its continuation cell
+        if char_width == 1 && self.cursor_col + 1 < self.cols 
+           && self.grid[grid_row][self.cursor_col + 1].wide_continuation {
+            self.grid[grid_row][self.cursor_col + 1] = Cell::default();
+        }
+        
+        // Write the character to the first cell
        self.grid[grid_row][self.cursor_col] = Cell {
            character: c,
            fg_color: self.current_fg,
@@ -1868,9 +2058,31 @@ impl Terminal {
            bold: self.current_bold,
            italic: self.current_italic,
            underline: self.current_underline,
+            wide_continuation: false,
        };
        self.mark_line_dirty(self.cursor_row);
        self.cursor_col += 1;
+        
+        // For double-width characters, write a continuation marker to the second cell
+        if char_width == 2 && self.cursor_col < self.cols {
+            // If the next cell is the first cell of another wide character,
+            // clear its continuation cell
+            if self.cursor_col + 1 < self.cols 
+               && self.grid[grid_row][self.cursor_col + 1].wide_continuation {
+                self.grid[grid_row][self.cursor_col + 1] = Cell::default();
+            }
+            
+            self.grid[grid_row][self.cursor_col] = Cell {
+                character: ' ',  // Placeholder - renderer will skip this
+                fg_color: self.current_fg,
+                bg_color: self.current_bg,
+                bold: self.current_bold,
+                italic: self.current_italic,
+                underline: self.current_underline,
+                wide_continuation: true,
+            };
+            self.cursor_col += 1;
+        }
    }

    /// Handle SGR (Select Graphic Rendition) parameters.
@@ -9,12 +9,14 @@
 //! 2. Pass decoded codepoints to the text handler, not raw bytes
 //! 3. Control characters (LF, CR, TAB, BS, etc.) are handled inline in text drawing
 //! 4. Only ESC triggers state machine transitions
+//! 5. Use SIMD-accelerated byte search for finding escape sequence terminators

 /// Maximum number of CSI parameters.
 pub const MAX_CSI_PARAMS: usize = 256;

-/// Maximum length of an OSC string.
-const MAX_OSC_LEN: usize = 4096;
+/// Maximum length of an OSC string. Kitty has no separate OSC limit, only the
+/// overall escape sequence limit, so this is defined as `MAX_ESCAPE_LEN` itself
+/// rather than repeating the number (the two can then never drift apart).
+const MAX_OSC_LEN: usize = MAX_ESCAPE_LEN;

 /// Maximum length of an escape sequence before we give up.
 const MAX_ESCAPE_LEN: usize = 262144; // 256KB like Kitty
@@ -103,10 +105,11 @@ impl Utf8Decoder {
            let prev_state = self.state;
            match decode_utf8(&mut self.state, &mut self.codep, byte) {
                UTF8_ACCEPT => {
-                    // Safe because we control the codepoint values from valid UTF-8
-                    if let Some(c) = char::from_u32(self.codep) {
-                        output.push(c);
-                    }
+                    // SAFETY: The DFA decoder guarantees valid Unicode codepoints when
+                    // state is ACCEPT. This is the same guarantee that Kitty relies on.
+                    // Using unchecked avoids a redundant validity check in the hot path.
+                    let c = unsafe { char::from_u32_unchecked(self.codep) };
+                    output.push(c);
                }
                UTF8_REJECT => {
                    // Invalid UTF-8 sequence
@@ -211,9 +214,13 @@ impl Default for CsiParams {

 impl CsiParams {
    /// Reset for a new CSI sequence.
+    /// Note: We don't zero the params/is_sub_param arrays since they're written before being read.
+    /// This avoids zeroing 1280 bytes on every CSI sequence.
+    #[inline]
    pub fn reset(&mut self) {
-        self.params = [0; MAX_CSI_PARAMS];
-        self.is_sub_param = [false; MAX_CSI_PARAMS];
+        // Don't zero arrays - individual elements are written before being read
+        // self.params = [0; MAX_CSI_PARAMS];      // Skip - saves 1024 bytes memset
+        // self.is_sub_param = [false; MAX_CSI_PARAMS]; // Skip - saves 256 bytes memset
        self.num_params = 0;
        self.primary = 0;
        self.secondary = 0;
@@ -672,96 +679,103 @@ impl Parser {
        handler.csi(&self.csi);
    }

-    /// Process OSC sequence bytes.
+    /// Process OSC sequence bytes using SIMD-accelerated terminator search.
+    /// Like Kitty's find_st_terminator + accumulate_st_terminated_esc_code.
     fn consume_osc<H: Handler>(&mut self, bytes: &[u8], pos: usize, handler: &mut H) -> usize {
-        let mut consumed = 0;
+        let remaining = &bytes[pos..];
         
-        while pos + consumed < bytes.len() {
-            let ch = bytes[pos + consumed];
-            consumed += 1;
-            self.escape_len += 1;
+        // Use SIMD-accelerated search to find BEL (0x07), ESC (0x1B), or C1 ST (0x9C)
+        // memchr3 reports the first of the three bytes; an ESC hit is then checked for a following '\' (ESC \ = ST)
+        // All three candidates are located in this single pass over the remaining bytes
+        if let Some(term_pos) = memchr::memchr3(0x07, 0x1B, 0x9C, remaining) {
+            let terminator = remaining[term_pos];
             
-            // Check for max length
-            if self.escape_len > MAX_ESCAPE_LEN || self.osc_buffer.len() > MAX_OSC_LEN {
+            // Check max length before accepting
+            if self.escape_len + term_pos > MAX_ESCAPE_LEN || self.osc_buffer.len() + term_pos > MAX_OSC_LEN {
                 log::debug!("OSC sequence too long, aborting");
                 self.state = State::Normal;
-                return consumed;
+                return remaining.len(); // NOTE(review): also discards any bytes that follow the terminator - confirm this is intended
             }
             
-            match ch {
-                // BEL terminates OSC
+            match terminator {
                 0x07 => {
+                    // BEL terminator - copy data in bulk and dispatch
+                    self.osc_buffer.extend_from_slice(&remaining[..term_pos]);
                     handler.osc(&self.osc_buffer);
                     self.state = State::Normal;
-                    return consumed;
+                    self.escape_len += term_pos + 1;
+                    return term_pos + 1;
+                }
+                0x9C => {
+                    // C1 ST terminator - copy data in bulk and dispatch
+                    self.osc_buffer.extend_from_slice(&remaining[..term_pos]);
+                    handler.osc(&self.osc_buffer);
+                    self.state = State::Normal;
+                    self.escape_len += term_pos + 1;
+                    return term_pos + 1;
                 }
-                // ESC \ (ST) terminates OSC
                 0x1B => {
-                    // Need to peek at next byte
-                    if pos + consumed < bytes.len() && bytes[pos + consumed] == b'\\' {
-                        consumed += 1;
+                    // ESC found - check if followed by \ for ST
+                    if term_pos + 1 < remaining.len() && remaining[term_pos + 1] == b'\\' {
+                        // ESC \ (ST) terminator
+                        self.osc_buffer.extend_from_slice(&remaining[..term_pos]);
                         handler.osc(&self.osc_buffer);
                         self.state = State::Normal;
-                        return consumed;
-                    } else {
-                        // ESC not followed by \, dispatch what we have
+                        self.escape_len += term_pos + 2;
+                        return term_pos + 2;
+                    } else if term_pos + 1 < remaining.len() {
+                        // ESC not followed by \ - this is a new escape sequence
+                        // Copy everything before ESC and transition to Escape state
+                        self.osc_buffer.extend_from_slice(&remaining[..term_pos]);
                         handler.osc(&self.osc_buffer);
                         self.state = State::Escape;
-                        return consumed;
+                        self.escape_len += term_pos + 1;
+                        return term_pos + 1;
+                    } else {
+                        // ESC at end of buffer, need more data
+                        // Copy everything before ESC, keep ESC for next parse
+                        self.osc_buffer.extend_from_slice(&remaining[..term_pos]);
+                        self.escape_len += term_pos;
+                        return term_pos; // NOTE(review): this is 0 when ESC is the first remaining byte - confirm the caller copes with zero progress
                     }
                 }
-                // C1 ST (0x9C) terminates OSC
-                0x9C => {
-                    handler.osc(&self.osc_buffer);
-                    self.state = State::Normal;
-                    return consumed;
-                }
-                _ => {
-                    self.osc_buffer.push(ch);
-                }
+                _ => unreachable!(),
+            }
+        } else {
+            // No terminator found - check max length
+            if self.escape_len + remaining.len() > MAX_ESCAPE_LEN || self.osc_buffer.len() + remaining.len() > MAX_OSC_LEN {
+                log::debug!("OSC sequence too long, aborting");
+                self.state = State::Normal;
+                return remaining.len();
             }
-        }
             
-        consumed
+            // Buffer all remaining bytes for next parse call
+            self.osc_buffer.extend_from_slice(remaining);
+            self.escape_len += remaining.len();
+            return remaining.len();
+        }
     }

-    /// Process DCS/APC/PM/SOS sequence bytes (string commands terminated by ST).
+    /// Process DCS/APC/PM/SOS sequence bytes using SIMD-accelerated terminator search.
+    /// Like Kitty's find_st_terminator + accumulate_st_terminated_esc_code.
    fn consume_string_command<H: Handler>(&mut self, bytes: &[u8], pos: usize, handler: &mut H) -> usize {
-        let mut consumed = 0;
+        let remaining = &bytes[pos..];
        
-        while pos + consumed < bytes.len() {
-            let ch = bytes[pos + consumed];
-            consumed += 1;
-            self.escape_len += 1;
+        // Use SIMD-accelerated search to find ESC (0x1B) or C1 ST (0x9C)
+        if let Some(term_pos) = memchr::memchr2(0x1B, 0x9C, remaining) {
+            let terminator = remaining[term_pos];
            
-            // Check for max length
-            if self.escape_len > MAX_ESCAPE_LEN {
+            // Check max length before accepting
+            if self.escape_len + term_pos > MAX_ESCAPE_LEN {
                log::debug!("String command too long, aborting");
                self.state = State::Normal;
-                return consumed;
+                return remaining.len();
            }
            
-            match ch {
-                // ESC \ (ST) terminates
-                0x1B => {
-                    if pos + consumed < bytes.len() && bytes[pos + consumed] == b'\\' {
-                        consumed += 1;
-                        // Dispatch based on original state
-                        match self.state {
-                            State::Dcs => handler.dcs(&self.string_buffer),
-                            State::Apc => handler.apc(&self.string_buffer),
-                            State::Pm => handler.pm(&self.string_buffer),
-                            State::Sos => handler.sos(&self.string_buffer),
-                            _ => {}
-                        }
-                        self.state = State::Normal;
-                        return consumed;
-                    } else {
-                        self.string_buffer.push(ch);
-                    }
-                }
-                // C1 ST (0x9C) terminates
+            match terminator {
                0x9C => {
+                    // C1 ST terminator - copy data in bulk and dispatch
+                    self.string_buffer.extend_from_slice(&remaining[..term_pos]);
                    match self.state {
                        State::Dcs => handler.dcs(&self.string_buffer),
                        State::Apc => handler.apc(&self.string_buffer),
@@ -770,15 +784,55 @@ impl Parser {
                        _ => {}
                    }
                    self.state = State::Normal;
-                    return consumed;
+                    self.escape_len += term_pos + 1;
+                    return term_pos + 1;
                }
-                _ => {
-                    self.string_buffer.push(ch);
+                0x1B => {
+                    // ESC found - check if followed by \ for ST
+                    if term_pos + 1 < remaining.len() && remaining[term_pos + 1] == b'\\' {
+                        // ESC \ (ST) terminator
+                        self.string_buffer.extend_from_slice(&remaining[..term_pos]);
+                        match self.state {
+                            State::Dcs => handler.dcs(&self.string_buffer),
+                            State::Apc => handler.apc(&self.string_buffer),
+                            State::Pm => handler.pm(&self.string_buffer),
+                            State::Sos => handler.sos(&self.string_buffer),
+                            _ => {}
+                        }
+                        self.state = State::Normal;
+                        self.escape_len += term_pos + 2;
+                        return term_pos + 2;
+                    } else if term_pos + 1 < remaining.len() {
+                        // ESC not followed by \ - include ESC in data and continue
+                        // (Unlike OSC, string commands include raw ESC that isn't ST)
+                        self.string_buffer.extend_from_slice(&remaining[..=term_pos]);
+                        self.escape_len += term_pos + 1;
+                        // Continue searching from after this ESC
+                        let consumed = term_pos + 1;
+                        return consumed + self.consume_string_command(bytes, pos + consumed, handler);
+                    } else {
+                        // ESC at end of buffer, need more data
+                        // Copy everything before ESC, keep ESC for next parse
+                        self.string_buffer.extend_from_slice(&remaining[..term_pos]);
+                        self.escape_len += term_pos;
+                        return term_pos;
+                    }
                }
+                _ => unreachable!(),
+            }
+        } else {
+            // No terminator found - check max length
+            if self.escape_len + remaining.len() > MAX_ESCAPE_LEN {
+                log::debug!("String command too long, aborting");
+                self.state = State::Normal;
+                return remaining.len();
            }
-        }
            
-        consumed
+            // Buffer all remaining bytes for next parse call
+            self.string_buffer.extend_from_slice(remaining);
+            self.escape_len += remaining.len();
+            return remaining.len();
+        }
    }
 }