tweak-3.01/0000755000175300017530000000000010433030011013033 5ustar simonsimon00000000000000tweak-3.01/LICENCE0000644000175300017530000000205310433027777014051 0ustar simonsimon00000000000000Tweak is copyright 1994-2004 Simon Tatham. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
tweak-3.01/actions.c0000644000175300017530000003735410433027777014704 0ustar simonsimon00000000000000#include "tweak.h" #include #include #include #include static void act_exit (void); static void act_save (void); static void act_exitsave (void); static void act_top (void); static void act_pgup (void); static void act_up (void); static void act_home (void); static void act_left (void); static void act_right (void); static void act_end (void); static void act_down (void); static void act_pgdn (void); static void act_bottom (void); static void act_togins (void); static void act_chmode (void); extern void act_self_ins (void); /* this one must be external */ static void act_delete (void); static void act_delch (void); static void act_mark (void); static void act_cut (void); static void act_copy (void); static void act_paste (void); static void act_susp (void); static void act_goto (void); static void act_togstat (void); static void act_search (void); static void act_search_backwards (void); static void act_recentre (void); static void act_width (void); static void act_offset (void); #ifdef TEST_BUFFER static void act_diagnostics (void); #endif static Search *last_search = NULL; keyact parse_action (char *name) { char *names[] = { "exit", "top-of-file", "page-up", "move-up", "begin-line", "move-left", "move-right", "end-line", "move-down", "page-down", "bottom-of-file", "toggle-insert", "change-mode", "delete-left", "delete-right", "mark-place", "cut", "copy", "paste", "suspend", "goto-position", "toggle-status", "search", "search-back", "save-file", "exit-and-save", "screen-recentre", "new-width", "new-offset" #ifdef TEST_BUFFER , "diagnostics" #endif }; keyact actions[] = { act_exit, act_top, act_pgup, act_up, act_home, act_left, act_right, act_end, act_down, act_pgdn, act_bottom, act_togins, act_chmode, act_delete, act_delch, act_mark, act_cut, act_copy, act_paste, act_susp, act_goto, act_togstat, act_search, act_search_backwards, act_save, act_exitsave, 
act_recentre, act_width, act_offset #ifdef TEST_BUFFER , act_diagnostics #endif }; int i; for (i=0; i= 'a' && c <= 'z') c += 'A'-'a'; } while (c != 'Y' && c != 'N' && c != '\007'); if (c == 'Y') { act_save(); if (modified) return; /* couldn't save, so don't quit */ draw_scr(); /* update the ** on status line! */ } else if (c == '\007') { return; /* don't even quit */ } } finished = TRUE; } static void act_save(void) { static int backed_up = FALSE; if (!backed_up) { if (!backup_file()) { display_beep(); strcpy (message, "Unable to back up file!"); return; } backed_up = TRUE; } if (!save_file()) { display_beep(); strcpy (message, "Unable to save file!"); return; } modified = FALSE; } static void act_exitsave(void) { act_save(); draw_scr(); /* update ** on status line */ act_exit(); } static void act_top (void) { cur_pos = top_pos = 0; edit_type = !!edit_type; } static void act_pgup(void) { cur_pos -= (scrlines-1)*width; if (cur_pos < 0) { cur_pos = 0; edit_type = !!edit_type; } if (top_pos > cur_pos) top_pos = begline(cur_pos); } static void act_up(void) { cur_pos -= width; if (cur_pos < 0) { cur_pos = 0; edit_type = !!edit_type; } if (top_pos > cur_pos) top_pos = begline(cur_pos); } static void act_home(void) { cur_pos = begline(cur_pos); if (cur_pos < 0) cur_pos = 0; if (top_pos > cur_pos) top_pos = begline(cur_pos); edit_type = !!edit_type; } static void act_left(void) { if (edit_type == 2) { edit_type = 1; return; } else { cur_pos--; edit_type = 2*!!edit_type; if (cur_pos < 0) { cur_pos = 0; edit_type = !!edit_type; } if (top_pos > cur_pos) top_pos = begline(cur_pos); } } static void act_right(void) { fileoffset_t new_top; if (edit_type == 1) { if (cur_pos < file_size) edit_type = 2; return; } else { cur_pos++; if (cur_pos > file_size) cur_pos = file_size; new_top = cur_pos - (scrlines-1) * width; if (new_top < 0) new_top = 0; new_top = begline(new_top); if (top_pos < new_top) top_pos = new_top; edit_type = !!edit_type; } } static void act_end(void) { 
fileoffset_t new_top; cur_pos = endline(cur_pos); edit_type = !!edit_type; if (cur_pos >= file_size) cur_pos = file_size; new_top = cur_pos - (scrlines-1) * width; if (new_top < 0) new_top = 0; new_top = begline(new_top); if (top_pos < new_top) top_pos = new_top; } static void act_down(void) { fileoffset_t new_top; cur_pos += width; if (cur_pos >= file_size) { cur_pos = file_size; edit_type = !!edit_type; } new_top = cur_pos - (scrlines-1) * width; if (new_top < 0) new_top = 0; new_top = begline(new_top); if (top_pos < new_top) top_pos = new_top; } static void act_pgdn(void) { fileoffset_t new_top; cur_pos += (scrlines-1) * width; if (cur_pos >= file_size) { cur_pos = file_size; edit_type = !!edit_type; } new_top = cur_pos - (scrlines-1) * width; if (new_top < 0) new_top = 0; new_top = begline(new_top); if (top_pos < new_top) top_pos = new_top; } static void act_bottom (void) { fileoffset_t new_top; cur_pos = file_size; edit_type = !!edit_type; new_top = cur_pos - (scrlines-1) * width; if (new_top < 0) new_top = 0; new_top = begline(new_top); if (top_pos < new_top) top_pos = new_top; } static void act_togins(void) { if (look_mode || fix_mode) { display_beep(); sprintf(message, "Can't engage Insert mode when in %s mode", (look_mode ? "LOOK" : "FIX")); insert_mode = FALSE; /* safety! 
*/ } else insert_mode = !insert_mode; } static void act_chmode(void) { if (ascii_enabled) edit_type = !edit_type; /* 0 -> 1, [12] -> 0 */ else if (edit_type == 0) /* just in case */ edit_type = 1; } void act_self_ins(void) { int insert = insert_mode; unsigned char c; if (look_mode) { display_beep(); strcpy (message, "Can't modify file in LOOK mode"); return; } if (edit_type) { if (last_char >= '0' && last_char <= '9') last_char -= '0'; else if (last_char >= 'A' && last_char <= 'F') last_char -= 'A'-10; else if (last_char >= 'a' && last_char <= 'f') last_char -= 'a'-10; else { display_beep(); strcpy(message, "Not a valid character when in hex editing mode"); return; } } if ( (!insert || edit_type == 2) && cur_pos == file_size) { display_beep(); strcpy(message, "End of file reached"); return; } switch (edit_type) { case 0: /* ascii mode */ c = last_char; break; case 1: /* hex, first digit */ if (insert) c = 0; else buf_fetch_data(filedata, &c, 1, cur_pos); c &= 0xF; c |= 16 * last_char; break; case 2: /* hex, second digit */ buf_fetch_data(filedata, &c, 1, cur_pos); c &= 0xF0; c |= last_char; insert = FALSE; break; } if (insert) { buf_insert_data(filedata, &c, 1, cur_pos); file_size++; modified = TRUE; } else if (cur_pos < file_size) { buf_overwrite_data(filedata, &c, 1, cur_pos); modified = TRUE; } else { display_beep(); strcpy(message, "End of file reached"); } act_right(); } static void act_delete(void) { if (!insert_mode || (edit_type!=2 && cur_pos==0)) { display_beep(); strcpy (message, "Can't delete while not in Insert mode"); } else if (cur_pos > 0 || edit_type == 2) { act_left(); buf_delete (filedata, 1, cur_pos); file_size--; edit_type = !!edit_type; modified = TRUE; } } static void act_delch(void) { if (!insert_mode) { display_beep(); strcpy (message, "Can't delete while not in Insert mode"); } else if (cur_pos < file_size) { buf_delete (filedata, 1, cur_pos); file_size--; edit_type = !!edit_type; modified = TRUE; } } static void act_mark (void) { if 
(look_mode) { display_beep(); strcpy (message, "Can't cut or paste in LOOK mode"); marking = FALSE; /* safety */ return; } marking = !marking; mark_point = cur_pos; } static void act_cut (void) { fileoffset_t marktop, marksize; if (!marking || mark_point==cur_pos) { display_beep(); strcpy (message, "Set mark first"); return; } if (!insert_mode) { display_beep(); strcpy (message, "Can't cut while not in Insert mode"); return; } marktop = cur_pos; marksize = mark_point - cur_pos; if (marksize < 0) { marktop += marksize; marksize = -marksize; } if (cutbuffer) buf_free (cutbuffer); cutbuffer = buf_cut (filedata, marksize, marktop); file_size -= marksize; cur_pos = marktop; if (cur_pos < 0) cur_pos = 0; if (top_pos > cur_pos) top_pos = begline(cur_pos); edit_type = !!edit_type; modified = TRUE; marking = FALSE; } static void act_copy (void) { fileoffset_t marktop, marksize; if (!marking) { display_beep(); strcpy (message, "Set mark first"); return; } marktop = cur_pos; marksize = mark_point - cur_pos; if (marksize < 0) { marktop += marksize; marksize = -marksize; } if (cutbuffer) buf_free (cutbuffer); cutbuffer = buf_copy (filedata, marksize, marktop); marking = FALSE; } static void act_paste (void) { fileoffset_t cutsize, new_top; cutsize = buf_length (cutbuffer); if (!insert_mode) { if (cur_pos + cutsize > file_size) { display_beep(); strcpy (message, "Too close to end of file to paste"); return; } buf_delete (filedata, cutsize, cur_pos); file_size -= cutsize; } buf_paste (filedata, cutbuffer, cur_pos); modified = TRUE; cur_pos += cutsize; file_size += cutsize; edit_type = !!edit_type; new_top = cur_pos - (scrlines-1) * width; if (new_top < 0) new_top = 0; new_top = begline(new_top); if (top_pos < new_top) top_pos = new_top; } static void act_susp (void) { suspend(); } static void act_goto (void) { char buffer[80]; fileoffset_t position, new_top; int error; if (!get_str("Enter position to go to: ", buffer, FALSE)) return; /* user break */ position = parse_num (buffer, 
&error); if (error) { display_beep(); strcpy (message, "Unable to parse position value"); return; } if (position < 0 || position > file_size) { display_beep(); strcpy (message, "Position is outside bounds of file"); return; } cur_pos = position; edit_type = !!edit_type; new_top = cur_pos - (scrlines-1) * width; if (new_top < 0) new_top = 0; new_top = begline(new_top); if (top_pos > cur_pos) top_pos = begline(cur_pos); if (top_pos < new_top) top_pos = new_top; } static void act_togstat (void) { if (statfmt == decstatus) statfmt = hexstatus; else statfmt = decstatus; } static int search_prompt(char *withdef, char *withoutdef) { char buffer[80]; int len; if (!get_str(last_search ? withdef : withoutdef, buffer, TRUE)) return 0; /* user break */ if (!last_search && !*buffer) { strcpy (message, "Search aborted."); return 0; } if (!*buffer) { len = last_search->len; } else { len = parse_quoted (buffer); if (len == -1) { display_beep(); strcpy (message, "Invalid escape sequence in search string"); return 0; } if (last_search) free_search(last_search); last_search = build_search (buffer, len); } return 1; } static void act_search (void) { int len; fileoffset_t posn, dfapos; DFA dfa; static unsigned char sblk[SEARCH_BLK]; static char withdef[] = "Search forward (default=last): "; static char withoutdef[] = "Search forward: "; if (!search_prompt(withdef, withoutdef)) return; dfa = last_search->forward; len = last_search->len; dfapos = 0; for (posn = cur_pos+1; posn < file_size; posn++) { unsigned char *q; int size = SEARCH_BLK; if (size > file_size-posn) size = file_size-posn; buf_fetch_data (filedata, sblk, size, posn); q = sblk; while (size--) { posn++; dfapos = dfa[dfapos][*q++]; if (dfapos == len) { fileoffset_t new_top; cur_pos = posn - len; edit_type = !!edit_type; new_top = cur_pos - (scrlines-1) * width; new_top = begline(new_top); if (top_pos < new_top) top_pos = new_top; return; } } } strcpy (message, "Not found."); } static void act_search_backwards (void) { int 
len; fileoffset_t posn, dfapos; DFA dfa; static unsigned char sblk[SEARCH_BLK]; static char withdef[] = "Search backward (default=last): "; static char withoutdef[] = "Search backward: "; if (!search_prompt(withdef, withoutdef)) return; dfa = last_search->reverse; len = last_search->len; dfapos = 0; posn = cur_pos + len - 1; if (posn >= file_size) posn = file_size; for (; posn >= 0; posn--) { unsigned char *q; int size = SEARCH_BLK; if (size > posn) size = posn; buf_fetch_data (filedata, sblk, size, posn-size); q = sblk + size; while (size--) { posn--; dfapos = dfa[dfapos][*--q]; if (dfapos == len) { fileoffset_t new_top; cur_pos = posn; edit_type = !!edit_type; new_top = cur_pos - (scrlines-1) * width; new_top = begline(new_top); if (top_pos > new_top) top_pos = new_top; return; } } } strcpy (message, "Not found."); } static void act_recentre (void) { top_pos = cur_pos - (display_rows-2)/2 * width; if (top_pos < 0) top_pos = 0; top_pos = begline(top_pos); } static void act_width (void) { char buffer[80]; char prompt[80]; fileoffset_t w; fileoffset_t new_top; int error; sprintf (prompt, "Enter screen width in bytes (now %"OFF"d): ", width); if (!get_str (prompt, buffer, FALSE)) return; w = parse_num (buffer, &error); if (error) { display_beep(); strcpy (message, "Unable to parse width value"); return; } if (w > 0) { width = w; fix_offset(); new_top = cur_pos - (scrlines-1) * width; new_top = begline(new_top); if (top_pos < new_top) top_pos = new_top; } } static void act_offset (void) { char buffer[80]; char prompt[80]; fileoffset_t o; fileoffset_t new_top; int error; sprintf (prompt, "Enter start-of-file offset in bytes (now %"OFF"d): ", realoffset); if (!get_str (prompt, buffer, FALSE)) return; o = parse_num (buffer, &error); if (error) { display_beep(); strcpy (message, "Unable to parse offset value"); return; } if (o >= 0) { realoffset = o; fix_offset(); new_top = cur_pos - (scrlines-1) * width; new_top = begline(new_top); if (top_pos < new_top) top_pos = 
new_top; } } #ifdef TEST_BUFFER static void act_diagnostics(void) { extern void buffer_diagnostic(buffer *buf, char *title); buffer_diagnostic(filedata, "filedata"); buffer_diagnostic(cutbuffer, "cutbuffer"); } #endif tweak-3.01/btree.c0000644000175300017530000021321010147367204014322 0ustar simonsimon00000000000000/* * Flexible B-tree implementation. Supports reference counting for * copy-on-write, user-defined node properties, and variable * degree. * * This file is copyright 2001,2004 Simon Tatham. * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, * copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following * conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL SIMON TATHAM BE LIABLE FOR * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /* * TODO: * * Possibly TODO in future, but may not be sensible in this code * architecture: * * - user write properties. * * this all happens during write_unlock(), I think. Except * that we'll now need an _internal_ write_unlock() which * does everything except user write properties. Sigh. 
* * note that we also need a transform function for elements * (rot13 will certainly require this, and reverse will * require it if the elements themselves are in some way * reversible). * * Still untested: * - searching on user read properties. * - user-supplied copy function. * - bt_add when element already exists. * - bt_del when element doesn't. * - splitpos with before==TRUE. * - split() on sorted elements (but it should be fine). * - bt_replace, at all (it won't be useful until we get user read * properties). * - bt_index_w (won't make much sense until we start using * user-supplied copy fn). */ #include #include #include #ifdef TEST #include #include #endif #include "btree.h" #ifdef TEST static void set_invalid_property(void *prop); #endif /* ---------------------------------------------------------------------- * Type definitions. */ typedef union nodecomponent nodecomponent; typedef nodecomponent *nodeptr; /* * For type-checking purposes, and to ensure I don't accidentally * confuse node_addr with node_ptr during implementation, I'll * define node_addr for the in-memory case as being a struct * containing only a nodeptr. * * This unfortunately needs to go in btree.h so that clients * writing user properties can know about the nodecomponent * structure. */ typedef struct { nodeptr p; } node_addr; /* * A B-tree node is a horrible thing when you're trying to be * flexible. It is of variable size, and it contains a variety of * distinct types of thing: nodes, elements, some counters, some * user-defined properties ... it's a horrible thing. So we define * it as an array of unions, each union being either an `int' or a * `bt_element_t' or a `node_addr'... */ union nodecomponent { int i; node_addr na; bt_element_t ep; }; static const node_addr NODE_ADDR_NULL = { NULL }; /* * The array of nodecomponents will take the following form: * * - (maxdegree) child pointers. * - (maxdegree-1) element pointers. 
* - one subtree count (current number of child pointers that are * valid; note that `valid' doesn't imply non-NULL). * - one element count. * - one reference count. */ struct btree { int mindegree; /* min number of subtrees */ int maxdegree; /* max number of subtrees */ int depth; /* helps to store this explicitly */ node_addr root; cmpfn_t cmp; copyfn_t copy; freefn_t freeelt; int propsize, propalign, propoffset; propmakefn_t propmake; propmergefn_t propmerge; void *userstate; /* passed to all user functions */ }; /* ---------------------------------------------------------------------- * Memory management routines and other housekeeping. */ #ifdef HAVE_ALLOCA # define ialloc(x) alloca(x) # define ifree(x) #else # define ialloc(x) smalloc(x) # define ifree(x) sfree(x) #endif #define new1(t) ( (t *) smalloc(sizeof(t)) ) #define newn(t, n) ( (t *) smalloc((n) * sizeof(t)) ) #define inew1(t) ( (t *) ialloc(sizeof(t)) ) #define inewn(t, n) ( (t *) ialloc((n) * sizeof(t)) ) static void *smalloc(size_t size) { void *ret = malloc(size); if (!ret) abort(); return ret; } static void sfree(void *p) { free(p); } #ifndef FALSE #define FALSE 0 #endif #ifndef TRUE #define TRUE 1 #endif /* We could probably do with more compiler-specific branches of this #if. */ #if defined(__GNUC__) #define INLINE __inline #else #define INLINE #endif /* Hooks into the low-level code for test purposes. */ #ifdef TEST void testlock(int write, int set, nodeptr n); #else #define testlock(w,s,n) #endif /* ---------------------------------------------------------------------- * Low-level helper routines, which understand the in-memory format * of a node and know how to read-lock and write-lock. */ /* * Read and write the node_addr of a child. */ static INLINE node_addr bt_child(btree *bt, nodeptr n, int index) { return n[index].na; } static INLINE void bt_set_child(btree *bt, nodeptr n, int index, node_addr value) { n[index].na = value; } /* * Read and write the address of an element. 
*/ static INLINE bt_element_t bt_element(btree *bt, nodeptr n, int index) { return n[bt->maxdegree + index].ep; } static INLINE void bt_set_element(btree *bt, nodeptr n, int index, bt_element_t value) { n[bt->maxdegree + index].ep = value; } /* * Give the number of subtrees currently present in an element. */ static INLINE int bt_subtrees(btree *bt, nodeptr n) { return n[bt->maxdegree*2-1].i; } #define bt_elements(bt,n) (bt_subtrees(bt,n) - 1) /* * Give the minimum and maximum number of subtrees allowed in a * node. */ static INLINE int bt_min_subtrees(btree *bt) { return bt->mindegree; } static INLINE int bt_max_subtrees(btree *bt) { return bt->maxdegree; } /* * Return the count of items, and the user properties, in a * particular subtree of a node. * * Note that in the in-memory form of the tree, this breaks the * read-locking semantics, by reading the counts out of the child * nodes without bothering to lock them. We're allowed to do this * because this function is implemented at the same very low level * as the implementation of bt_read_lock(), so we're allowed to * know that read locking actually doesn't do anything. */ static INLINE int bt_child_count(btree *bt, nodeptr n, int index) { if (n[index].na.p) return n[index].na.p[bt->maxdegree*2].i; else return 0; } static INLINE void *bt_child_prop(btree *bt, nodeptr n, int index) { if (n[index].na.p) return (char *)n[index].na.p + bt->propoffset; else return NULL; } /* * Return the count of items in a whole node. */ static INLINE int bt_node_count(btree *bt, nodeptr n) { return n[bt->maxdegree*2].i; } /* * Determine whether a node is a leaf node or not. */ static INLINE int bt_is_leaf(btree *bt, nodeptr n) { return n[0].na.p == NULL; } /* * Create a new write-locked node, and return a pointer to it. 
*/ static INLINE nodeptr bt_new_node(btree *bt, int nsubtrees) { nodeptr ret = (nodecomponent *)smalloc(bt->propoffset + bt->propsize); ret[bt->maxdegree*2-1].i = nsubtrees; ret[bt->maxdegree*2+1].i = 1; /* reference count 1 */ #ifdef TEST set_invalid_property(ret + bt->maxdegree * 2 + 2); #else memset((char *)ret + bt->propoffset, 0, bt->propsize); #endif testlock(TRUE, TRUE, ret); return ret; } /* * Destroy a node (must be write-locked). */ static INLINE void bt_destroy_node(btree *bt, nodeptr n) { testlock(TRUE, FALSE, n); /* Free the property. */ bt->propmerge(bt->userstate, NULL, NULL, n + bt->maxdegree * 2 + 2); sfree(n); } /* * Take an existing node and prepare to re-use it in a new context. */ static INLINE nodeptr bt_reuse_node(btree *bt, nodeptr n, int nsubtrees) { testlock(TRUE, FALSE, n); testlock(TRUE, TRUE, n); n[bt->maxdegree*2-1].i = nsubtrees; return n; } /* * Return an extra reference to a node, for purposes of cloning. So * we have to update its reference count as well. */ static INLINE node_addr bt_ref_node(btree *bt, node_addr n) { if (n.p) n.p[bt->maxdegree*2+1].i++; return n; } /* * Drop a node's reference count, for purposes of freeing. Returns * the new reference count. Typically this will be tested against * zero to see if the node needs to be physically freed; hence a * NULL node_addr causes a return of 1 (because this isn't * necessary). */ static INLINE int bt_unref_node(btree *bt, node_addr n) { if (n.p) { n.p[bt->maxdegree*2+1].i--; return n.p[bt->maxdegree*2+1].i; } else return 1; /* a NULL node is considered OK */ } /* * Clone a node during write unlocking, if its reference count is * more than one. 
*/ static nodeptr bt_clone_node(btree *bt, nodeptr n) { int i; nodeptr ret = (nodecomponent *)smalloc(bt->propoffset + bt->propsize); memcpy(ret, n, (bt->maxdegree*2+1) * sizeof(nodecomponent)); if (bt->copy) { for (i = 0; i < bt_elements(bt, ret); i++) { bt_element_t *e = bt_element(bt, ret, i); bt_set_element(bt, ret, i, bt->copy(bt->userstate, e)); } } ret[bt->maxdegree*2+1].i = 1; /* clone has reference count 1 */ n[bt->maxdegree*2+1].i--; /* drop original's ref count by one */ /* * At this low level, we're allowed to reach directly into the * subtrees to fiddle with their reference counts without * having to lock them. */ for (i = 0; i < bt_subtrees(bt, ret); i++) { node_addr na = bt_child(bt, ret, i); if (na.p) na.p[bt->maxdegree*2+1].i++; /* inc ref count of each child */ } /* * Copy the user property explicitly (in case it contains a * pointer to an allocated area). */ memset((char *)ret + bt->propoffset, 0, bt->propsize); bt->propmerge(bt->userstate, NULL, n + bt->maxdegree * 2 + 2, ret + bt->maxdegree * 2 + 2); return ret; } /* * Return the node_addr for a currently locked node. NB that this * means node movement must take place during _locking_ rather than * unlocking! */ static INLINE node_addr bt_node_addr(btree *bt, nodeptr n) { node_addr ret; ret.p = n; return ret; } /* * The bt_write_lock and bt_read_lock functions should gracefully * handle being asked to write-lock a null node pointer, and just * return a null nodeptr. 
*/ static INLINE nodeptr bt_write_lock_child(btree *bt, nodeptr a, int index) { node_addr addr = bt_child(bt, a, index); if (addr.p && addr.p[bt->maxdegree*2+1].i > 1) { nodeptr clone = bt_clone_node(bt, addr.p); bt_set_child(bt, a, index, bt_node_addr(bt, clone)); testlock(TRUE, TRUE, clone); return clone; } testlock(TRUE, TRUE, addr.p); return addr.p; } static INLINE nodeptr bt_write_lock_root(btree *bt) { node_addr addr = bt->root; if (addr.p && addr.p[bt->maxdegree*2+1].i > 1) { nodeptr clone = bt_clone_node(bt, addr.p); bt->root = bt_node_addr(bt, clone); testlock(TRUE, TRUE, clone); return clone; } testlock(TRUE, TRUE, addr.p); return addr.p; } static INLINE nodeptr bt_read_lock(btree *bt, node_addr a) { testlock(FALSE, TRUE, a.p); return a.p; } #define bt_read_lock_root(bt) (bt_read_lock(bt, (bt)->root)) #define bt_read_lock_child(bt,a,index) (bt_read_lock(bt,bt_child(bt,a,index))) static INLINE void bt_write_relock(btree *bt, nodeptr n, int props) { int i, ns, count; /* * Update the count in the node. */ ns = bt_subtrees(bt, n); count = ns-1; /* count the elements */ for (i = 0; i < ns; i++) count += bt_child_count(bt, n, i); n[bt->maxdegree*2].i = count; testlock(TRUE, FALSE, n); testlock(TRUE, TRUE, n); /* * Update user read properties. */ if (props && bt->propsize) { void *prevprop, *eltprop, *thisprop, *childprop; prevprop = NULL; eltprop = ialloc(bt->propsize); thisprop = (void *)((char *)n + bt->propoffset); for (i = 0; i < ns; i++) { /* Merge a subtree's property into this one. * Initially prevprop==NULL, meaning to just copy. */ if ( (childprop = bt_child_prop(bt, n, i)) != NULL ) { bt->propmerge(bt->userstate, prevprop, childprop, thisprop); prevprop = thisprop; } if (i < ns-1) { /* Now merge in the separating element. 
*/ bt->propmake(bt->userstate, bt_element(bt, n, i), eltprop); bt->propmerge(bt->userstate, prevprop, eltprop, thisprop); prevprop = thisprop; } } ifree(eltprop); } } static INLINE node_addr bt_write_unlock_internal(btree *bt, nodeptr n, int props) { node_addr ret; bt_write_relock(bt, n, props); testlock(TRUE, FALSE, n); ret.p = n; return ret; } static INLINE node_addr bt_write_unlock(btree *bt, nodeptr n) { return bt_write_unlock_internal(bt, n, TRUE); } static INLINE void bt_read_unlock(btree *bt, nodeptr n) { /* * For trees in memory, we do nothing here, except run some * optional testing. */ testlock(FALSE, FALSE, n); } /* ---------------------------------------------------------------------- * Higher-level helper functions, which should be independent of * the knowledge of precise node structure in the above code. */ /* * Return the count of items below a node that appear before the * start of a given subtree. */ static int bt_child_startpos(btree *bt, nodeptr n, int index) { int pos = 0; while (index > 0) { index--; pos += bt_child_count(bt, n, index) + 1; /* 1 for separating elt */ } return pos; } /* * Create a new root node for a tree. */ static void bt_new_root(btree *bt, node_addr left, node_addr right, bt_element_t element) { nodeptr n; n = bt_new_node(bt, 2); bt_set_child(bt, n, 0, left); bt_set_child(bt, n, 1, right); bt_set_element(bt, n, 0, element); bt->root = bt_write_unlock(bt, n); bt->depth++; } /* * Discard the root node of a tree, and enshrine a new node as the * root. Expects to be passed a write-locked nodeptr to the old * root. */ static void bt_shift_root(btree *bt, nodeptr n, node_addr na) { bt_destroy_node(bt, n); bt->root = na; bt->depth--; } /* * Given a numeric index within a node, find which subtree we would * descend to in order to find that index. * * Updates `pos' to give the numeric index within the subtree * found. 
Also returns `ends' (if non-NULL), which has bit 0 set if * the index is at the very left edge of the subtree, and/or bit 1 * if it's at the very right edge. * * Return value is the number of the subtree (0 upwards). */ #define ENDS_NONE 0 #define ENDS_LEFT 1 #define ENDS_RIGHT 2 #define ENDS_BOTH 3 static int bt_lookup_pos(btree *bt, nodeptr n, int *pos, int *ends) { int child = 0; int nchildren = bt_subtrees(bt, n); while (child < nchildren) { int count = bt_child_count(bt, n, child); if (*pos <= count) { if (ends) { *ends = 0; if (*pos == count) *ends |= ENDS_RIGHT; if (*pos == 0) *ends |= ENDS_LEFT; } return child; } *pos -= count + 1; /* 1 for the separating element */ child++; } return -1; /* ran off the end; shouldn't happen */ } /* * Given an element to search for within a node, find either the * element, or which subtree we would descend to to continue * searching for that element. * * Return value is either the index of the element, or the index of * the subtree (both 0 upwards). `is_elt' returns FALSE or TRUE * respectively. * * Since this may be used by bt_find() with an alternative cmpfn_t, * we always pass the input element as the first argument to cmp. */ static int bt_lookup_cmp(btree *bt, nodeptr n, bt_element_t element, cmpfn_t cmp, int *is_elt) { int mintree = 0, maxtree = bt_subtrees(bt, n)-1; while (mintree < maxtree) { int elt = (maxtree + mintree) / 2; int c = cmp(bt->userstate, element, bt_element(bt, n, elt)); if (c == 0) { *is_elt = TRUE; return elt; } else if (c < 0) { /* * `element' is less than element `elt'. So it can be * in subtree number `elt' at the highest. */ maxtree = elt; } else { /* c > 0 */ /* * `element' is greater than element `elt'. So it can * be in subtree number (elt+1) at the lowest. */ mintree = elt+1; } } /* * If we reach here without returning, we must have narrowed * our search to the point where mintree = maxtree. So the * element is not in the node itself and we know which subtree * to search next. 
*/ assert(mintree == maxtree); *is_elt = FALSE; return mintree; } /* * Generic transformations on B-tree nodes. * * This function divides essentially into an input side and an * output side. The input side accumulates a list of items * node,element,node,element,...,element,node; the output side * writes those items into either one or two nodes. * * `intype' can be: * * - NODE_AS_IS. The input list is the contents of in1, followed * by inelt, followed by the contents of in2. The `extra' * parameters are unused, as is `inaux'. * * - NODE_ADD_ELT. `in2' is unused. The input list is the contents * of `in1', but with subtree pointer number `inaux' replaced by * extra1/inelt/extra2. * * - NODE_DEL_ELT. `in2' and `inelt' are unused, as is `extra2'. * The input list is the contents of `in1', but with element * pointer number `inaux' and its surrounding two subtrees * replaced by extra1. * * Having obtained the input list, it is then written to one or two * output nodes. If `splitpos' is NODE_JOIN, everything is written * into one output node `out1'. Otherwise, `splitpos' is treated as * an element index within the input list; that element is returned * in `outelt', and the contents of the list is divided there and * returned in nodes `out1' and `out2'. * * This function will re-use nodes in the `obvious' order. If two * nodes are passed in and two nodes are output, they'll be the * same nodes; if one node is passed in and one node output, it * will be the same node too. If two are passed in and only one * output, the first one will be used and the second destroyed; if * one node is passed in and two are output, the one passed in will * be the first of those returned, and the second will be new. 
*/ #define NODE_AS_IS 1 #define NODE_ADD_ELT 2 #define NODE_DEL_ELT 3 #define NODE_JOIN -1 static void bt_xform(btree *bt, int intype, int inaux, nodeptr in1, nodeptr in2, bt_element_t inelt, node_addr extra1, node_addr extra2, int splitpos, nodeptr *out1, nodeptr *out2, bt_element_t *outelt) { node_addr *nodes; bt_element_t *elements; nodeptr ret1, ret2; int n1, n2, off2, i, j; nodes = inewn(node_addr, 2 * bt_max_subtrees(bt)); elements = inewn(bt_element_t, 2 * bt_max_subtrees(bt)); /* * Accumulate the input list. */ switch(intype) { case NODE_AS_IS: n1 = bt_subtrees(bt, in1); n2 = bt_subtrees(bt, in2); off2 = 0; break; case NODE_ADD_ELT: in2 = in1; n1 = inaux+1; n2 = bt_subtrees(bt, in1) - inaux; off2 = inaux; break; case NODE_DEL_ELT: in2 = in1; n1 = inaux+1; n2 = bt_subtrees(bt, in1) - inaux - 1; off2 = inaux+1; break; } i = j = 0; while (j < n1) { nodes[i] = bt_child(bt, in1, j); if (j+1 < n1) elements[i] = bt_element(bt, in1, j); i++, j++; } if (intype == NODE_DEL_ELT) { i--; } j = 0; while (j < n2) { nodes[i] = bt_child(bt, in2, off2+j); if (j+1 < n2) elements[i] = bt_element(bt, in2, off2+j); i++, j++; } switch (intype) { case NODE_AS_IS: elements[n1-1] = inelt; break; case NODE_ADD_ELT: nodes[n1-1] = extra1; nodes[n1] = extra2; elements[n1-1] = inelt; break; case NODE_DEL_ELT: nodes[n1-1] = extra1; break; } /* * Now determine how many subtrees go in each output node, and * actually create the nodes to be returned. */ if (splitpos != NODE_JOIN) { n1 = splitpos+1, n2 = i - splitpos - 1; if (outelt) *outelt = elements[splitpos]; } else { n1 = i, n2 = 0; } ret1 = bt_reuse_node(bt, in1, n1); if (intype == NODE_AS_IS && in2) { /* We have a second input node. */ if (n2) ret2 = bt_reuse_node(bt, in2, n2); else bt_destroy_node(bt, in2); } else { /* We have no second input node. 
*/ if (n2) ret2 = bt_new_node(bt, n2); else ret2 = NULL; } if (out1) *out1 = ret1; if (out2) *out2 = ret2; for (i = 0; i < n1; i++) { bt_set_child(bt, ret1, i, nodes[i]); if (i+1 < n1) bt_set_element(bt, ret1, i, elements[i]); } if (n2) { if (outelt) *outelt = elements[n1-1]; for (i = 0; i < n2; i++) { bt_set_child(bt, ret2, i, nodes[n1+i]); if (i+1 < n2) bt_set_element(bt, ret2, i, elements[n1+i]); } } ifree(nodes); ifree(elements); } /* * Fiddly little compare functions for use in special cases of * findrelpos. One always returns +1 (a > b), the other always * returns -1 (a < b). */ static int bt_cmp_greater(void *state, const bt_element_t a, const bt_element_t b) { return +1; } static int bt_cmp_less(void *state, const bt_element_t a, const bt_element_t b) { return -1; } /* ---------------------------------------------------------------------- * User-visible administration routines. */ btree *bt_new(cmpfn_t cmp, copyfn_t copy, freefn_t freeelt, int propsize, int propalign, propmakefn_t propmake, propmergefn_t propmerge, void *state, int mindegree) { btree *ret; ret = new1(btree); ret->mindegree = mindegree; ret->maxdegree = 2*mindegree; ret->depth = 0; /* not even a root right now */ ret->root = NODE_ADDR_NULL; ret->cmp = cmp; ret->copy = copy; ret->freeelt = freeelt; ret->propsize = propsize; ret->propalign = propalign; ret->propoffset = sizeof(nodecomponent) * (ret->maxdegree*2 + 2); if (propalign > 0) { ret->propoffset += propalign - 1; ret->propoffset -= ret->propoffset % propalign; } ret->propmake = propmake; ret->propmerge = propmerge; ret->userstate = state; return ret; } static void bt_free_node(btree *bt, nodeptr n) { int i; for (i = 0; i < bt_subtrees(bt, n); i++) { node_addr na; nodeptr n2; na = bt_child(bt, n, i); if (!bt_unref_node(bt, na)) { n2 = bt_write_lock_child(bt, n, i); bt_free_node(bt, n2); } } if (bt->freeelt) { for (i = 0; i < bt_subtrees(bt, n)-1; i++) bt->freeelt(bt->userstate, bt_element(bt, n, i)); } bt_destroy_node(bt, n); } void 
bt_free(btree *bt) { nodeptr n; if (!bt_unref_node(bt, bt->root)) { n = bt_write_lock_root(bt); bt_free_node(bt, n); } sfree(bt); } btree *bt_clone(btree *bt) { btree *bt2; bt2 = bt_new(bt->cmp, bt->copy, bt->freeelt, bt->propsize, bt->propalign, bt->propmake, bt->propmerge, bt->userstate, bt->mindegree); bt2->depth = bt->depth; bt2->root = bt_ref_node(bt, bt->root); return bt2; } /* * Nice simple function to count the size of a tree. */ int bt_count(btree *bt) { int count; nodeptr n; n = bt_read_lock_root(bt); if (n) { count = bt_node_count(bt, n); bt_read_unlock(bt, n); return count; } else { return 0; } } /* ---------------------------------------------------------------------- * Actual B-tree algorithms. */ /* * Find an element by numeric index. bt_index_w is the same, but * works with write locks instead of read locks, so it guarantees * to return an element with only one reference to it. (You'd use * this if you were using tree cloning, and wanted to modify the * element once you'd found it.) 
*/ bt_element_t bt_index(btree *bt, int index) { nodeptr n, n2; int child, ends; n = bt_read_lock_root(bt); if (index < 0 || index >= bt_node_count(bt, n)) { bt_read_unlock(bt, n); return NULL; } while (1) { child = bt_lookup_pos(bt, n, &index, &ends); if (ends & ENDS_RIGHT) { bt_element_t ret = bt_element(bt, n, child); bt_read_unlock(bt, n); return ret; } n2 = bt_read_lock_child(bt, n, child); bt_read_unlock(bt, n); n = n2; assert(n != NULL); } } bt_element_t bt_index_w(btree *bt, int index) { nodeptr n, n2; int nnodes, child, ends; nodeptr *nodes; bt_element_t ret; nodes = inewn(nodeptr, bt->depth+1); nnodes = 0; n = bt_write_lock_root(bt); if (index < 0 || index >= bt_node_count(bt, n)) { bt_write_unlock(bt, n); return NULL; } while (1) { nodes[nnodes++] = n; child = bt_lookup_pos(bt, n, &index, &ends); if (ends & ENDS_RIGHT) { ret = bt_element(bt, n, child); break; } n2 = bt_write_lock_child(bt, n, child); n = n2; assert(n != NULL); } while (nnodes-- > 0) bt_write_unlock(bt, nodes[nnodes]); return ret; } /* * Search for an element by sorted order. */ bt_element_t bt_findrelpos(btree *bt, bt_element_t element, cmpfn_t cmp, int relation, int *index) { nodeptr n, n2; int child, is_elt; bt_element_t gotit; int pos = 0; int count; if (!cmp) cmp = bt->cmp; /* * Special case: relation LT/GT and element NULL means get an * extreme element of the tree. We do this by fudging the * compare function so that our NULL element will be considered * infinitely large or infinitely small. 
*/ if (element == NULL) { assert(relation == BT_REL_LT || relation == BT_REL_GT); if (relation == BT_REL_LT) cmp = bt_cmp_greater; /* always returns a > b */ else cmp = bt_cmp_less; /* always returns a < b */ } gotit = NULL; n = bt_read_lock_root(bt); if (!n) return NULL; count = bt_node_count(bt, n); while (n) { child = bt_lookup_cmp(bt, n, element, cmp, &is_elt); if (is_elt) { pos += bt_child_startpos(bt, n, child+1) - 1; gotit = bt_element(bt, n, child); bt_read_unlock(bt, n); break; } else { pos += bt_child_startpos(bt, n, child); n2 = bt_read_lock_child(bt, n, child); bt_read_unlock(bt, n); n = n2; } } /* * Now all nodes are unlocked, and we are _either_ (a) holding * an element in `gotit' whose index we have in `pos', _or_ (b) * holding nothing in `gotit' but we know the index of the * next-higher element. */ if (gotit) { /* * We have the real element. For EQ, LE and GE relations we * can now just return it; otherwise we must return the * next element down or up. */ if (relation == BT_REL_LT) gotit = bt_index(bt, --pos); else if (relation == BT_REL_GT) gotit = bt_index(bt, ++pos); } else { /* * We don't have the real element. For EQ relation we now * just give up; for everything else we return the next * element down or up. */ if (relation == BT_REL_LT || relation == BT_REL_LE) gotit = bt_index(bt, --pos); else if (relation == BT_REL_GT || relation == BT_REL_GE) gotit = bt_index(bt, pos); } if (gotit && index) *index = pos; return gotit; } bt_element_t bt_findrel(btree *bt, bt_element_t element, cmpfn_t cmp, int relation) { return bt_findrelpos(bt, element, cmp, relation, NULL); } bt_element_t bt_findpos(btree *bt, bt_element_t element, cmpfn_t cmp, int *index) { return bt_findrelpos(bt, element, cmp, BT_REL_EQ, index); } bt_element_t bt_find(btree *bt, bt_element_t element, cmpfn_t cmp) { return bt_findrelpos(bt, element, cmp, BT_REL_EQ, NULL); } /* * Find an element by property-based search. 
Returns the element * (if one is selected - the search can also terminate by * descending to a nonexistent subtree of a leaf node, equivalent * to selecting the _gap_ between two elements); also returns the * index of either the element or the gap in `*index' if `index' is * non-NULL. */ bt_element_t bt_propfind(btree *bt, searchfn_t search, void *sstate, int *index) { nodeptr n, n2; int i, j, count, is_elt; void **props; int *counts; bt_element_t *elts; bt_element_t *e = NULL; props = inewn(void *, bt->maxdegree); counts = inewn(int, bt->maxdegree); elts = inewn(bt_element_t, bt->maxdegree); n = bt_read_lock_root(bt); count = 0; while (n) { int ntrees = bt_subtrees(bt, n); /* * Prepare the arguments to the search function. */ for (i = 0; i < ntrees; i++) { props[i] = bt_child_prop(bt, n, i); counts[i] = bt_child_count(bt, n, i); if (i < ntrees-1) elts[i] = bt_element(bt, n, i); } /* * Call the search function. */ i = search(bt->userstate, sstate, ntrees, props, counts, elts, &is_elt); if (!is_elt) { /* * Descend to subtree i. Update `count' to consider * everything (both subtrees and elements) before that * subtree. */ for (j = 0; j < i; j++) count += 1 + bt_child_count(bt, n, j); n2 = bt_read_lock_child(bt, n, i); bt_read_unlock(bt, n); n = n2; } else { /* * Return element i. Update `count' to consider * everything (both subtrees and elements) before that * element. */ for (j = 0; j <= i; j++) count += 1 + bt_child_count(bt, n, j); count--; /* don't count element i itself */ e = bt_element(bt, n, i); bt_read_unlock(bt, n); break; } } ifree(props); ifree(counts); ifree(elts); if (index) *index = count; return e; } /* * Replace the element at a numeric index by a new element. Returns * the old element. * * Can also be used when the new element is the _same_ as the old * element, but has changed in some way that will affect user * properties. 
*/ bt_element_t bt_replace(btree *bt, bt_element_t element, int index) { nodeptr n; nodeptr *nodes; bt_element_t ret; int nnodes, child, ends; nodes = inewn(nodeptr, bt->depth+1); nnodes = 0; n = bt_write_lock_root(bt); if (index < 0 || index >= bt_node_count(bt, n)) { bt_write_unlock(bt, n); return NULL; } while (1) { nodes[nnodes++] = n; child = bt_lookup_pos(bt, n, &index, &ends); if (ends & ENDS_RIGHT) { ret = bt_element(bt, n, child); bt_set_element(bt, n, child, element); break; } n = bt_write_lock_child(bt, n, child); assert(n != NULL); } while (nnodes-- > 0) bt_write_unlock(bt, nodes[nnodes]); return ret; } /* * Add at a specific position. As we search down the tree we must * write-lock every node we meet, since otherwise we might fail to * clone nodes that will end up pointing to different things. */ void bt_addpos(btree *bt, bt_element_t element, int pos) { nodeptr n; node_addr left, right, single; nodeptr *nodes; int *childposns; int nnodes, child; /* * Since in a reference-counted tree we can't have parent * links, we will have to use O(depth) space to store the list * of nodeptrs we have gone through, so we can un-write-lock * them when we've finished. We also store the subtree index we * descended to at each stage. */ nodes = inewn(nodeptr, bt->depth+1); childposns = inewn(int, bt->depth+1); nnodes = 0; n = bt_write_lock_root(bt); assert(pos >= 0 && pos <= (n ? bt_node_count(bt, n) : 0)); /* * Scan down the tree, write-locking nodes, until we find the * empty subtree where we want to insert the item. */ while (n) { nodes[nnodes] = n; child = bt_lookup_pos(bt, n, &pos, NULL); childposns[nnodes] = child; nnodes++; n = bt_write_lock_child(bt, n, child); } left = right = NODE_ADDR_NULL; /* * Now nodes[nnodes-1] wants to have subtree index * childposns[nnodes-1] replaced by the node/element/node triple * (left,element,right). Propagate this up the tree until we * can stop. 
*/ while (nnodes-- > 0) { n = nodes[nnodes]; if (bt_subtrees(bt, n) == bt_max_subtrees(bt)) { nodeptr lptr, rptr; /* Split the node and carry on up. */ bt_xform(bt, NODE_ADD_ELT, childposns[nnodes], n, NULL, element, left, right, bt_min_subtrees(bt), &lptr, &rptr, &element); left = bt_write_unlock(bt, lptr); right = bt_write_unlock(bt, rptr); } else { bt_xform(bt, NODE_ADD_ELT, childposns[nnodes], n, NULL, element, left, right, NODE_JOIN, &n, NULL, NULL); single = bt_write_unlock(bt, n); break; } } /* * If nnodes < 0, we have just split the root and we need to * build a new root node. */ if (nnodes < 0) { bt_new_root(bt, left, right, element); } else { /* * Now nodes[nnodes-1] just wants to have child pointer * child[nnodes-1] replaced by `single', in case the * subtree was moved. Propagate this back up to the root, * unlocking all nodes. */ while (nnodes-- > 0) { bt_set_child(bt, nodes[nnodes], childposns[nnodes], single); single = bt_write_unlock(bt, nodes[nnodes]); } } ifree(nodes); ifree(childposns); } /* * Add an element in sorted order. This is a wrapper on bt_addpos() * which finds the numeric index to add the item at and then calls * addpos. This isn't an optimal use of time, but it saves space by * avoiding starting to clone multiply-linked nodes until it's * known that the item _can_ be added to the tree (and isn't * duplicated in it already). */ bt_element_t bt_add(btree *bt, bt_element_t element) { nodeptr n, n2; int child, is_elt; int pos = 0; n = bt_read_lock_root(bt); while (n) { child = bt_lookup_cmp(bt, n, element, bt->cmp, &is_elt); if (is_elt) { bt_read_unlock(bt, n); return bt_element(bt, n, child); /* element exists already */ } else { pos += bt_child_startpos(bt, n, child); n2 = bt_read_lock_child(bt, n, child); bt_read_unlock(bt, n); n = n2; } } bt_addpos(bt, element, pos); return element; } /* * Delete an element given its numeric position. Returns the * element deleted. 
*/ bt_element_t bt_delpos(btree *bt, int pos) { nodeptr n, c, c2, saved_n; nodeptr *nodes; int nnodes, child, nroot, pos2, ends, st, splitpoint, saved_pos; bt_element_t e, ret; /* * Just like in bt_add, we store the set of nodeptrs we * write-locked on the way down, so we can unlock them on the * way back up. */ nodes = inewn(nodeptr, bt->depth+1); nnodes = 0; n = bt_write_lock_root(bt); nroot = TRUE; saved_n = NULL; if (!n || pos < 0 || pos >= bt_node_count(bt, n)) { if (n) bt_write_unlock(bt, n); return NULL; } while (1) { nodes[nnodes++] = n; /* * Find out which subtree to descend to. */ pos2 = pos; child = bt_lookup_pos(bt, n, &pos, &ends); c = bt_write_lock_child(bt, n, child); if (c && bt_subtrees(bt, c) == bt_min_subtrees(bt)) { /* * We're trying to descend to a subtree that's of * minimum size. Do something! */ if (child > 0) { /* * Either move a subtree from the left sibling, or * merge with it. (Traditionally we would only * merge if we can't move a subtree from _either_ * sibling, but this way avoids too many extra * write locks.) */ c2 = c; c = bt_write_lock_child(bt, n, child-1); e = bt_element(bt, n, child-1); st = bt_subtrees(bt, c); if (st > bt_min_subtrees(bt)) splitpoint = st - 2; else splitpoint = NODE_JOIN; child--; } else { /* * Likewise on the right-hand side. */ c2 = bt_write_lock_child(bt, n, child+1); e = bt_element(bt, n, child); st = bt_subtrees(bt, c2); if (st > bt_min_subtrees(bt)) splitpoint = bt_min_subtrees(bt); else splitpoint = NODE_JOIN; } if (splitpoint == NODE_JOIN) { /* * So if we're merging nodes, go to it... */ bt_xform(bt, NODE_AS_IS, 0, c, c2, e, NODE_ADDR_NULL, NODE_ADDR_NULL, NODE_JOIN, &c, NULL, NULL); bt_xform(bt, NODE_DEL_ELT, child, n, NULL, NULL, bt_node_addr(bt, c), NODE_ADDR_NULL, NODE_JOIN, &n, NULL, NULL); if (nroot && bt_subtrees(bt, n) == 1) { /* * Whoops, we just merged the last two children * of the root. Better relocate the root. 
*/ bt_shift_root(bt, n, bt_node_addr(bt, c)); nnodes--; /* don't leave it in nodes[]! */ n = NULL; bt_write_relock(bt, c, TRUE); } else bt_write_unlock(bt, c); } else { /* * Or if we're redistributing subtrees, go to that. */ bt_xform(bt, NODE_AS_IS, 0, c, c2, e, NODE_ADDR_NULL, NODE_ADDR_NULL, splitpoint, &c, &c2, &e); bt_set_element(bt, n, child, e); bt_write_unlock(bt, c); bt_write_unlock(bt, c2); } if (n) { /* Recompute the counts in n so we can do lookups again. */ bt_write_relock(bt, n, TRUE); /* Having done the transform, redo the position lookup. */ pos = pos2; child = bt_lookup_pos(bt, n, &pos, &ends); c = bt_write_lock_child(bt, n, child); } else { pos = pos2; } } /* * Now see if this node contains the element we're * looking for. */ if (n && (ends & ENDS_RIGHT)) { /* * It does. Element number `child' is the element we * want to delete. See if this is a leaf node... */ if (!bt_is_leaf(bt, n)) { /* * It's not a leaf node. So we save the nodeptr and * element index for later reference, and decrement * `pos' so that we're searching for the element to its * left, which _will_ be in a leaf node. */ saved_n = n; saved_pos = child; pos--; } else { /* * We've reached a leaf node. Check to see if an * internal-node position was stored in saved_n and * saved_pos, and move this element there if so. */ if (saved_n) { ret = bt_element(bt, saved_n, saved_pos); bt_set_element(bt, saved_n, saved_pos, bt_element(bt, n, child)); } else { ret = bt_element(bt, n, child); } /* Then delete it from the leaf node. */ bt_xform(bt, NODE_DEL_ELT, child, n, NULL, NULL, NODE_ADDR_NULL, NODE_ADDR_NULL, NODE_JOIN, &n, NULL, NULL); /* * Final special case: if this is the root node and * we've just deleted its last element, we should * destroy it and leave a completely empty tree. */ if (nroot && bt_subtrees(bt, n) == 1) { bt_shift_root(bt, n, NODE_ADDR_NULL); nnodes--; /* and take it out of nodes[] */ } /* Now we're done */ break; } } /* Descend to the child and go round again. 
*/ n = c; nroot = FALSE; } /* * All done. Zip back up the tree un-write-locking nodes. */ while (nnodes-- > 0) bt_write_unlock(bt, nodes[nnodes]); ifree(nodes); return ret; } /* * Delete an element in sorted order. */ bt_element_t bt_del(btree *bt, bt_element_t element) { int index; if (!bt_findrelpos(bt, element, NULL, BT_REL_EQ, &index)) return NULL; /* wasn't there */ return bt_delpos(bt, index); } /* * Join two trees together, given their respective depths and a * middle element. Puts the resulting tree in the root of `bt'. * * This internal routine assumes that the trees have the same * degree. * * The input nodeptrs are assumed to be write-locked, but none of * their children are yet write-locked. */ static void bt_join_internal(btree *bt, nodeptr lp, nodeptr rp, bt_element_t sep, int ld, int rd) { nodeptr *nodes; int *childposns; int nnodes, nodessize; int lsub, rsub; /* * We will need to store parent nodes up to the difference * between ld and rd. */ nodessize = (ld < rd ? rd-ld : ld-rd); if (nodessize) { /* we may not need _any_! */ nodes = inewn(nodeptr, nodessize); childposns = inewn(int, nodessize); } nnodes = 0; if (ld > rd) { bt->root = bt_node_addr(bt, lp); bt->depth = ld; /* If the left tree is taller, search down its right-hand edge. */ while (ld > rd) { int child = bt_subtrees(bt, lp) - 1; nodeptr n = bt_write_lock_child(bt, lp, child); nodes[nnodes] = lp; childposns[nnodes] = child; nnodes++; lp = n; ld--; } } else { bt->root = bt_node_addr(bt, rp); bt->depth = rd; /* If the right tree is taller, search down its left-hand edge. */ while (rd > ld) { nodeptr n = bt_write_lock_child(bt, rp, 0); nodes[nnodes] = rp; childposns[nnodes] = 0; nnodes++; rp = n; rd--; } } /* * So we now want to combine nodes lp and rp into either one or * two plausibly-sized nodes, whichever is feasible. We have a * joining element `sep'. */ lsub = (lp ? bt_subtrees(bt, lp) : 0); rsub = (rp ? 
bt_subtrees(bt, rp) : 0); if (lp && rp && lsub + rsub <= bt_max_subtrees(bt)) { node_addr la; /* Join the nodes into one. */ bt_xform(bt, NODE_AS_IS, 0, lp, rp, sep, NODE_ADDR_NULL, NODE_ADDR_NULL, NODE_JOIN, &lp, NULL, NULL); /* Unlock the node. */ la = bt_write_unlock(bt, lp); /* Update the child pointer in the next node up. */ if (nnodes > 0) bt_set_child(bt, nodes[nnodes-1], childposns[nnodes-1], la); else bt->root = la; } else { node_addr la, ra; if (!lp || !rp) { la = NODE_ADDR_NULL; ra = NODE_ADDR_NULL; } else { int lsize, rsize; /* Re-split the nodes into two plausibly sized ones. */ lsize = lsub + rsub; rsize = lsize / 2; lsize -= rsize; bt_xform(bt, NODE_AS_IS, 0, lp, rp, sep, NODE_ADDR_NULL, NODE_ADDR_NULL, lsize-1, &lp, &rp, &sep); /* Unlock the nodes. */ la = bt_write_unlock(bt, lp); ra = bt_write_unlock(bt, rp); } /* * Now we have to do the addition thing: progress up the * tree replacing a single subtree pointer with the * la/sep/ra assembly, until no more nodes have to split as * a result. */ while (nnodes-- > 0) { nodeptr n = nodes[nnodes]; if (bt_subtrees(bt, n) == bt_max_subtrees(bt)) { /* Split the node and carry on up. */ bt_xform(bt, NODE_ADD_ELT, childposns[nnodes], n, NULL, sep, la, ra, bt_min_subtrees(bt), &lp, &rp, &sep); la = bt_write_unlock(bt, lp); ra = bt_write_unlock(bt, rp); } else { bt_xform(bt, NODE_ADD_ELT, childposns[nnodes], n, NULL, sep, la, ra, NODE_JOIN, &n, NULL, NULL); bt_write_unlock(bt, n); break; } } /* * If nnodes < 0, we have just split the root and we need * to build a new root node. */ if (nnodes < 0) bt_new_root(bt, la, ra, sep); } /* * Now we just need to go back up and unlock any remaining * nodes. Also here we ensure the root points where it should. 
*/ while (nnodes-- > 0) { node_addr na; na = bt_write_unlock(bt, nodes[nnodes]); if (nnodes == 0) bt->root = na; } if (nodessize) { ifree(nodes); ifree(childposns); } } /* * External interfaces to the join functionality: join and joinr * (differing only in which B-tree structure they leave without any * elements, and which they return the combined tree in). */ btree *bt_join(btree *bt1, btree *bt2) { nodeptr root1, root2; int size2; size2 = bt_count(bt2); if (size2 > 0) { bt_element_t sep; if (bt1->cmp) { /* * The trees are ordered, so verify the ordering * condition: ensure nothing in bt1 is greater than or * equal to the minimum element in bt2. */ sep = bt_index(bt2, 0); sep = bt_findrelpos(bt1, sep, NULL, BT_REL_GE, NULL); if (sep) return NULL; } sep = bt_delpos(bt2, 0); root1 = bt_write_lock_root(bt1); root2 = bt_write_lock_root(bt2); bt_join_internal(bt1, root1, root2, sep, bt1->depth, bt2->depth); bt2->root = NODE_ADDR_NULL; bt2->depth = 0; } return bt1; } btree *bt_joinr(btree *bt1, btree *bt2) { nodeptr root1, root2; int size1; size1 = bt_count(bt1); if (size1 > 0) { bt_element_t sep; if (bt2->cmp) { /* * The trees are ordered, so verify the ordering * condition: ensure nothing in bt2 is less than or * equal to the maximum element in bt1. */ sep = bt_index(bt1, size1-1); sep = bt_findrelpos(bt2, sep, NULL, BT_REL_LE, NULL); if (sep) return NULL; } sep = bt_delpos(bt1, size1-1); root1 = bt_write_lock_root(bt1); root2 = bt_write_lock_root(bt2); bt_join_internal(bt2, root1, root2, sep, bt1->depth, bt2->depth); bt1->root = NODE_ADDR_NULL; bt1->depth = 0; } return bt2; } /* * Perform the healing process after a tree has been split. `rhs' * is set if the cut edge is the one on the right. 
*/
static void bt_split_heal(btree *bt, int rhs)
{
    nodeptr n;
    nodeptr *nodes;
    int nnodes;

    /* Path of write-locked nodes, unlocked in reverse at the end. */
    nodes = inewn(nodeptr, bt->depth);
    nnodes = 0;

    n = bt_write_lock_root(bt);

    /*
     * First dispense with completely trivial cases: a root node
     * containing only one subtree can be thrown away instantly.
     */
    while (n && bt_subtrees(bt, n) == 1) {
        nodeptr n2 = bt_write_lock_child(bt, n, 0);
        bt_shift_root(bt, n, bt_node_addr(bt, n2));
        n = n2;
    }

    /*
     * Now we have a plausible root node. Start going down the cut
     * edge looking for undersized or minimum nodes, and arranging
     * for them to be above minimum size.
     */
    while (n) {
        int edge, next, elt, size_e, size_n, size_total;
        nodeptr ne, nn, nl, nr;
        bt_element_t el;

        nodes[nnodes++] = n;

        /*
         * `edge' is the subtree on the cut edge, `next' its only
         * sibling on the inward side, and `elt' the index of the
         * element separating the two.
         */
        if (rhs) {
            edge = bt_subtrees(bt, n) - 1;
            next = edge - 1;
            elt = next;
        } else {
            edge = 0;
            next = 1;
            elt = edge;
        }

        ne = bt_write_lock_child(bt, n, edge);
        if (!ne)
            break;                     /* no child on the edge: stop here */

        size_e = bt_subtrees(bt, ne);

        if (size_e <= bt_min_subtrees(bt)) {
            nn = bt_write_lock_child(bt, n, next);
            el = bt_element(bt, n, elt);
            size_n = bt_subtrees(bt, nn);
            /* Put the pair into left-to-right order for bt_xform. */
            if (edge < next)
                nl = ne, nr = nn;
            else
                nl = nn, nr = ne;
            size_total = size_e + size_n;
            if (size_e + size_n <= bt_max_subtrees(bt)) {
                /*
                 * Merge the edge node and its sibling together.
                 */
                bt_xform(bt, NODE_AS_IS, 0, nl, nr, el,
                         NODE_ADDR_NULL, NODE_ADDR_NULL,
                         NODE_JOIN, &ne, NULL, NULL);
                bt_xform(bt, NODE_DEL_ELT, elt, n, NULL, NULL,
                         bt_node_addr(bt, ne), NODE_ADDR_NULL,
                         NODE_JOIN, &n, NULL, NULL);
                /*
                 * It's possible we've just trashed the root of the
                 * tree, again.
                 */
                if (bt_subtrees(bt, n) == 1) {
                    bt_shift_root(bt, n, bt_node_addr(bt, ne));
                    nnodes--;          /* and take it out of nodes[] */
                }
            } else {
                /*
                 * Redistribute subtrees between the edge node and
                 * its sibling.
                 */
                int split;
                /* The edge node gets the larger half of the total. */
                size_e = (size_total + 1) / 2;
                assert(size_e > bt_min_subtrees(bt));
                if (next < edge)
                    split = size_total - size_e - 1;
                else
                    split = size_e - 1;
                bt_xform(bt, NODE_AS_IS, 0, nl, nr, el,
                         NODE_ADDR_NULL, NODE_ADDR_NULL,
                         split, &nl, &nr, &el);
                bt_write_unlock(bt, nn);
                bt_set_element(bt, n, elt, el);
            }
        }

        /* Carry on down the cut edge. */
        n = ne;
    }

    /*
     * Now we just need to go back up and unlock any remaining
     * nodes.
     */
    while (nnodes-- > 0)
        bt_write_unlock(bt, nodes[nnodes]);

    ifree(nodes);
}

/*
 * Split a tree by numeric position. The new tree returned is the
 * one on the right; the original tree contains the stuff on the
 * left.
 */
static btree *bt_split_internal(btree *bt1, int index)
{
    btree *bt2;
    nodeptr *lnodes, *rnodes;
    nodeptr n1, n2, n;
    int nnodes, child;

    bt2 = bt_new(bt1->cmp, bt1->copy, bt1->freeelt, bt1->propsize,
                 bt1->propalign, bt1->propmake, bt1->propmerge,
                 bt1->userstate, bt1->mindegree);
    bt2->depth = bt1->depth;

    /* Left-hand and right-hand halves of each node on the cut path. */
    lnodes = inewn(nodeptr, bt1->depth);
    rnodes = inewn(nodeptr, bt2->depth);
    nnodes = 0;

    n1 = bt_write_lock_root(bt1);
    while (n1) {
        child = bt_lookup_pos(bt1, n1, &index, NULL);
        n = bt_write_lock_child(bt1, n1, child);
        /*
         * Split this node at subtree `child': n1 keeps the left
         * part and n2 receives the right part. No separating
         * element is requested back (outelt is NULL).
         */
        bt_xform(bt1, NODE_ADD_ELT, child, n1, NULL, NULL,
                 bt_node_addr(bt1, n), NODE_ADDR_NULL,
                 child, &n1, &n2, NULL);
        lnodes[nnodes] = n1;
        rnodes[nnodes] = n2;
        /* Link the new right-hand node under the one above it. */
        if (nnodes > 0)
            bt_set_child(bt2, rnodes[nnodes-1], 0, bt_node_addr(bt2, n2));
        else
            bt2->root = bt_node_addr(bt2, n2);
        nnodes++;
        n1 = n;
    }

    /*
     * Now we go back up and unlock all the nodes. At this point we
     * don't mess with user properties, because there's the danger
     * of a node containing no subtrees _or_ elements and hence us
     * having to invent a notation for an empty property. We're
     * going to make a second healing pass in a moment anyway,
     * which will sort all that out for us.
     */
    while (nnodes-- > 0) {
        bt_write_unlock_internal(bt1, lnodes[nnodes], FALSE);
        bt_write_unlock_internal(bt2, rnodes[nnodes], FALSE);
    }

    /*
     * Then we make a healing pass down each side of the tree.
     */
    bt_split_heal(bt1, TRUE);
    bt_split_heal(bt2, FALSE);

    ifree(lnodes);
    ifree(rnodes);

    return bt2;
}

/*
 * Split a tree at a numeric index.
 *
 * Returns NULL if `index' is outside the range [0, element count].
 * Otherwise the tree is split by bt_split_internal(); if `before'
 * is nonzero the roots and depths of the two trees are then
 * exchanged, so that the returned tree is the left-hand part and
 * `bt' keeps the right-hand part.
 */
btree *bt_splitpos(btree *bt, int index, int before)
{
    btree *ret;
    node_addr na;
    int count, nd;
    nodeptr n;

    n = bt_read_lock_root(bt);
    count = (n ? bt_node_count(bt, n) : 0);
    bt_read_unlock(bt, n);

    if (index < 0 || index > count)
        return NULL;

    ret = bt_split_internal(bt, index);
    if (before) {
        na = bt->root;
        bt->root = ret->root;
        ret->root = na;

        nd = bt->depth;
        bt->depth = ret->depth;
        ret->depth = nd;
    }
    return ret;
}

/*
 * Split a tree at a position dictated by the sorting order.
 *
 * `rel' must be an inequality. GT and GE are converted to the
 * complementary LE and LT searches with `before' set, so in every
 * case the split point is just after the last element found by
 * bt_findrelpos() (or at position 0 if it finds none).
 */
btree *bt_split(btree *bt, bt_element_t element, cmpfn_t cmp, int rel)
{
    int before, index;

    assert(rel != BT_REL_EQ);          /* has to be an inequality */

    if (rel == BT_REL_GT || rel == BT_REL_GE) {
        before = TRUE;
        rel = (rel == BT_REL_GT ? BT_REL_LE : BT_REL_LT);
    } else {
        before = FALSE;
    }
    if (!bt_findrelpos(bt, element, cmp, rel, &index))
        index = -1;
    return bt_splitpos(bt, index+1, before);
}

#ifdef TEST

#define TEST_DEGREE 4
#define BT_COPY bt_clone
#define MAXTREESIZE 10000
#define MAXLOCKS 100

/* Running total of failures reported through error(). */
int errors;

/*
 * Error reporting function. Prints "ERROR: ", the printf-style
 * formatted message and a newline to stderr, and increments
 * `errors'.
 */
void error(char *fmt, ...) {
    va_list ap;
    fprintf(stderr, "ERROR: ");
    va_start(ap, fmt);
    vfprintf(stderr, fmt, ap);
    va_end(ap);
    fprintf(stderr, "\n");
    errors++;
}

/*
 * See if a tree has a 2-element root node.
 */
static int bt_tworoot(btree *bt)
{
    nodeptr n;
    int i;
    n = bt_read_lock_root(bt);
    i = bt_subtrees(bt, n);
    bt_read_unlock(bt, n);
    return (i == 2 ? TRUE : FALSE);
}

/*
 * Physically copy an entire B-tree. (NB this appears as a test
 * routine rather than a production one, since reference counting
 * and bt_clone() provide a better way to do this for real code. If
 * anyone really needs a genuine physical copy for anything other
 * than testing reasons, I suppose they could always lift this into
 * the admin section above.)
*/ static nodeptr bt_copy_node(btree *bt, nodeptr n) { int i, children; nodeptr ret; children = bt_subtrees(bt, n); ret = bt_new_node(bt, children); for (i = 0; i < children; i++) { nodeptr n2 = bt_read_lock_child(bt, n, i); nodeptr n3; if (n2) { n3 = bt_copy_node(bt, n2); bt_set_child(bt, ret, i, bt_write_unlock(bt, n3)); } else { bt_set_child(bt, ret, i, NODE_ADDR_NULL); } bt_read_unlock(bt, n2); if (i < children-1) { bt_element_t e = bt_element(bt, n, i); if (bt->copy) e = bt->copy(bt->userstate, e); bt_set_element(bt, ret, i, e); } } return ret; } btree *bt_copy(btree *bt) { nodeptr n; btree *bt2; bt2 = bt_new(bt->cmp, bt->copy, bt->freeelt, bt->propsize, bt->propalign, bt->propmake, bt->propmerge, bt->userstate, bt->mindegree); bt2->depth = bt->depth; n = bt_read_lock_root(bt); if (n) bt2->root = bt_write_unlock(bt2, bt_copy_node(bt, n)); bt_read_unlock(bt, n); return bt2; } /* * This function is intended to be called from gdb when debugging * things. */ void bt_dump_nodes(btree *bt, ...) { int i, children; va_list ap; nodeptr n; va_start(ap, bt); while (1) { n = va_arg(ap, nodeptr); if (!n) break; printf("%p [%d]:", n, n[bt->maxdegree*2+1].i); children = bt_subtrees(bt, n); for (i = 0; i < children; i++) { printf(" %p", bt_child(bt, n, i).p); if (i < children-1) printf(" %s", (char *)bt_element(bt, n, i)); } printf("\n"); } va_end(ap); } /* * Verify a tree against an array. Checks that: * * - every node has a valid number of subtrees * - subtrees are either all present (internal node) or all absent * (leaf) * - elements are all present * - every leaf is at exactly the depth claimed by the tree * - the tree represents the correct list of elements in the * correct order. (This also tests the ordering constraint, * assuming the array is correctly constructed.) */ void verifynode(btree *bt, nodeptr n, bt_element_t *array, int *arraypos, int depth) { int subtrees, min, max, i, before, after, count; /* Check the subtree count. 
The root can have as few as 2 subtrees. */ subtrees = bt_subtrees(bt, n); max = bt_max_subtrees(bt); min = (depth == 1) ? 2 : bt_min_subtrees(bt); if (subtrees > max) error("node %p has too many subtrees (%d > %d)", n, subtrees, max); if (subtrees < min) error("node %p has too few subtrees (%d < %d)", n, subtrees, min); /* Check that subtrees are present or absent as required. */ for (i = 0; i < subtrees; i++) { node_addr child = bt_child(bt, n, i); if (depth == bt->depth && child.p != NULL) error("leaf node %p child %d is %p not NULL\n", n, i, child); if (depth != bt->depth && child.p == NULL) error("non-leaf node %p child %d is NULL\n", n, i); } /* Check that elements are all present. */ for (i = 0; i < subtrees-1; i++) { bt_element_t elt = bt_element(bt, n, i); if (elt == NULL) error("node %p element %d is NULL\n", n, i); } before = *arraypos; /* Now verify the subtrees, and simultaneously check the ordering. */ for (i = 0; i < subtrees; i++) { if (depth < bt->depth) { nodeptr child = bt_read_lock_child(bt, n, i); verifynode(bt, child, array, arraypos, depth+1); bt_read_unlock(bt, child); } if (i < subtrees-1) { bt_element_t elt = bt_element(bt, n, i); if (array[*arraypos] != elt) { error("node %p element %d is \"%s\", but array[%d]=\"%s\"", n, i, elt, *arraypos, array[*arraypos]); } (*arraypos)++; } } after = *arraypos; /* Check the node count. */ count = bt_node_count(bt, n); if (count != after - before) error("node %p count is %d, should be %d", n, count, after - before); /* * Check the user properties. 
*/ { nodecomponent *prop; int i; int max = 0, total = 0; prop = n + bt->maxdegree * 2 + 2; for (i = before; i < after; i++) { int c = (unsigned char)*(char *)array[i]; if (max < c) max = c; total += c; } if (prop[0].i != total) error("node %p total prop is %d, should be %d", n, prop[0].i, total); if (prop[1].i != max) error("node %p max prop is %d, should be %d", n, prop[1].i, max); } } void verifytree(btree *bt, bt_element_t *array, int arraylen) { nodeptr n; int i = 0; n = bt_read_lock_root(bt); if (n) { verifynode(bt, n, array, &i, 1); bt_read_unlock(bt, n); } else { if (bt->depth != 0) { error("tree has null root but depth is %d not zero", bt->depth); } } if (i != arraylen) error("tree contains %d elements, array contains %d", i, arraylen); testlock(-1, 0, NULL); } int mycmp(void *state, void *av, void *bv) { char const *a = (char const *)av; char const *b = (char const *)bv; return strcmp(a, b); } static void set_invalid_property(void *propv) { int *prop = (int *)propv; prop[0] = prop[1] = -1; } void mypropmake(void *state, void *av, void *destv) { char const *a = (char const *)av; int *dest = (int *)destv; dest[0] = dest[1] = (unsigned char)*a; } void mypropmerge(void *state, void *s1v, void *s2v, void *destv) { int *s1 = (int *)s1v; int *s2 = (int *)s2v; int *dest = (int *)destv; if (!s1v && !s2v) { /* Special `destroy' case. */ set_invalid_property(destv); return; } assert(s2[0] >= 0 && s2[1] >= 0); assert(s1 == NULL || (s1[0] >= 0 && s1[1] >= 0)); dest[0] = s2[0] + (s1 ? s1[0] : 0); dest[1] = (s1 && s1[1] > s2[1] ? 
s1[1] : s2[1]); } void array_addpos(bt_element_t *array, int *arraylen, bt_element_t e, int i) { bt_element_t e2; int len = *arraylen; assert(len < MAXTREESIZE); while (i < len) { e2 = array[i]; array[i] = e; e = e2; i++; } array[len] = e; *arraylen = len+1; } void array_add(bt_element_t *array, int *arraylen, bt_element_t e) { int i; int len = *arraylen; for (i = 0; i < len; i++) if (mycmp(NULL, array[i], e) >= 0) break; assert(i == len || mycmp(NULL, array[i], e) != 0); array_addpos(array, arraylen, e, i); } void array_delpos(bt_element_t *array, int *arraylen, int i) { int len = *arraylen; while (i < len-1) { array[i] = array[i+1]; i++; } *arraylen = len-1; } bt_element_t array_del(bt_element_t *array, int *arraylen, bt_element_t e) { int i; int len = *arraylen; bt_element_t ret; for (i = 0; i < len; i++) if (mycmp(NULL, array[i], e) >= 0) break; if (i < len && mycmp(NULL, array[i], e) == 0) { ret = array[i]; array_delpos(array, arraylen, i); } else ret = NULL; return ret; } /* A sample data set and test utility. Designed for pseudo-randomness, * and yet repeatability. */ /* * This random number generator uses the `portable implementation' * given in ANSI C99 draft N869. It assumes `unsigned' is 32 bits; * change it if not. 
*/ int randomnumber(unsigned *seed) { *seed *= 1103515245; *seed += 12345; return ((*seed) / 65536) % 32768; } #define lenof(x) ( sizeof((x)) / sizeof(*(x)) ) char *strings[] = { "0", "2", "3", "I", "K", "d", "H", "J", "Q", "N", "n", "q", "j", "i", "7", "G", "F", "D", "b", "x", "g", "B", "e", "v", "V", "T", "f", "E", "S", "8", "A", "k", "X", "p", "C", "R", "a", "o", "r", "O", "Z", "u", "6", "1", "w", "L", "P", "M", "c", "U", "h", "9", "t", "5", "W", "Y", "m", "s", "l", "4", }; #define NSTR lenof(strings) void findtest(btree *tree, bt_element_t *array, int arraylen) { static const int rels[] = { BT_REL_EQ, BT_REL_GE, BT_REL_LE, BT_REL_LT, BT_REL_GT }; static const char *const relnames[] = { "EQ", "GE", "LE", "LT", "GT" }; int i, j, rel, index; char *p, *ret, *realret, *realret2; int lo, hi, mid, c; for (i = 0; i < (int)NSTR; i++) { p = strings[i]; for (j = 0; j < (int)(sizeof(rels)/sizeof(*rels)); j++) { rel = rels[j]; lo = 0; hi = arraylen-1; while (lo <= hi) { mid = (lo + hi) / 2; c = strcmp(p, array[mid]); if (c < 0) hi = mid-1; else if (c > 0) lo = mid+1; else break; } if (c == 0) { if (rel == BT_REL_LT) ret = (mid > 0 ? array[--mid] : NULL); else if (rel == BT_REL_GT) ret = (mid < arraylen-1 ? array[++mid] : NULL); else ret = array[mid]; } else { assert(lo == hi+1); if (rel == BT_REL_LT || rel == BT_REL_LE) { mid = hi; ret = (hi >= 0 ? array[hi] : NULL); } else if (rel == BT_REL_GT || rel == BT_REL_GE) { mid = lo; ret = (lo < arraylen ? 
array[lo] : NULL); } else ret = NULL; } realret = bt_findrelpos(tree, p, NULL, rel, &index); testlock(-1, 0, NULL); if (realret != ret) { error("find(\"%s\",%s) gave %s should be %s", p, relnames[j], realret, ret); } if (realret && index != mid) { error("find(\"%s\",%s) gave %d should be %d", p, relnames[j], index, mid); } if (realret && rel == BT_REL_EQ) { realret2 = bt_index(tree, index); if (realret2 != realret) { error("find(\"%s\",%s) gave %s(%d) but %d -> %s", p, relnames[j], realret, index, index, realret2); } } } } realret = bt_findrelpos(tree, NULL, NULL, BT_REL_GT, &index); testlock(-1, 0, NULL); if (arraylen && (realret != array[0] || index != 0)) { error("find(NULL,GT) gave %s(%d) should be %s(0)", realret, index, array[0]); } else if (!arraylen && (realret != NULL)) { error("find(NULL,GT) gave %s(%d) should be NULL", realret, index); } realret = bt_findrelpos(tree, NULL, NULL, BT_REL_LT, &index); testlock(-1, 0, NULL); if (arraylen && (realret != array[arraylen-1] || index != arraylen-1)) { error("find(NULL,LT) gave %s(%d) should be %s(0)", realret, index, array[arraylen-1]); } else if (!arraylen && (realret != NULL)) { error("find(NULL,LT) gave %s(%d) should be NULL", realret, index); } } void splittest(btree *tree, bt_element_t *array, int arraylen) { int i; btree *tree3, *tree4; for (i = 0; i <= arraylen; i++) { printf("splittest: %d\n", i); tree3 = BT_COPY(tree); testlock(-1, 0, NULL); tree4 = bt_splitpos(tree3, i, 0); testlock(-1, 0, NULL); verifytree(tree3, array, i); verifytree(tree4, array+i, arraylen-i); bt_join(tree3, tree4); testlock(-1, 0, NULL); verifytree(tree4, NULL, 0); bt_free(tree4); /* left empty by join */ testlock(-1, 0, NULL); verifytree(tree3, array, arraylen); bt_free(tree3); testlock(-1, 0, NULL); } } /* * Called to track read and write locks on nodes. 
*/ void testlock(int write, int set, nodeptr n) { static nodeptr readlocks[MAXLOCKS], writelocks[MAXLOCKS]; static int nreadlocks = 0, nwritelocks = 0; int i, rp, wp; if (write == -1) { /* Called after an operation to ensure all locks are unlocked. */ if (nreadlocks != 0 || nwritelocks != 0) error("at least one left-behind lock exists!"); return; } /* Locking NULL does nothing. Unlocking it is an error. */ if (n == NULL) { if (!set) error("attempting to %s-unlock NULL", write ? "write" : "read"); return; } assert(nreadlocks < MAXLOCKS && nwritelocks < MAXLOCKS); /* First look for the node in both lock lists. */ rp = wp = -1; for (i = 0; i < nreadlocks; i++) if (readlocks[i] == n) rp = i; for (i = 0; i < nwritelocks; i++) if (writelocks[i] == n) wp = i; /* Now diverge based on what we're supposed to be up to. */ if (set) { /* Setting a lock. Should not already be locked in either list. */ if (rp != -1 || wp != -1) { error("attempt to %s-lock node %p, already %s-locked", (write ? "write" : "read"), n, (rp==-1 ? "write" : "read")); } if (write) writelocks[nwritelocks++] = n; else readlocks[nreadlocks++] = n; } else { /* Clearing a lock. Should exist in exactly the correct list. 
*/ if (write && rp != -1) error("attempt to write-unlock node %p which is read-locked", n); if (!write && wp != -1) error("attempt to read-unlock node %p which is write-locked", n); if (wp != -1) { nwritelocks--; for (i = wp; i < nwritelocks; i++) writelocks[i] = writelocks[i+1]; } if (rp != -1) { nreadlocks--; for (i = rp; i < nreadlocks; i++) readlocks[i] = readlocks[i+1]; } } } int main(void) { int in[NSTR]; int i, j, k; int tworoot, tmplen; unsigned seed = 0; bt_element_t *array; int arraylen; bt_element_t ret, ret2, item; btree *tree, *tree2, *tree3, *tree4; setvbuf(stdout, NULL, _IOLBF, 0); setvbuf(stderr, NULL, _IOLBF, 0); errors = 0; for (i = 0; i < (int)NSTR; i++) in[i] = 0; array = newn(bt_element_t, MAXTREESIZE); arraylen = 0; tree = bt_new(mycmp, NULL, NULL, 2*sizeof(int), alignof(int), mypropmake, mypropmerge, NULL, TEST_DEGREE); verifytree(tree, array, arraylen); for (i = 0; i < 10000; i++) { j = randomnumber(&seed); j %= NSTR; printf("trial: %d\n", i); if (in[j]) { printf("deleting %s (%d)\n", strings[j], j); ret2 = array_del(array, &arraylen, strings[j]); ret = bt_del(tree, strings[j]); testlock(-1, 0, NULL); assert((bt_element_t)strings[j] == ret && ret == ret2); verifytree(tree, array, arraylen); in[j] = 0; } else { printf("adding %s (%d)\n", strings[j], j); array_add(array, &arraylen, strings[j]); ret = bt_add(tree, strings[j]); testlock(-1, 0, NULL); assert(strings[j] == ret); verifytree(tree, array, arraylen); in[j] = 1; } /* disptree(tree); */ findtest(tree, array, arraylen); } while (arraylen > 0) { j = randomnumber(&seed); j %= arraylen; item = array[j]; ret2 = array_del(array, &arraylen, item); ret = bt_del(tree, item); testlock(-1, 0, NULL); assert(ret2 == ret); verifytree(tree, array, arraylen); } bt_free(tree); testlock(-1, 0, NULL); /* * Now try an unsorted tree. 
We don't really need to test * delpos because we know del is based on it, so it's already * been tested in the above sorted-tree code; but for * completeness we'll use it to tear down our unsorted tree * once we've built it. */ tree = bt_new(NULL, NULL, NULL, 2*sizeof(int), alignof(int), mypropmake, mypropmerge, NULL, TEST_DEGREE); verifytree(tree, array, arraylen); for (i = 0; i < 1000; i++) { printf("trial: %d\n", i); j = randomnumber(&seed); j %= NSTR; k = randomnumber(&seed); k %= bt_count(tree)+1; testlock(-1, 0, NULL); printf("adding string %s at index %d\n", strings[j], k); array_addpos(array, &arraylen, strings[j], k); bt_addpos(tree, strings[j], k); testlock(-1, 0, NULL); verifytree(tree, array, arraylen); } /* * While we have this tree in its full form, we'll take a copy * of it to use in split and join testing. */ tree2 = BT_COPY(tree); testlock(-1, 0, NULL); verifytree(tree2, array, arraylen);/* check the copy is accurate */ /* * Split tests. Split the tree at every possible point and * check the resulting subtrees. */ tworoot = bt_tworoot(tree2); /* see if it has a 2-root */ testlock(-1, 0, NULL); splittest(tree2, array, arraylen); /* * Now do the split test again, but on a tree that has a 2-root * (if the previous one didn't) or doesn't (if the previous one * did). */ tmplen = arraylen; while (bt_tworoot(tree2) == tworoot) { bt_delpos(tree2, --tmplen); testlock(-1, 0, NULL); } printf("now trying splits on second tree\n"); splittest(tree2, array, tmplen); bt_free(tree2); testlock(-1, 0, NULL); /* * Back to the main testing of uncounted trees. 
*/ while (bt_count(tree) > 0) { printf("cleanup: tree size %d\n", bt_count(tree)); j = randomnumber(&seed); j %= bt_count(tree); printf("deleting string %s from index %d\n", (char *)array[j], j); ret = bt_delpos(tree, j); testlock(-1, 0, NULL); assert((bt_element_t)array[j] == ret); array_delpos(array, &arraylen, j); verifytree(tree, array, arraylen); } bt_free(tree); testlock(-1, 0, NULL); /* * Finally, do some testing on split/join on _sorted_ trees. At * the same time, we'll be testing split on very small trees. */ tree = bt_new(mycmp, NULL, NULL, 2*sizeof(int), alignof(int), mypropmake, mypropmerge, NULL, TEST_DEGREE); arraylen = 0; for (i = 0; i < 16; i++) { array_add(array, &arraylen, strings[i]); ret = bt_add(tree, strings[i]); testlock(-1, 0, NULL); assert(strings[i] == ret); verifytree(tree, array, arraylen); tree2 = BT_COPY(tree); splittest(tree2, array, arraylen); testlock(-1, 0, NULL); bt_free(tree2); testlock(-1, 0, NULL); } bt_free(tree); testlock(-1, 0, NULL); /* * Test silly cases of join: join(emptytree, emptytree), and * also ensure join correctly spots when sorted trees fail the * ordering constraint. */ tree = bt_new(mycmp, NULL, NULL, 2*sizeof(int), alignof(int), mypropmake, mypropmerge, NULL, TEST_DEGREE); tree2 = bt_new(mycmp, NULL, NULL, 2*sizeof(int), alignof(int), mypropmake, mypropmerge, NULL, TEST_DEGREE); tree3 = bt_new(mycmp, NULL, NULL, 2*sizeof(int), alignof(int), mypropmake, mypropmerge, NULL, TEST_DEGREE); tree4 = bt_new(mycmp, NULL, NULL, 2*sizeof(int), alignof(int), mypropmake, mypropmerge, NULL, TEST_DEGREE); assert(mycmp(NULL, strings[0], strings[1]) < 0); /* just in case :-) */ bt_add(tree2, strings[1]); testlock(-1, 0, NULL); bt_add(tree4, strings[0]); testlock(-1, 0, NULL); array[0] = strings[0]; array[1] = strings[1]; verifytree(tree, array, 0); verifytree(tree2, array+1, 1); verifytree(tree3, array, 0); verifytree(tree4, array, 1); /* * So: * - join(tree,tree3) should leave both tree and tree3 unchanged. 
* - joinr(tree,tree2) should leave both tree and tree2 unchanged. * - join(tree4,tree3) should leave both tree3 and tree4 unchanged. * - join(tree, tree2) should move the element from tree2 to tree. * - joinr(tree4, tree3) should move the element from tree4 to tree3. * - join(tree,tree3) should return NULL and leave both unchanged. * - join(tree3,tree) should work and create a bigger tree in tree3. */ assert(tree == bt_join(tree, tree3)); testlock(-1, 0, NULL); verifytree(tree, array, 0); verifytree(tree3, array, 0); assert(tree2 == bt_joinr(tree, tree2)); testlock(-1, 0, NULL); verifytree(tree, array, 0); verifytree(tree2, array+1, 1); assert(tree4 == bt_join(tree4, tree3)); testlock(-1, 0, NULL); verifytree(tree3, array, 0); verifytree(tree4, array, 1); assert(tree == bt_join(tree, tree2)); testlock(-1, 0, NULL); verifytree(tree, array+1, 1); verifytree(tree2, array, 0); assert(tree3 == bt_joinr(tree4, tree3)); testlock(-1, 0, NULL); verifytree(tree3, array, 1); verifytree(tree4, array, 0); assert(NULL == bt_join(tree, tree3)); testlock(-1, 0, NULL); verifytree(tree, array+1, 1); verifytree(tree3, array, 1); assert(tree3 == bt_join(tree3, tree)); testlock(-1, 0, NULL); verifytree(tree3, array, 2); verifytree(tree, array, 0); bt_free(tree); testlock(-1, 0, NULL); bt_free(tree2); testlock(-1, 0, NULL); bt_free(tree3); testlock(-1, 0, NULL); bt_free(tree4); testlock(-1, 0, NULL); sfree(array); if (errors) fprintf(stderr, "%d errors!\n", errors); return (errors != 0 ? 
1 : 0); } #endif tweak-3.01/buffer.c0000644000175300017530000003061710433027777014510 0ustar simonsimon00000000000000#include "tweak.h" #include #include #include #include #include #include "btree.h" #ifdef TEST_BUFFER #define BLKMIN 4 #else #define BLKMIN 512 #endif #define BLKMAX (2*BLKMIN) struct file { FILE *fp; int refcount; }; struct buffer { btree *bt; }; struct bufblk { fileoffset_t len; /* number of bytes in block, always */ struct file *file; /* non-NULL indicates a file block */ fileoffset_t filepos; /* only meaningful if fp!=NULL */ unsigned char *data; /* only used if fp==NULL */ }; static bt_element_t bufblkcopy(void *state, void *av) { struct bufblk *a = (struct bufblk *)av; struct bufblk *ret; if (a->file) { ret = (struct bufblk *)malloc(sizeof(struct bufblk)); ret->data = NULL; a->file->refcount++; } else { ret = (struct bufblk *)malloc(sizeof(struct bufblk) + BLKMAX); ret->data = (unsigned char *)(ret+1); memcpy(ret->data, a->data, BLKMAX); } ret->file = a->file; ret->filepos = a->filepos; ret->len = a->len; return ret; } static void bufblkfree(void *state, void *av) { struct bufblk *a = (struct bufblk *)av; if (a->file) { a->file->refcount--; if (a->file->refcount == 0) { fclose(a->file->fp); free(a->file); } } free(a); } void bufblkpropmake(void *state, bt_element_t av, void *destv) { struct bufblk *a = (struct bufblk *)av; fileoffset_t *dest = (fileoffset_t *)destv; *dest = a->len; } /* s1 may be NULL (indicating copy s2 into dest). s2 is never NULL. */ void bufblkpropmerge(void *state, void *s1v, void *s2v, void *destv) { fileoffset_t *s1 = (fileoffset_t *)s1v; fileoffset_t *s2 = (fileoffset_t *)s2v; fileoffset_t *dest = (fileoffset_t *)destv; if (!s1 && !s2) return; /* don't need to free anything */ *dest = *s2 + (s1 ? 
*s1 : 0); } static buffer *buf_new_from_bt(btree *bt) { buffer *buf = (buffer *)malloc(sizeof(buffer)); buf->bt = bt; return buf; } static btree *buf_bt_new(void) { return bt_new(NULL, bufblkcopy, bufblkfree, sizeof(fileoffset_t), alignof(fileoffset_t), bufblkpropmake, bufblkpropmerge, NULL, 2); } extern void buf_free(buffer *buf) { bt_free(buf->bt); free(buf); } static int bufblksearch(void *tstate, void *sstate, int ntrees, void **props, int *counts, bt_element_t *elts, int *is_elt) { fileoffset_t *disttogo = (fileoffset_t *)sstate; fileoffset_t distsofar = 0; int i; for (i = 0; i < ntrees; i++) { struct bufblk *blk; fileoffset_t sublen = props[i] ? *(fileoffset_t *)props[i] : 0; if ((props[i] && *disttogo < distsofar + sublen) || (*disttogo == distsofar + sublen && i == ntrees-1)) { *disttogo -= distsofar; /* * Descend into this subtree. */ *is_elt = FALSE; return i; } distsofar += sublen; if (i < ntrees-1) { blk = (struct bufblk *)elts[i]; if (*disttogo < distsofar + blk->len) { /* * Select this element. */ *disttogo -= distsofar; *is_elt = TRUE; return i; } distsofar += blk->len; } } assert(!"We should never reach here"); return 0; /* placate gcc */ } static int buf_bt_find_pos(btree *bt, fileoffset_t pos, fileoffset_t *poswithin) { int index; bt_propfind(bt, bufblksearch, &pos, &index); *poswithin = pos; return index; } /* * Convert a file-data block of size at most BUFMAX into a * literal-data block. Returns the replacement block (the old one * still needs freeing) or NULL if no conversion performed. */ static struct bufblk *buf_convert_to_literal(struct bufblk *blk) { if (blk->file && blk->len <= BLKMAX) { struct bufblk *ret = (struct bufblk *)malloc(sizeof(struct bufblk) + BLKMAX); ret->data = (unsigned char *)(ret+1); ret->file = NULL; ret->filepos = 0; ret->len = blk->len; fseeko(blk->file->fp, blk->filepos, SEEK_SET); fread(ret->data, blk->len, 1, blk->file->fp); return ret; } return NULL; } /* * Look at blocks `index' and `index+1' of buf. 
If they're both * literal-data blocks and one of them is undersized, merge or * redistribute. Returns 0 if it has not changed the number of * blocks, or 1 if it has merged two. */ static int buf_bt_cleanup(btree *bt, int index) { struct bufblk *a, *b, *cvt; fileoffset_t totallen; unsigned char tmpdata[BLKMAX*2]; if (index < 0) return 0; a = (struct bufblk *)bt_index(bt, index); b = (struct bufblk *)bt_index(bt, index+1); if ( a && (cvt = buf_convert_to_literal(a)) != NULL ) { bt_replace(bt, cvt, index); bufblkfree(NULL, a); a = cvt; } if ( b && (cvt = buf_convert_to_literal(b)) != NULL ) { bt_replace(bt, cvt, index+1); bufblkfree(NULL, b); b = cvt; } if (!a || !b || a->file || b->file) return 0; if (a->len >= BLKMIN && b->len >= BLKMIN) return 0; assert(a->len <= BLKMAX && b->len <= BLKMAX); /* Use bt_index_w to ensure reference count of 1 on both blocks */ a = (struct bufblk *)bt_index_w(bt, index); b = (struct bufblk *)bt_index_w(bt, index+1); /* * So, we have one block with size at most BLKMIN, and another * with size at most BLKMAX. Combined, their maximum possible * size is in excess of BLKMAX, so we can't guaranteeably merge * them into one. If they won't merge, we instead redistribute * data between them. */ totallen = a->len + b->len; memcpy(tmpdata, a->data, a->len); memcpy(tmpdata + a->len, b->data, b->len); if (totallen >= BLKMAX) { /* * Redistribute into two (nearly) equal-sized blocks. */ a->len = totallen / 2; b->len = totallen - a->len; memcpy(a->data, tmpdata, a->len); memcpy(b->data, tmpdata + a->len, b->len); bt_replace(bt, a, index); bt_replace(bt, b, index+1); return 0; } else { /* * Just merge into one. 
*/ a->len = totallen; memcpy(a->data, tmpdata, a->len); bt_replace(bt, a, index); free(bt_delpos(bt, index+1)); return 1; } } static int buf_bt_splitpoint(btree *bt, fileoffset_t pos) { fileoffset_t poswithin; int index; struct bufblk *blk, *newblk; index = buf_bt_find_pos(bt, pos, &poswithin); if (!poswithin) return index; /* the nice simple case */ /* * Now split element `index' at position `poswithin'. */ blk = (struct bufblk *)bt_index_w(bt, index); /* ensure ref count == 1 */ newblk = (struct bufblk *)bufblkcopy(NULL, blk); if (!newblk->file) { memcpy(newblk->data, blk->data + poswithin, blk->len - poswithin); } else { newblk->filepos += poswithin; } blk->len = poswithin; bt_replace(bt, blk, index); newblk->len -= poswithin; bt_addpos(bt, newblk, index+1); buf_bt_cleanup(bt, index+1); index -= buf_bt_cleanup(bt, index-1); return index + 1; } static btree *buf_bt_split(btree *bt, fileoffset_t pos, int before) { int index = buf_bt_splitpoint(bt, pos); return bt_splitpos(bt, index, before); } static btree *buf_bt_join(btree *a, btree *b) { int index = bt_count(a) - 1; btree *ret; ret = bt_join(a, b); buf_bt_cleanup(ret, index); return ret; } static void buf_insert_bt(buffer *buf, btree *bt, fileoffset_t pos) { btree *right = buf_bt_split(buf->bt, pos, FALSE); buf->bt = buf_bt_join(buf->bt, bt); buf->bt = buf_bt_join(buf->bt, right); } static int bufblklensearch(void *tstate, void *sstate, int ntrees, void **props, int *counts, bt_element_t *elts, int *is_elt) { fileoffset_t *output = (fileoffset_t *)sstate; fileoffset_t size = 0; int i; for (i = 0; i < ntrees; i++) { struct bufblk *blk; if (props[i]) size += *(fileoffset_t *)props[i]; if (i < ntrees-1) { blk = (struct bufblk *)elts[i]; size += blk->len; } } *output = size; /* Actual return value doesn't matter */ *is_elt = TRUE; return 1; } static fileoffset_t buf_bt_length(btree *bt) { fileoffset_t length; bt_propfind(bt, bufblklensearch, &length, NULL); return length; } extern fileoffset_t buf_length(buffer 
*buf) { return buf_bt_length(buf->bt); } extern buffer *buf_new_empty(void) { buffer *buf = (buffer *)malloc(sizeof(buffer)); buf->bt = buf_bt_new(); return buf; } extern buffer *buf_new_from_file(FILE *fp) { buffer *buf = buf_new_empty(); struct bufblk *blk; struct file *file; file = (struct file *)malloc(sizeof(struct file)); file->fp = fp; file->refcount = 1; /* the reference we're about to make */ blk = (struct bufblk *)malloc(sizeof(struct bufblk)); blk->data = NULL; blk->file = file; blk->filepos = 0; fseeko(fp, 0, SEEK_END); blk->len = ftello(fp); bt_addpos(buf->bt, blk, 0); buf_bt_cleanup(buf->bt, 0); return buf; } extern void buf_fetch_data(buffer *buf, void *vdata, int len, fileoffset_t pos) { int index; fileoffset_t poswithin; fileoffset_t thislen; unsigned char *data = (unsigned char *)vdata; index = buf_bt_find_pos(buf->bt, pos, &poswithin); while (len > 0) { struct bufblk *blk = (struct bufblk *)bt_index(buf->bt, index); thislen = blk->len - poswithin; if (thislen > len) thislen = len; if (blk->file) { fseeko(blk->file->fp, blk->filepos + poswithin, SEEK_SET); fread(data, thislen, 1, blk->file->fp); } else { memcpy(data, blk->data + poswithin, thislen); } data += thislen; len -= thislen; poswithin = 0; index++; } } extern void buf_insert_data(buffer *buf, void *vdata, int len, fileoffset_t pos) { btree *bt = buf_bt_new(); int nblocks, blklen1, extra; int i, origlen = len; unsigned char *data = (unsigned char *)vdata; nblocks = len / ((BLKMIN + BLKMAX)/2); if (nblocks * BLKMAX < len) nblocks++; blklen1 = len / nblocks; extra = len % nblocks; assert(blklen1 >= BLKMIN || nblocks == 1); assert(blklen1 <= BLKMAX - (extra!=0)); for (i = 0; i < nblocks; i++) { struct bufblk *blk; int blklen = blklen1 + (i < extra); blk = (struct bufblk *)malloc(sizeof(struct bufblk) + BLKMAX); blk->data = (unsigned char *)(blk+1); memcpy(blk->data, data, blklen); blk->len = blklen; blk->file = NULL; blk->filepos = 0; data += blklen; len -= blklen; bt_addpos(bt, blk, i); 
assert(origlen == buf_bt_length(bt) + len); } assert(len == 0); assert(origlen == buf_bt_length(bt)); buf_insert_bt(buf, bt, pos); } extern void buf_delete(buffer *buf, fileoffset_t len, fileoffset_t pos) { btree *left = buf_bt_split(buf->bt, pos, TRUE); btree *right = buf_bt_split(buf->bt, len, FALSE); bt_free(buf->bt); buf->bt = buf_bt_join(left, right); } extern void buf_overwrite_data(buffer *buf, void *data, int len, fileoffset_t pos) { buf_delete(buf, len, pos); buf_insert_data(buf, data, len, pos); } extern buffer *buf_cut(buffer *buf, fileoffset_t len, fileoffset_t pos) { btree *left = buf_bt_split(buf->bt, pos, TRUE); btree *right = buf_bt_split(buf->bt, len, FALSE); btree *ret = buf->bt; buf->bt = buf_bt_join(left, right); return buf_new_from_bt(ret); } extern buffer *buf_copy(buffer *buf, fileoffset_t len, fileoffset_t pos) { btree *left = buf_bt_split(buf->bt, pos, TRUE); btree *right = buf_bt_split(buf->bt, len, FALSE); btree *ret = bt_clone(buf->bt); buf->bt = buf_bt_join(left, buf->bt); buf->bt = buf_bt_join(buf->bt, right); return buf_new_from_bt(ret); } extern void buf_paste(buffer *buf, buffer *cutbuffer, fileoffset_t pos) { btree *bt = bt_clone(cutbuffer->bt); buf_insert_bt(buf, bt, pos); } #ifdef TEST_BUFFER static FILE *debugfp = NULL; extern void buffer_diagnostic(buffer *buf, char *title) { int i; fileoffset_t offset; struct bufblk *blk; if (!debugfp) { debugfp = fdopen(3, "w"); if (!debugfp) debugfp = fopen("debug.log", "w"); } if (!buf) { fprintf(debugfp, "Buffer [%s] is null\n", title); return; } fprintf(debugfp, "Listing of buffer [%s]:\n", title); offset = 0; for (i = 0; (blk = (struct bufblk *)bt_index(buf->bt, i)) != NULL; i++) { fprintf(debugfp, "%016"OFF"x: %p, len =%8"OFF"d,", offset, blk, blk->len); if (blk->file) { fprintf(debugfp, " file %p pos %8"OFF"d\n", blk->file, blk->filepos); } else { int j; for (j = 0; j < blk->len; j++) fprintf(debugfp, " %02x", blk->data[j]); fprintf(debugfp, "\n"); } offset += blk->len; } 
fprintf(debugfp, "Listing concluded\n\n"); fflush(debugfp); } #endif tweak-3.01/curses.c0000644000175300017530000000470010433027777014535 0ustar simonsimon00000000000000#include "tweak.h" #include #include #include #include #include #include int display_rows, display_cols; void display_beep(void) { beep(); } static void get_screen_size (void) { getmaxyx(stdscr, display_rows, display_cols); } void display_setup(void) { initscr(); noecho(); keypad(stdscr, 0); raw(); move(0,0); refresh(); get_screen_size(); if (has_colors()) { start_color(); use_default_colors(); } } void display_cleanup(void) { endwin(); } void display_moveto(int y, int x) { wmove(stdscr, y, x); } void display_refresh(void) { refresh(); } void display_write_str(char *str) { waddstr(stdscr, str); } void display_write_chars(char *str, int len) { waddnstr(stdscr, str, len); } #define MAXCOLOURS 32 int attrs[MAXCOLOURS]; void display_define_colour(int colour, int fg, int bg, int reverse) { static int colours[8] = { COLOR_BLACK, COLOR_RED, COLOR_GREEN, COLOR_YELLOW, COLOR_BLUE, COLOR_MAGENTA, COLOR_CYAN, COLOR_WHITE, }; if (fg < 0 && bg < 0) { attrs[colour] = 0; } else { assert(colour >= 0 && colour < MAXCOLOURS); assert(!(bg & ~7)); /* bold backgrounds are nonportable */ if (colour < COLOR_PAIRS-2) { init_pair(colour+1, colours[fg & 7], colours[bg]); attrs[colour] = (fg & 8 ? A_BOLD : 0) | COLOR_PAIR(colour+1); } else { /* can't allocate a colour pair, so we just use b&w attrs */ attrs[colour] = (fg & 8 ? A_BOLD : 0) | (reverse ? 
A_REVERSE : 0); } } } void display_set_colour(int colour) { wattrset(stdscr, attrs[colour]); } void display_clear_to_eol(void) { wclrtoeol(stdscr); } int last_getch = ERR; int display_getkey(void) { int ret; extern void schedule_update(void); if (last_getch != ERR) { int ret = last_getch; last_getch = ERR; return ret; } while (1) { ret = getch(); if (ret == KEY_RESIZE) { schedule_update(); continue; } return ret; } } int display_input_to_flush(void) { int ret; if (last_getch != ERR) return TRUE; nodelay(stdscr, 1); ret = getch(); nodelay(stdscr, 0); if (ret == ERR) return FALSE; last_getch = ret; return TRUE; } void display_post_error(void) { /* I don't _think_ we need do anything here */ } void display_recheck_size(void) { get_screen_size (); } tweak-3.01/keytab.c0000644000175300017530000000504510433027777014513 0ustar simonsimon00000000000000#include "tweak.h" #include #include #include #include typedef union keytab keytab; union keytab { enum {ACTION, EXTENDED} type; struct { int type; keyact action; } a; struct { int type; keytab *extended[256]; } e; }; keytab *base[256] = { NULL256 }; /* * Bind a key sequence to an action. */ void bind_key (char *sequence, int len, keyact action) { keytab *(*table)[256]; int k, i; table = &base; while (--len) { k = (unsigned char) *sequence++; if ( !(*table)[k] ) { /* * We must create an EXTENDED entry. */ (*table)[k] = malloc(sizeof(base[0]->e)); (*table)[k]->type = EXTENDED; for (i=0; i<256; i++) (*table)[k]->e.extended[i] = NULL; } else if ( (*table)[k]->type == ACTION ) { /* * A subsequence is already bound: fail. */ return; } table = &(*table)[k]->e.extended; } k = (unsigned char) *sequence; if ( !(*table)[k] ) { /* * We can bind the key. */ (*table)[k] = malloc(sizeof(base[0]->a)); (*table)[k]->type = ACTION; (*table)[k]->a.action = action; } } /* * Format an ASCII code into a printable description of the key stroke. 
*/ static void strkey (char *s, int k) { k &= 255; /* force unsigned */ if (k==27) strcpy(s, " ESC"); else if (k<32 || k==127) sprintf(s, " ^%c", k ^ 64); else if (k<127) sprintf(s, " %c", k); else sprintf(s, " <0x%2X>", k); } /* * Get and process a key stroke. */ void proc_key (void) { keytab *kt; #if defined(unix) && !defined(GO32) if (update_required) update(); safe_update = TRUE; #endif last_char = display_getkey(); #if defined(unix) && !defined(GO32) safe_update = FALSE; #endif strcpy(message, "Unknown key sequence"); strkey(message+strlen(message), last_char); kt = base[(unsigned char) last_char]; if (!kt) { display_beep(); while (display_input_to_flush()) strkey(message+strlen(message), display_getkey()); return; } while (kt->type == EXTENDED) { #if defined(unix) && !defined(GO32) if (update_required) update(); safe_update = TRUE; #endif last_char = display_getkey(); #if defined(unix) && !defined(GO32) safe_update = FALSE; #endif strkey(message+strlen(message), last_char); kt = kt->e.extended[(unsigned char) last_char]; if (!kt) { display_beep(); while (display_input_to_flush()) strkey(message+strlen(message), display_getkey()); return; } } message[0] = '\0'; /* clear the "unknown" message */ (*kt->a.action)(); } tweak-3.01/main.c0000644000175300017530000005670710433027777014173 0ustar simonsimon00000000000000/* * Potential future TODO items. Points marked ISSUE need to be * resolved one way or another, with good justification for the * decision made, before implementation begins. * * - Multiple buffers, multiple on-screen windows. 
* + ^X^F to open new file * + ^X^R to open new file RO * + ^X b to switch buffers in a window * + ^X o to switch windows * + ^X 2 to split a window * + ^X 1 to destroy all windows but this * + ^X 0 to destroy this window * + ^X ^ to enlarge this window by one line * + width settings vary per buffer (aha, _that's_ why I wanted * a buffer structure surrounding the raw B-tree) * + hex-editor-style minibuffer for entering search terms, * rather than the current rather crap one; in particular * this enables pasting into the search string. * + ISSUE: how exactly do we deal with the problem of saving * over a file which we're maintaining references to in * another buffer? The _current_ buffer can at least be * sorted out by replacing it with a fresh tree containing a * single file-data block, but other buffers are in trouble. * * if we can rely on Unix fd semantics, one option is just * to keep the fd open on the original file, and then the * data stays around even after we rename(2) our new * version over the top. Disk space usage gets silly after * a few iterations, but it's better than nothing. * * - Undo! * + this actually doesn't seem _too_ horrid. For a start, one * simple approach would be to clone the entire buffer B-tree * every time we perform an operation! That's actually not * _too_ expensive, if we maintain a limit on the number of * operations we may undo. * + I had also thought of cloning the tree we insert for each * buf_insert_data and cloning the one removed for each * buf_delete_data (both must be cloned for an overwrite), * but I'm not convinced that simply cloning the entire thing * isn't a superior option. * + this really starts to show up the distinction between a * `buffer' and a bare tree. A buffer is something which has * an undo chain attached; so, in particular, the cut buffer * shouldn't be one. Sort that out. * * - In-place editing. 
* + this is an extra option useful for editing disk devices * directly (!), or other situation in which it's impossible * or impractical to rename(2) your new file over the old * one. It causes a change of semantics when saving: instead * of constructing a new backup file and writing it over the * old one, we simply seek within the original file and write * out all the pieces that have changed. * + Saving the file involves identifying the bits of the file * that need to change, and changing them. A piece of file * can be discarded as `no change required' if it's * represented in the buffer by a from-file block whose file * offset is equal to its offset in the buffer. * * Once we have identified all the bits that do need to * change, we have to draw up a dependency graph to * indicate which bits want to be copied from which other * bits. (You don't want to overwrite a piece of file if * you still have from-file blocks pointing at that * piece.) This is a directed graph with nodes * corresponding to intervals of the file, and edges * indicating that the source node's interval is intended * to end up containing the data from the target node's * interval in the original file. Another node type is * `literal data', which can be the target of an edge but * never the source. * - note that this means any two nodes connected by an * edge must represent intervals of the same length. * Sometimes this means that an interval must be split * into pieces even though it is represented in the * buffer by a single large from-file block (if * from-file blocks copying _from_ it don't cover the * whole of it). I suspect the simplest approach here * is just to start by making a B-tree of division * points in the file: every from-file block adds four * division points (for start and end of both source * and dest interval), and once the tree is complete, * each graph node represents the interval between two * adjacent division points. 
* - ISSUE: actually, that strategy is inadequate: * consider a large from-file block displaced by only * one byte from its source location. The above * strategy gives division points at x, x+1, x+y, * x+y+1, but the interval [x,x+1] actually wants to * point to [x+1,x+2] and we don't have a division * point for that. Worse still, finding a way to add * the remaining division points is also undesirable * because there'd be so many of them. Needs design * changes. * * Then, any node which is not the target of any edge * represents a piece of file which it's safe to write * over, so we do so and throw away the node. * * If we run out of such nodes and the graph is still * non-empty, it's because all remaining nodes are part of * loops. A loop must represent a set of disjoint * intervals in the file, all the same length, which need * to be permuted cyclically. So we deal with such a loop * by reading a chunk of data from the start of one of the * intervals and holding it, then copying from the next * interval to that one, and so on until we've gone round * the loop. * + the intervals in the loop might be far too big to * hold an entire interval's worth of real data in * memory, so we might have to do it piecewise. * + ISSUE: I wonder if a warning of some sort might be in * order for if you accidentally request most of the file be * moved about. This sort of trickery is really intended for * small changes to a large file; if you (say) enable insert * mode while editing a hard disk and accidentally leave * everything one byte further up, you _really_ don't want to * hit Save. The semantics of the warning are difficult, * though. * * - Custom display and/or input formats? * + for example, Zap on RISC OS is able to display a binary * file at 4 bytes per line and show the ARM disassembly of * each word. For added credit, ability to type an ARM * instruction back _in_ and have it reassembled into binary * would be even better. 
* + a simpler example is that sometimes you want to view a * file as a sequence of little-endian 32-bit words rather * than single bytes. * + this would have to involve some sort of scripting or * internal API. I'd really rather the interface was nailed * down very early on and people were then free to develop * custom formats without my involvement; I might be * persuaded to keep a library of them or a list of * hyperlinks or something, but actually _maintaining_ them * is more effort than I want. * + ARM assembler is all very well, but what about x86, with * its variable instruction length? You can start * disassembling from any byte position and work forwards * unambiguously, but going backwards or jumping to an * arbitrary byte position is much harder. You might have to * shift your current file view back or forward by one byte * to resynchronise, and the semantics of insert mode become * generally confused, and even trying to _predict_ what a * sensible synchronisation point would be when jumping to a * bit of the file you've never seen before ... yuck. * * The key thing that makes this horrid is that the custom * display mode looks at the file _contents_, not merely * its length, when deciding how many bytes per line to * display. File-position-dependent number of bytes per * line is fine, but _data_ dependency is doom. * * So I think that in the interests of not causing tension * between random things people would like in _some_ hex * editor and what makes Tweak Tweak, I am going to put my * foot down and say that I will not implement any * mechanism which permits a data-dependent number of * bytes per line. Anything short of that, fine, send me a * patch or a detailed and well thought out design and * I'll consider it on its merits. * * I don't, OTOH, see any reason why a custom display * function couldn't be permitted to see data before or * after the current lineful if it wanted to. 
So x86 * disassembly could be done in a one-byte-per-line sort * of fashion in which each line shows the machine * instruction which the CPU would see if it started * executing at that byte, and also gave its length. Then * you could pick out the sequence of instructions you * were interested in from the various out-of-sync ones. */ #include "tweak.h" #include #include #include #include #include #if defined(unix) && !defined(GO32) #include #include #include #elif defined(MSDOS) #include #include #endif static void init(void); static void done(void); static void load_file (char *); char toprint[256]; /* LUT: printable versions of chars */ char hex[256][3]; /* LUT: binary to hex, 1 byte */ char message[80]; char decstatus[] = "%s TWEAK "VER": %-18.18s %s posn=%-10"OFF"d size=%-10"OFF"d"; char hexstatus[] = "%s TWEAK "VER": %-18.18s %s posn=0x%-8"OFF"X size=0x%-8"OFF"X"; char *statfmt = hexstatus; char last_char; char *pname; char *filename = NULL; buffer *filedata, *cutbuffer = NULL; int fix_mode = FALSE; int look_mode = FALSE; int eager_mode = FALSE; int insert_mode = FALSE; int edit_type = 1; /* 1,2 are hex digits, 0=ascii */ int finished = FALSE; int marking = FALSE; int modified = FALSE; int new_file = FALSE; /* shouldn't need initialisation - * but let's not take chances :-) */ fileoffset_t width = 16; fileoffset_t realoffset = 0, offset = 16; int ascii_enabled = TRUE; fileoffset_t file_size = 0, top_pos = 0, cur_pos = 0, mark_point = 0; int scrlines; /* * Main program */ int main(int argc, char **argv) { fileoffset_t newoffset = -1, newwidth = -1; /* * Parse command line arguments */ pname = *argv; /* program name */ if (argc < 2) { fprintf(stderr, "usage: %s [-f] [-l] [-e] filename\n" " or %s -D to write default tweak.rc to stdout\n", pname, pname); return 0; } while (--argc > 0) { char c, *p = *++argv, *value; if (*p == '-') { p++; while (*p) switch (c = *p++) { case 'o': case 'O': case 'w': case 'W': /* * these parameters require arguments */ if (*p) value = 
p, p = ""; else if (--argc) value = *++argv; else { fprintf(stderr, "%s: option `-%c' requires an argument\n", pname, c); return 1; } switch (c) { case 'o': case 'O': newoffset = parse_num(value, NULL); break; case 'w': case 'W': newwidth = parse_num(value, NULL); break; } break; case 'f': case 'F': fix_mode = TRUE; break; case 'l': case 'L': look_mode = TRUE; break; case 'e': case 'E': eager_mode = TRUE; break; case 'D': write_default_rc(); return 0; break; } } else { if (filename) { fprintf(stderr, "%s: multiple filenames specified\n", pname); return 1; } filename = p; } } if (!filename) { fprintf(stderr, "%s: no filename specified\n", pname); return 1; } read_rc(); if (newoffset != -1) realoffset = newoffset; if (newwidth != -1) width = newwidth; load_file (filename); init(); fix_offset(); do { draw_scr (); proc_key (); } while (!finished); done(); return 0; } /* * Fix up `offset' to match `realoffset'. Also, while we're here, * enable or disable ASCII mode and sanity-check the width. */ void fix_offset(void) { if (3*width+11 > display_cols) { width = (display_cols-11) / 3; sprintf (message, "Width reduced to %"OFF"d to fit on the screen", width); } if (4*width+14 > display_cols) { ascii_enabled = FALSE; if (edit_type == 0) edit_type = 1; /* force to hex mode */ } else ascii_enabled = TRUE; offset = realoffset % width; if (!offset) offset = width; } /* * Initialise stuff at the beginning of the program: mostly the * display. */ static void init(void) { int i; display_setup(); display_define_colour(COL_BUFFER, -1, -1, FALSE); display_define_colour(COL_SELECT, 0, 7, TRUE); display_define_colour(COL_STATUS, 11, 4, TRUE); display_define_colour(COL_ESCAPE, 9, 0, FALSE); display_define_colour(COL_INVALID, 11, 0, FALSE); for (i=0; i<256; i++) { sprintf(hex[i], "%02X", i); toprint[i] = (i>=32 && i<127 ? i : '.'); } } /* * Clean up all the stuff that init() did. */ static void done(void) { display_cleanup(); } /* * Load the file specified on the command line. 
*/ static void load_file (char *fname) { FILE *fp; file_size = 0; if ( (fp = fopen (fname, "rb")) ) { if (eager_mode) { size_t len; static char buffer[4096]; filedata = buf_new_empty(); file_size = 0; /* * We've opened the file. Load it. */ while ( (len = fread (buffer, 1, sizeof(buffer), fp)) > 0 ) { buf_insert_data (filedata, buffer, len, file_size); file_size += len; } fclose (fp); assert(file_size == buf_length(filedata)); sprintf(message, "loaded %s (size %"OFF"d == 0x%"OFF"X).", fname, file_size, file_size); } else { filedata = buf_new_from_file(fp); file_size = buf_length(filedata); sprintf(message, "opened %s (size %"OFF"d == 0x%"OFF"X).", fname, file_size, file_size); } new_file = FALSE; } else { if (look_mode || fix_mode) { fprintf(stderr, "%s: file %s not found, and %s mode active\n", pname, fname, (look_mode ? "LOOK" : "FIX")); exit (1); } filedata = buf_new_empty(); sprintf(message, "New file %s.", fname); new_file = TRUE; } } /* * Save the file. Return TRUE on success, FALSE on error. */ int save_file (void) { FILE *fp; fileoffset_t pos = 0; if (look_mode) return FALSE; /* do nothing! */ if ( (fp = fopen (filename, "wb")) ) { static char buffer[SAVE_BLKSIZ]; while (pos < file_size) { fileoffset_t size = file_size - pos; if (size > SAVE_BLKSIZ) size = SAVE_BLKSIZ; buf_fetch_data (filedata, buffer, size, pos); if (size != fwrite (buffer, 1, size, fp)) { fclose (fp); return FALSE; } pos += size; } } else return FALSE; fclose (fp); return TRUE; } /* * Make a backup of the file, if such has not already been done. * Return TRUE on success, FALSE on error. 
*/ int backup_file (void) { char backup_name[FILENAME_MAX]; if (new_file) return TRUE; /* unnecessary - pretend it's done */ strcpy (backup_name, filename); #if defined(unix) && !defined(GO32) strcat (backup_name, ".bak"); #elif defined(MSDOS) { char *p, *q; q = NULL; for (p = backup_name; *p; p++) { if (*p == '\\') q = NULL; else if (*p == '.') q = p; } if (!q) q = p; strcpy (q, ".BAK"); } #endif remove (backup_name); /* don't care if this fails */ return !rename (filename, backup_name); } static unsigned char *scrbuf = NULL; static int scrbuflines = 0; /* * Draw the screen, for normal usage. */ void draw_scr (void) { int scrsize, scroff, llen, i, j; fileoffset_t currpos; fileoffset_t marktop, markbot; int mark; char *p; unsigned char c, *q; char *linebuf; scrlines = display_rows - 2; if (scrlines > scrbuflines) { scrbuf = (scrbuf ? realloc(scrbuf, scrlines*width) : malloc(scrlines*width)); if (!scrbuf) { done(); fprintf(stderr, "%s: out of memory!\n", pname); exit (2); } scrbuflines = scrlines; } linebuf = malloc(width*4+20); if (!linebuf) { done(); fprintf(stderr, "%s: out of memory!\n", pname); exit (2); } memset (linebuf, ' ', width*4+13); linebuf[width*4+13] = '\0'; if (top_pos == 0) scroff = width - offset; else scroff = 0; scrsize = scrlines * width - scroff; if (scrsize > file_size - top_pos) scrsize = file_size - top_pos; buf_fetch_data (filedata, scrbuf, scrsize, top_pos); scrsize += scroff; /* hack but it'll work */ mark = marking && (cur_pos != mark_point); if (mark) { if (cur_pos > mark_point) marktop = mark_point, markbot = cur_pos; else marktop = cur_pos, markbot = mark_point; } else marktop = markbot = 0; /* placate gcc */ currpos = top_pos; q = scrbuf; for (i=0; i> 24) & 0xFF]; linebuf[0]=p[0]; linebuf[1]=p[1]; p = hex[(currpos >> 16) & 0xFF]; linebuf[2]=p[0]; linebuf[3]=p[1]; p = hex[(currpos >> 8) & 0xFF]; linebuf[4]=p[0]; linebuf[5]=p[1]; p = hex[currpos & 0xFF]; linebuf[6]=p[0]; linebuf[7]=p[1]; for (j=0; j 0) { if (currpos == 0 && j < 
width-offset) p = " ", c = ' '; else p = hex[*q], c = *q++; scrsize--; } else { p = " ", c = ' '; } linebuf[11+3*j]=p[0]; linebuf[12+3*j]=p[1]; linebuf[13+3*width+j]=toprint[c]; } llen = (currpos ? width : offset); if (mark && currposmarktop) { /* * Some of this line is marked. Maybe all. Whatever * the precise details, there will be two regions * requiring highlighting: a hex bit and an ascii * bit. */ fileoffset_t localstart= (currposmarkbot ? markbot : currpos+llen) - currpos; localstart += width-llen; localstop += width-llen; display_write_chars(linebuf, 11+3*localstart); display_set_colour(COL_SELECT); display_write_chars(linebuf+11+3*localstart, 3*(localstop-localstart)-1); display_set_colour(COL_BUFFER); if (ascii_enabled) { display_write_chars(linebuf+10+3*localstop, 3+3*width+localstart-3*localstop); display_set_colour(COL_SELECT); display_write_chars(linebuf+13+3*width+localstart, localstop-localstart); display_set_colour(COL_BUFFER); display_write_chars(linebuf+13+3*width+localstop, width-localstop); } else { display_write_chars(linebuf+10+3*localstop, 2+3*width-3*localstop); } } else { display_set_colour(COL_BUFFER); display_write_chars(linebuf, ascii_enabled ? 13+4*width : 10+3*width); } } currpos += (currpos ? width : offset); display_clear_to_eol(); } { char status[80]; int slen; display_moveto (display_rows-2, 0); display_set_colour(COL_STATUS); sprintf(status, statfmt, (modified ? "**" : " "), filename, (insert_mode ? "(Insert)" : look_mode ? "(LOOK) " : fix_mode ? "(FIX) " : "(Ovrwrt)"), cur_pos, file_size); slen = strlen(status); if (slen > display_cols) slen = display_cols; display_write_chars(status, slen); while (slen++ < display_cols) display_write_str(" "); display_set_colour(COL_BUFFER); } display_moveto (display_rows-1, 0); display_write_str (message); display_clear_to_eol(); message[0] = '\0'; i = cur_pos - top_pos; if (top_pos == 0) i += width - offset; j = (edit_type ? 
(i%width)*3+10+edit_type : (i%width)+13+3*width); if (j >= display_cols) j = display_cols-1; free (linebuf); display_moveto (i/width, j); display_refresh (); } volatile int safe_update, update_required; void update (void); /* * Get a string, in the "minibuffer". Return TRUE on success, FALSE * on break. Possibly syntax-highlight the entered string for * backslash-escapes, depending on the "highlight" parameter. */ int get_str (char *prompt, char *buf, int highlight) { int maxlen = 79 - strlen(prompt); /* limit to 80 - who cares? :) */ int len = 0; int c; for (EVER) { display_moveto (display_rows-1, 0); display_set_colour (COL_MINIBUF); display_write_str (prompt); if (highlight) { char *q, *p = buf, *r = buf+len; while (p=r || !isxdigit ((unsigned char)*p)) display_set_colour(COL_INVALID); else if (p+1>=r || !isxdigit ((unsigned char)p[1])) p++, display_set_colour(COL_INVALID); else p+=2, display_set_colour(COL_ESCAPE); } else { while (p= 32 && c <= 126) { if (len < maxlen) buf[len++] = c; else display_beep(); } if ((c == 127 || c == 8) && len > 0) len--; if (c == 'U'-'@') /* ^U kill line */ len = 0; } } /* * Take a buffer containing possible backslash-escapes, and return * a buffer containing a (binary!) string. Since the string is * binary, it cannot be null terminated: hence the length is * returned from the function. The string is processed in place. * * Escapes are simple: a backslash followed by two hex digits * represents that character; a doubled backslash represents a * backslash itself; a backslash followed by anything else is * invalid. (-1 is returned if an invalid sequence is detected.) 
*/ int parse_quoted (char *buffer) { char *p, *q; p = q = buffer; while (*p) { while (*p && *p != '\\') *q++ = *p++; if (*p == '\\') { p++; if (*p == '\\') *q++ = *p++; else if (p[1] && isxdigit((unsigned char)*p) && isxdigit((unsigned char)p[1])) { char buf[3]; buf[0] = *p++; buf[1] = *p++; buf[2] = '\0'; *q++ = strtol(buf, NULL, 16); } else return -1; } } return q - buffer; } /* * Suspend program. (Or shell out, depending on OS, of course.) */ void suspend(void) { #if defined(unix) && !defined(GO32) done(); raise (SIGTSTP); init(); #elif defined(MSDOS) done(); spawnl (P_WAIT, getenv("COMSPEC"), "", NULL); init(); #else display_beep(); strcpy(message, "Suspend function not yet implemented."); #endif } void update (void) { display_recheck_size(); fix_offset (); draw_scr (); } void schedule_update(void) { if (safe_update) update(); else update_required = TRUE; } fileoffset_t parse_num (char *buffer, int *error) { if (error) *error = FALSE; if (!buffer[strspn(buffer, "0123456789")]) { /* interpret as decimal */ return ATOOFF(buffer); } else if (buffer[0]=='0' && (buffer[1]=='X' || buffer[1]=='x') && !buffer[2+strspn(buffer+2,"0123456789ABCDEFabcdef")]) { return STRTOOFF(buffer+2, NULL, 16); } else if (buffer[0]=='$' && !buffer[1+strspn(buffer+1,"0123456789ABCDEFabcdef")]) { return STRTOOFF(buffer+1, NULL, 16); } else { return 0; if (error) *error = TRUE; } } tweak-3.01/rcfile.c0000644000175300017530000001672710433027777014511 0ustar simonsimon00000000000000#include "tweak.h" #include #include #include #include #if defined(unix) && !defined(GO32) #define RCNAME ".tweakrc" #elif defined(MSDOS) #define RCNAME "tweak.rc" #endif static char *default_rc[] = { "# Default "RCNAME" generated by `tweak -D'.", "#", "# Key bindings: movement keys", "bind top-of-file ^[<", #if defined(unix) && !defined(GO32) "bind page-up ^[[5~", #elif defined(MSDOS) "bind page-up ^@I", "bind page-up ^@/", #endif "bind page-up ^[V", "bind page-up ^[v", "bind move-up ^P", #if defined(unix) && 
!defined(GO32) "bind move-up ^[[A", #elif defined(MSDOS) "bind move-up ^@H", #endif "bind begin-line ^A", #if defined(unix) && !defined(GO32) "bind begin-line ^[[H", "bind begin-line ^[[1~", #elif defined(MSDOS) "bind begin-line ^@G", #endif "bind move-left ^B", #if defined(unix) && !defined(GO32) "bind move-left ^[[D", #elif defined(MSDOS) "bind move-left ^@K", #endif "bind move-right ^F", #if defined(unix) && !defined(GO32) "bind move-right ^[[C", #elif defined(MSDOS) "bind move-right ^@M", #endif "bind end-line ^E", #if defined(unix) && !defined(GO32) "bind end-line ^[Ow", "bind end-line ^[[4~", #elif defined(MSDOS) "bind end-line ^@O", #endif "bind move-down ^N", #if defined(unix) && !defined(GO32) "bind move-down ^[[B", #elif defined(MSDOS) "bind move-down ^@P", #endif "bind page-down ^V", #if defined(unix) && !defined(GO32) "bind page-down ^[[6~", #elif defined(MSDOS) "bind page-down ^@Q", #endif "bind bottom-of-file ^[>", "", "# Key bindings: miscellaneous editing keys", "bind toggle-insert ^X^I", #if defined(unix) && !defined(GO32) "bind toggle-insert ^[[2~", #elif defined(MSDOS) "bind toggle-insert ^@R", #endif "bind change-mode ^M", "bind change-mode ^J", "bind quote-next ^Q", "bind toggle-status ^XH", "bind toggle-status ^Xh", "bind toggle-status ^XX", "bind toggle-status ^Xx", "", "# Key bindings: deletion keys", "bind delete-left ^?", "bind delete-left ^H", "bind delete-right ^D", #if defined(unix) && !defined(GO32) "bind delete-right ^[[3~", #elif defined(MSDOS) "bind delete-right ^@S", #endif "", "# Key bindings: cut and paste keys", #if defined(unix) && !defined(GO32) "bind mark-place ^@", #elif defined(MSDOS) "bind mark-place ^@^C", #endif "bind cut ^W", "bind copy ^[W", "bind copy ^[w", #ifdef MSDOS "bind copy ^@^Q", #endif "bind paste ^Y", "", "# Key bindings: additional movement keys", "bind search ^S", "bind search-back ^R", "bind goto-position ^XG", "bind goto-position ^Xg", "bind screen-recentre ^L", "", "# Standard screen size parameters, 
plus keybindings to alter them", "width 16", "offset 0", "bind new-width ^XW", "bind new-width ^Xw", "bind new-offset ^XO", "bind new-offset ^Xo", "", "# Key bindings: overall program/file control", "bind suspend ^Z", "bind exit ^X^C", "bind save-file ^X^S", "# unbound by default: exit-and-save", "", #ifdef TEST_BUFFER "bind diagnostics ^X^D", "", #endif "# End of default "RCNAME, NULL }; extern char *pname; void read_rc (void) { FILE *fp; char **p, *q, *r, *s, *keyseq; char rcbuffer[256]; char rcname[FILENAME_MAX]; int lineno = 0; int errors = FALSE, errors_here; #if defined(unix) && !defined(GO32) rcname[0] = '\0'; if (getenv("HOME")) strcpy (rcname, getenv("HOME")); strcat (rcname, "/.tweakrc"); #elif defined(MSDOS) /* * Use environment variable TWEAKRC if set. Otherwise, look for * TWEAK.RC in the same directory as TWEAK.EXE, if _that_ exists, * and failing everything else, try C:\TWEAK\TWEAK.RC. */ if (getenv("TWEAKRC")) strcpy (rcname, getenv("TWEAKRC")); else { if ( (q = strrchr(pname, '\\')) != NULL) { FILE *tempfp; strncpy (rcname, pname, q+1-pname); strcpy (rcname+(q+1-pname), "TWEAK.RC"); if ( (tempfp = fopen(rcname, "r")) != NULL) fclose (tempfp); else strcpy (rcname, "C:\\TWEAK\\TWEAK.RC"); } else strcpy (rcname, "C:\\TWEAK\\TWEAK.RC"); } #endif { /* easy keybindings: self inserts */ int i; char c; for (i=32; i<127; i++) { c = i; bind_key (&c, 1, act_self_ins); } } fp = fopen(rcname, "r"); p = default_rc; for (EVER) { if (fp) { if (!fgets(rcbuffer, sizeof(rcbuffer), fp)) { fclose (fp); break; } rcbuffer[strcspn(rcbuffer, "\r\n")] = '\0'; } else { if (!*p) break; strcpy (rcbuffer, *p++); } lineno++; errors_here = FALSE; /* * Now we have a line from the .rc file, wherever it's * really come from. Process it. 
*/ q = rcbuffer; while (*q && isspace((unsigned char)*q)) q++; if (!*q || *q == '#') continue; /* comment or blank line */ r = q; while (*r && !isspace((unsigned char)*r)) r++; if (*r) *r++ = '\0'; /* * Now "q" points to the command word, "r" to the rest of * the line. */ if (!strcmp(q, "bind")) { /* * It's a "bind" directive. The rest of the line should * consist of an action name, then a single whitespace * character, then a key sequence. */ keyact action; while (*r && isspace((unsigned char)*r)) r++; q = r; while (*q && !isspace((unsigned char)*q)) q++; if (*q) *q++ = '\0'; else { fprintf(stderr, "%s: no key sequence after \"bind\" command" " on line %d of "RCNAME, pname, lineno); errors = TRUE; continue; } /* * "r" points to the action name; "q" to the key sequence. */ keyseq = s = q; while (*q) { if (*q == '^') { if (!*++q) { fprintf(stderr, "%s: nothing follows `^' on line %d" " of "RCNAME, pname, lineno); errors = TRUE; errors_here = TRUE; } else { *s++ = *q++ ^ 0x40; } } else if (*q == '\\') { if (!*++q) { fprintf(stderr, "%s: nothing follows `\\' on line %d" " of "RCNAME, pname, lineno); errors = TRUE; errors_here = TRUE; } else if (*q == '\\' || *q == '^') { *s++ = *q++; } else if (isxdigit((unsigned char)*q) && q[1] && isxdigit((unsigned char)q[1])) { char buf[3]; buf[0] = *q++; buf[1] = *q++; buf[2] = '\0'; *s++ = strtol (buf, NULL, 16); } else { fprintf(stderr, "%s: badly formed `\\' sequence on" " line %d of "RCNAME, pname, lineno); errors = TRUE; errors_here = TRUE; } } else *s++ = *q++; } if (errors_here) continue; if (!strcmp(r, "quote-next")) { /* * The "quote next" sequence requires special * treatment. */ int i; for (i=0; i<256; i++) { *s = i; bind_key (keyseq, s-keyseq+1, act_self_ins); } } else if ( (action = parse_action (r)) ) { /* * An ordinary action, requiring ordinary treatment. 
*/ bind_key (keyseq, s-keyseq, action); } else { fprintf(stderr, "%s: unrecognised key action \"%s\"" " at line %d of "RCNAME"\n", pname, r, lineno); errors = TRUE; } } else if (!strcmp(q, "width")) { width = atoi(r); } else if (!strcmp(q, "offset")) { realoffset = atoi(r); } else { fprintf(stderr, "%s: unrecognised "RCNAME" directive \"%s\"" " at line %d of "RCNAME"\n", pname, q, lineno); errors = TRUE; } } if (errors) exit(1); } void write_default_rc (void) { char **p; for (p = default_rc; *p; p++) puts (*p); } tweak-3.01/search.c0000644000175300017530000000171110433027777014475 0ustar simonsimon00000000000000#include "tweak.h" #include #include #include static DFA build_dfa (char *pattern, int len) { int i, j, k, b; char *tmp = malloc(len); DFA dfa = malloc(len * sizeof(*dfa)); if (!dfa) return NULL; if (!tmp) return NULL; memcpy (tmp, pattern, len); for (i=len; i-- ;) { j = i+1; for (b=0; b<256; b++) { dfa[i][b] = 0; if (memchr(pattern, b, len)) { tmp[j-1] = b; for (k=1; k<=j; k++) if (!memcmp(tmp+j-k, pattern, k)) dfa[i][b] = k; } } } return dfa; } Search *build_search(char *pattern, int len) { Search *ret = malloc(sizeof(Search)); char *revpat = malloc(len); int i; ret->len = len; ret->forward = build_dfa(pattern, len); for (i = 0; i < len; i++) revpat[i] = pattern[len-1-i]; ret->reverse = build_dfa(revpat, len); return ret; } void free_search(Search *s) { free(s->forward); free(s->reverse); free(s); } tweak-3.01/slang.c0000644000175300017530000000601210433027777014333 0ustar simonsimon00000000000000#include "tweak.h" #include #include #include #include #include #if defined(unix) && !defined(GO32) #include #include #endif #include #if defined(unix) && !defined(GO32) static int sigwinch (int sigtype) { extern void schedule_update(void); schedule_update(); signal (SIGWINCH, (void *) sigwinch); return 0; } #endif int display_rows, display_cols; void display_beep(void) { SLtt_beep(); } static void get_screen_size (void) { int r = 0, c = 0; #ifdef TIOCGWINSZ 
struct winsize wind_struct; if ((ioctl(1,TIOCGWINSZ,&wind_struct) == 0) || (ioctl(0, TIOCGWINSZ, &wind_struct) == 0) || (ioctl(2, TIOCGWINSZ, &wind_struct) == 0)) { c = (int) wind_struct.ws_col; r = (int) wind_struct.ws_row; } #elif defined(MSDOS) union REGS regs; regs.h.ah = 0x0F; int86 (0x10, ®s, ®s); c = regs.h.ah; regs.x.ax = 0x1130, regs.h.bh = 0; int86 (0x10, ®s, ®s); r = regs.h.dl + 1; #endif if ((r <= 0) || (r > 200)) r = 24; if ((c <= 0) || (c > 250)) c = 80; display_rows = SLtt_Screen_Rows = r; display_cols = SLtt_Screen_Cols = c; } void display_setup(void) { SLtt_get_terminfo(); if (SLang_init_tty (ABORT, 1, 0) == -1) { fprintf(stderr, "tweak: SLang_init_tty: returned error code\n"); exit (1); } SLang_set_abort_signal (NULL); SLtt_Use_Ansi_Colors = TRUE; get_screen_size (); if (SLsmg_init_smg () < 0) { fprintf(stderr, "tweak: SLsmg_init_smg: returned error code\n"); SLang_reset_tty (); exit (1); } #if defined(unix) && !defined(GO32) signal (SIGWINCH, (void *) sigwinch); #endif } void display_cleanup(void) { SLsmg_reset_smg (); SLang_reset_tty (); } void display_moveto(int y, int x) { SLsmg_gotorc(y, x); } void display_refresh(void) { SLsmg_refresh(); } void display_write_str(char *str) { SLsmg_write_nchars(str, strlen(str)); } void display_write_chars(char *str, int len) { SLsmg_write_nchars(str, len); } void display_define_colour(int colour, int fg, int bg, int reverse) { static char *colours[16] = { "black", "red", "green", "brown", "blue", "magenta", "cyan", "lightgray", "gray", "brightred", "brightgreen", "yellow", "brightblue", "brightmagenta", "brightcyan", "white", }; char cname[40]; if (fg < 0 && bg < 0) { /* FIXME: not sure how to support terminal default fg+bg */ fg = 7; bg = 0; } sprintf(cname, "colour%d", colour); SLtt_set_color(colour, cname, colours[fg], colours[bg]); } void display_set_colour(int colour) { SLsmg_set_color(colour); } void display_clear_to_eol(void) { SLsmg_erase_eol(); } int display_getkey(void) { return SLang_getkey(); } 
/*
 * Return the result of SLang_input_pending(0), i.e. ask S-Lang whether
 * keyboard input is waiting, passing 0 as its timeout argument.
 */
int display_input_to_flush(void) { return SLang_input_pending(0); }
/*
 * Reset the S-Lang variables SLKeyBoard_Quit and SLang_Error to 0.
 */
void display_post_error(void) { SLKeyBoard_Quit = 0; SLang_Error = 0; }
/*
 * Re-read the screen size: shut the SLsmg layer down, call
 * get_screen_size(), then initialise SLsmg again. The return value of
 * SLsmg_init_smg() is not checked here.
 */
void display_recheck_size(void) { SLsmg_reset_smg (); get_screen_size (); SLsmg_init_smg (); }
tweak-3.01/btree.h0000644000175300017530000000646310147367204014341 0ustar simonsimon00000000000000/*
 * Flexible B-tree implementation. Supports reference counting for
 * copy-on-write, user-defined node properties, and variable
 * degree.
 *
 * This file is copyright 2001,2004 Simon Tatham.
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL SIMON TATHAM BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
 * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
*/
#ifndef BTREE_H
#define BTREE_H
/* NOTE(review): the header name of this #include is missing in this copy. */
#include /* for offsetof */
/*
 * Fallback alignof: the offset of a `typ` member that follows a single
 * char inside a struct.
 */
#ifndef alignof
#define alignof(typ) ( offsetof(struct { char c; typ t; }, t) )
#endif
/* Opaque tree handle; the structure itself is not defined in this header. */
typedef struct btree btree;
/* Elements are handled as untyped pointers. */
typedef void *bt_element_t;
/*
 * Callback signatures. Each takes a `state` pointer first -- presumably
 * the `state` given to bt_new(); confirm against the implementation.
 */
typedef int (*cmpfn_t)(void *state, bt_element_t, bt_element_t);
typedef bt_element_t (*copyfn_t)(void *state, bt_element_t);
typedef void (*freefn_t)(void *state, bt_element_t);
typedef void (*propmakefn_t)(void *state, bt_element_t, void *dest);
/* s1 may be NULL (indicating copy s2 into dest). s2 is never NULL. */
typedef void (*propmergefn_t)(void *state, void *s1, void *s2, void *dest);
typedef int (*searchfn_t)(void *tstate, void *sstate, int ntrees, void **props, int *counts, bt_element_t *elts, int *is_elt);
/*
 * Relation codes -- presumably the values accepted by the
 * `relation` / `rel` parameters of the find and split functions below.
 */
enum { BT_REL_EQ, BT_REL_LT, BT_REL_LE, BT_REL_GT, BT_REL_GE };
/*
 * Tree API. Only the signatures are visible in this header; see the
 * implementation for the exact semantics of each call.
 */
btree *bt_new(cmpfn_t cmp, copyfn_t copy, freefn_t freeelt, int propsize, int propalign, propmakefn_t propmake, propmergefn_t propmerge, void *state, int mindegree);
void bt_free(btree *bt);
btree *bt_clone(btree *bt);
int bt_count(btree *bt);
bt_element_t bt_index(btree *bt, int index);
bt_element_t bt_index_w(btree *bt, int index);
bt_element_t bt_findrelpos(btree *bt, bt_element_t element, cmpfn_t cmp, int relation, int *index);
bt_element_t bt_findrel(btree *bt, bt_element_t element, cmpfn_t cmp, int relation);
bt_element_t bt_findpos(btree *bt, bt_element_t element, cmpfn_t cmp, int *index);
bt_element_t bt_find(btree *bt, bt_element_t element, cmpfn_t cmp);
bt_element_t bt_propfind(btree *bt, searchfn_t search, void *sstate, int *index);
bt_element_t bt_replace(btree *bt, bt_element_t element, int index);
void bt_addpos(btree *bt, bt_element_t element, int pos);
bt_element_t bt_add(btree *bt, bt_element_t element);
bt_element_t bt_delpos(btree *bt, int pos);
bt_element_t bt_del(btree *bt, bt_element_t element);
btree *bt_join(btree *bt1, btree *bt2);
btree *bt_joinr(btree *bt1, btree *bt2);
btree *bt_splitpos(btree *bt, int index, int before);
btree *bt_split(btree *bt, bt_element_t element, cmpfn_t cmp, int rel);
#endif /* BTREE_H */
tweak-3.01/tweak.h0000644000175300017530000001003410433027777014346 0ustar simonsimon00000000000000
#ifndef TWEAK_TWEAK_H
#define TWEAK_TWEAK_H
/*
 * Unless NO_LARGE_FILES is defined, define _LARGEFILE_SOURCE and
 * _FILE_OFFSET_BITS=64 ahead of the #include further down.
 */
#ifndef NO_LARGE_FILES
#ifndef _LARGEFILE_SOURCE
#define _LARGEFILE_SOURCE
#endif
#ifndef _FILE_OFFSET_BITS
#define _FILE_OFFSET_BITS 64
#endif
#endif
#ifndef FALSE
#define FALSE 0
#endif
#ifndef TRUE
#define TRUE 1
#endif
/* Expands to `;;`, e.g. for writing an endless loop as for (EVER). */
#define EVER ;;
#ifdef MSDOS
#define ABORT 34 /* scan code for ^G */
#else
#define ABORT 7 /* character code for ^G */
#endif
#define VER "3.01" /* version */
#define SEARCH_BLK 65536 /* so can this */
#define SAVE_BLKSIZ 32768 /* and this too */
#define COL_BUFFER 0 /* normal buffer colour */
#define COL_SELECT 1 /* selected-area colour */
#define COL_STATUS 2 /* status-line colour */
#define COL_ESCAPE 3 /* escape sequences in minibuffer */
#define COL_INVALID 4 /* invalid escape sequence in m/b */
#define COL_MINIBUF COL_BUFFER /* these should be the same */
/* Comma-separated runs of 4, 16, 64 and 256 NULLs. */
#define NULL4 NULL, NULL, NULL, NULL
#define NULL16 NULL4, NULL4, NULL4, NULL4
#define NULL64 NULL16,NULL16,NULL16,NULL16
#define NULL256 NULL64,NULL64,NULL64,NULL64
/* NOTE(review): the header name of this #include is missing in this copy. */
#include
/*
 * File-offset type and its helpers: int with atoi/strtol when
 * NO_LARGE_FILES is defined (fseeko/ftello then map to fseek/ftell),
 * otherwise long long with atoll/strtoll. OFF is empty or "ll" --
 * presumably the printf length modifier for fileoffset_t; confirm at
 * the call sites.
 */
#ifdef NO_LARGE_FILES
typedef int fileoffset_t; /* used for all file offsets */
#define OFF
#define ATOOFF atoi
#define STRTOOFF strtol
#define fseeko fseek
#define ftello ftell
#else
typedef long long fileoffset_t; /* used for all file offsets */
#define OFF "ll"
#define ATOOFF atoll
#define STRTOOFF strtoll
#endif
/* Pointer to rows of 256 ints. */
typedef int (*DFA)[256];
/* A pattern length plus two DFA tables, named forward and reverse. */
typedef struct { int len; DFA forward, reverse; } Search;
/* Signature of a key action: no arguments, no result. */
typedef void (*keyact) (void);
/* Opaque editing buffer; the structure itself is not defined in this header. */
typedef struct buffer buffer;
/* Globals shared between the source files. */
extern char toprint[256], hex[256][3], message[80];
extern char decstatus[], hexstatus[], *statfmt;
extern char last_char, *pname, *filename;
extern buffer *filedata, *cutbuffer;
extern int fix_mode, look_mode, insert_mode, edit_type, finished, marking;
extern fileoffset_t file_size, top_pos, cur_pos, mark_point;
extern int scrlines, modified, new_file;
extern fileoffset_t width, offset, realoffset;
extern int ascii_enabled;
#ifdef unix
extern volatile int safe_update, update_required;
extern void update (void);
#endif
extern void fix_offset(void);
extern fileoffset_t parse_num (char *buffer, int *error);
extern void draw_scr (void);
extern int backup_file (void);
extern int save_file (void);
extern void act_self_ins (void);
extern keyact parse_action (char *);
extern void proc_key (void);
extern void bind_key (char *, int, keyact);
extern Search *build_search (char *, int);
void free_search(Search *s);
extern int get_str (char *, char *, int);
extern int parse_quoted (char *);
extern void suspend (void);
extern void read_rc (void);
extern void write_default_rc (void);
/* Buffer operations; positions and lengths are fileoffset_t except the int data lengths. */
extern buffer *buf_new_empty(void);
extern buffer *buf_new_from_file(FILE *fp);
extern void buf_free(buffer *buf);
extern void buf_insert_data(buffer *buf, void *data, int len, fileoffset_t pos);
extern void buf_fetch_data(buffer *buf, void *data, int len, fileoffset_t pos);
extern void buf_overwrite_data(buffer *buf, void *data, int len, fileoffset_t pos);
extern void buf_delete(buffer *buf, fileoffset_t len, fileoffset_t pos);
extern buffer *buf_cut(buffer *buf, fileoffset_t len, fileoffset_t pos);
extern buffer *buf_copy(buffer *buf, fileoffset_t len, fileoffset_t pos);
extern void buf_paste(buffer *buf, buffer *cutbuffer, fileoffset_t pos);
extern fileoffset_t buf_length(buffer *buf);
/* Display layer interface. */
extern void display_setup(void);
extern void display_cleanup(void);
extern void display_beep(void);
extern int display_rows, display_cols;
extern void display_moveto(int y, int x);
extern void display_refresh(void);
extern void display_write_str(char *str);
extern void display_write_chars(char *str, int len);
extern void display_define_colour(int colour, int fg, int bg, int reverse);
extern void display_set_colour(int colour);
extern void display_clear_to_eol(void);
extern int display_getkey(void);
extern int display_input_to_flush(void);
extern void display_post_error(void);
extern
void display_recheck_size(void); #endif /* TWEAK_TWEAK_H */ tweak-3.01/btree.but0000644000175300017530000007741710433027777014721 0ustar simonsimon00000000000000\cfg{html-leaf-level}{0} \cfg{chapter}{Section} \cfg{text-title-align}{left} \cfg{text-indent}{0} \cfg{text-chapter-numeric}{yes} \cfg{text-chapter-suffix}{. } \cfg{text-chapter-underline}{-} \cfg{text-section-numeric}{0}{yes} \cfg{text-section-suffix}{0}{. } \cfg{text-section-underline}{0}{-} \cfg{html-chapter-numeric}{yes} \cfg{html-chapter-suffix}{. } \cfg{html-section-numeric}{0}{yes} \cfg{html-section-suffix}{0}{. } \cfg{html-section-numeric}{1}{yes} \cfg{html-section-suffix}{1}{. } \title An Efficient Data Structure For A Hex Editor by \W{http://pobox.com/~anakin/}{Simon Tatham} \C{intro} Introduction Hex editors have been around for a long time, and at the very basic level they are very simple to write. Since they are mostly used for editing files such as executables, which contain a lot of cross-references to particular byte positions in the file, a hex editor need not have an insert mode in order to be useful. And a hex editor without an insert mode is very easy to implement: you simply allocate a large enough array for the input file, and use that as your data structure. The only operation you really need to be able to do efficiently is to jump to a particular byte position, and that's precisely what an array makes easy. On the other hand, an insert mode can be useful in other circumstances. Not \e{all} types of file you might want to edit have the same restrictions as an executable. And as soon as you want your hex editor to have an insert mode, the data structure question becomes much more interesting. In this article I present an efficient and scalable data structure which supports all the operations needed by a hex editor. \C{simple} Simple options One technique used to support insert mode in editors is to use an array larger than the file size, with a gap in it. 
The file contents up to the current cursor position are stored at the start of the array; the file contents from the current cursor position to the end are stored at the end of the array; and the gap in the middle moves about as the cursor does. This makes insertion easy. When the user inserts an extra character, you just add it to one end or other of the gap. On the other hand, \e{moving} through the file now becomes a slow operation; it's not noticeable when you're moving by a byte, by a line, or even by a screenful at a time, but as soon as you try to jump to the start or end of the file, or jump to a particular specified file offset, suddenly the editor has to bodily shift enormous amounts of file data from one end of the gap to the other. Another slightly better option is to use a linked list of small arrays, and to let the arrays vary in size between K and 2K bytes, for some fixed minimum block size K. Inserting a single byte in the middle of a block doesn't cost too much; occasionally the block will grow beyond size 2K and have to be split into two smaller ones, but even that isn't too slow. Jumping to a particular position, however, is still an O(N) operation using this structure. In practice it isn't \e{too} bad, since the length of the linked list is at worst 1/K times the size of the file; but once the file size becomes seriously big, this approach does not scale well. The common problem in both these methods is that as soon as you make insertion a constant-time operation, seeking to a given byte position becomes linear-time. Whereas in the original array format, of course, seeking was constant-time but \e{insertion} became linear-time. \C{trees} Using balanced trees This is where trees come in. Balanced tree structures (any of AVL trees, red-black trees and B-trees) all solve this sort of problem for sorted lists. You can insert an element into a balanced tree in \e{log} time, and you can search for a particular element in log time as well. 
This sounds like the kind of compromise we want: if making insertion constant-time forces seeking to be linear and vice versa, we would prefer to arrange for \e{both} to be log-time. The conventional use of a balanced tree to store a sorted list, however, is not immediately helpful to us. The only criterion we could reasonably sort on would be byte position in the file; and as soon as we store our data as a set of (position, data) pairs, we're back to insertion being linear again, because we would have to alter the position field of every tree element after the insertion point. Is there anything we can do to our balanced trees to make this work better? \C{counted-trees} Counted trees Yes, there is. Suppose you add an additional field to every node of a balanced tree. In that field, you store a count of the number of elements \e{in or below} that node. Operations which alter the tree (insertion and deletion) now have to make sure these counts remain accurate. This can be done without sacrificing the log-time characteristics of the operations. For example, when you add an element, you increment the count of the node containing it, and then work back up the tree to the root incrementing the counts in all the nodes you go past. Since the height of the tree is O(log N), this only takes you O(log N) time. So we can add counts to a tree and still maintain it efficiently. What have the counts bought us? Once we have counts in a tree, they introduce an entirely new way to \e{search} the tree. Starting at the root, we can search down the tree by examining the count fields rather than comparing elements as usual; and this allows us to find the Nth item in the tree, for any N, in a single log-time search. For example, suppose the root tree node contains a child with count 54, then an actual element, then a child with count 73. Then: \b If you are trying to get to a position less than 54, then you descend straight to the leftmost child. 
\b If you are trying to get to \e{exactly} position 54, you return the element out of the root node. \b If you are trying to get to position 55 or greater, you descend to the rightmost child, and subtract 55 from your desired position. (If you want element 57 of the tree, then you know there are 55 elements in the tree before the right-hand subtree, so you know you want element 2 of the right-hand subtree.) So now we have a means of finding the Nth item in a tree in a log-time search. This is starting to look promising. The trouble is, we're still stuck with having some sort of sorting order on the tree. Now we need to deal with that. \C{unsorted-trees} Unsorted trees The simple answer to the sorting problem is to do away with sorting the tree at all! Conventional balanced trees have a sorting order because it's used to find elements in the tree, and to know where to add an element. But we don't need a sorting order to find things any more, because we can use a count-based search to jump to the Nth position. Can we also use counts during the tree add operation, to allow us to specify \e{where} we want to add our new element? We can. Tree add algorithms start by searching down the tree to find the position where the new element will be inserted. If we do this search using counts, in exactly the same way described in \k{counted-trees}, then we can add any element we like at any position in the tree. Once we do this, of course, we have to throw out the sorting order completely, and never do another order-based search or insertion again, because they won't work. But that's OK, because we didn't need them anyway. Now we have exactly what we were after in the first place. We have a data structure which stores an unordered list of items, in such a way that we can insert or delete an item in log time \e{and} find the Nth element in log time. \C{splitjoin} Splitting and joining trees Now we can begin to get more ambitious. 
One issue we have not addressed yet is cut and paste. So far I have discussed tree insertion on the assumption that you only ever insert one character at a time into your tree. In fact hex editors need cut and paste just as much as normal text editors do; so we must think about how to insert or remove a larger block of data at a time. One obvious way is to process each byte individually. A ten-byte cut operation is ten individual deletions, and a ten-byte paste is ten individual insertions. This is fine if you only ever use cut and paste to move tiny chunks of data around a large file, but if you need to move \e{half the file} from one place to another, things get more interesting. The linked-list structure discussed in \k{simple} would have helped a lot with this problem. Linked lists don't just make it easy to insert or delete one item: they make it just as easy to unlink an enormous chunk of a list once you've found both ends of the chunk, and you can link that chunk in somewhere else easily as well. It turns out that you \e{can} do the same thing with balanced trees. At this point it starts to make a difference what kind of balanced tree you use: all three of AVL, red-black and B-trees support these operations, but the precise methods vary between them. I'm going to use B-trees from here on, because the algorithms are slightly simpler. What we need are two basic operations. Given a counted, unsorted B-tree containing an unordered list of items, we need to be able to: \b Split the tree down the middle, giving two valid B-trees as output. \b Take two valid B-trees and join them together end-to-end, giving one B-tree containing all the data from tree A followed by the data from tree B. This will provide all the operations we need. 
To unlink a large section from the middle of a tree, we split it in two places and then join the outer two parts back together; to link a large section \e{into} the middle of a tree, we split it at the insertion point, join the left half on to the left side of the inserted section, and join the right half on to the right side of the inserted section. \H{joining} Joining two B-trees together When you add an element to a B-tree, sometimes it ends up increasing the size of a leaf node beyond the size limit. When that happens, you deal with it by splitting the node in two, and transforming the parent node so that where it previously had a single child pointer, it now has two child pointers with an element between them. If that makes the parent node too big as well, you do the same thing again, and so on until you reach the tree root. Joining two B-trees is therefore reasonably simple, \e{if} you have an additional separating element to place in between them. Position the two trees so that their leaf nodes are at the same level; now (usually) one tree will be shorter than the other. So you can add the root of the shorter tree as a sibling of the node next to it in the taller tree; their common parent gains one extra child pointer (pointing at the root of the shorter tree), separated from its neighbour by the additional separating element. If this causes the node to increase beyond the maximum size, just split it in two and propagate up to its parent, just as in the ordinary insertion process. If the trees were originally the same height, just combine their root nodes into a single larger root node. You need an extra element to go in between the rightmost child pointer of the left-hand root node, and the leftmost child pointer of the right-hand root node; and again, this is where your separating element comes in. Again, if the new root is too big to be a single node, split it in two and create a new root above it. 
So it turns out that it's very easy to join two trees together, but the algorithm requires a spare element to go in the middle. However, we normally don't have such a spare element: we just have two trees. This is easily solved, though: we simply start by removing the leftmost element of the right-hand tree using the ordinary tree deletion algorithm. Then we just do the join algorithm, as described above, using the element we just removed as our separator. \H{splitting} Splitting a B-tree in two To split a B-tree in two: we are given a tree, and a means of searching down the tree to find the split point. (In this application, that will be a numeric position, which we check against the node counts on the way down; in other situations, we might perfectly well want to split an ordinary \e{sorted} B-tree in half, so we might have an ordering-based search criterion. It makes no difference.) We start in the simplest possible way. Start at the root node; decide which of its subtree pointers you are going to descend down; and saw the node in half at that subtree pointer. The two half-nodes thus created will \e{each} need a subtree pointer to go on the cut end, but that's OK because we're about to saw the next node down in half as well and they can have half each. So descend to the next node, decide on a split point again, saw that node in half, and put a pointer to each half in the two halves of the parent node. Once we finish this searching-and-cutting pass, we will have successfully separated our tree into two parts at the required point. However, the result will almost certainly not be a pair of \e{valid} B-trees; the chances are that many of the nodes on the cut edges will be below the minimum allowed node size. In fact, if at any point our search criterion made us descend through the \e{endmost} subtree pointer in any node, some of those nodes will have no elements in them whatsoever, just a single subtree pointer! 
So now we must make a healing pass down the cut edge of each tree, to turn it back into a valid B-tree. We can start by throwing away the root node if it has nothing but a single subtree pointer (which will happen quite often if we split near one end of the original tree, since in that case the output trees will almost certainly need to be of different heights). Keep doing that until we find a real root node. One child of that node is on the cut edge, so it may be below the minimum size. If it is, we solve this using its (valid) neighbour node. If the neighbour is large, we can move some subtrees over into the undersized node to make two correctly sized nodes; if the neighbour is too small and does not have that many subtrees to spare, we can instead \e{combine} the undersized node with its neighbour. (And it turns out you can always do at least one of these: if the neighbour is too large to combine with the undersized node, then it \e{must} have enough subtrees for redistribution to give two viable nodes.) The only interesting case is that combining an undersized node with its neighbour reduces the number of subtrees of their common parent by one. Therefore: \b As we go down, we arrange for each node on the cut edge to be at least \e{one more than} minimum size, in case its size must drop by one when we process its child. (This still just about works in all cases.) \b If the first non-trivial root node had only two children (recall that the root node in a B-tree is the only node exempt from the minimum size limit), and those two children end up having to be combined, then the root node must be thrown away again and the combined node is the new root. Once we have sorted out each node, we descend to its child on the cut edge, and do the same thing again. Eventually we reach the bottom of the tree and every node is of valid size. Then we do the same thing to the cut edge of the other tree, and we're done. 
\C{copy-on-write} Cloning trees The splitting and joining algorithms look as if they make cut and paste pretty much trivial. You can split a big chunk out of your editing buffer into a separate cut buffer easily enough; and then you can \q{paste} it somewhere else by joining it back into the middle of the editing buffer at a different position. However, in real life, cut and paste isn't that simple. People often want to paste the same data more than once; so you can't just link the cut buffer straight into the editing buffer, because then you don't still have it to link in again next time. You need to \e{copy} the cut buffer and link in the copy. Equally, users often want to press Copy rather than Cut, in which case you have to split the buffer tree in two places, \e{copy} the middle section, and join all three back together. Copying a tree, it would seem, is inherently an O(N) operation; there's no way you can copy a tree containing megabytes of data without actually copying all that data. Or is there? It turns out that we \e{can} do better than this, by adding another annotation field to each tree node. This time, the annotation is a \e{reference count}: it counts the number of pointers to the node, either from other tree nodes or from the \q{root} field in a tree header structure. To begin with, of course, all reference counts are 1. Reference counts are normally used for garbage collection. In this case, though, I'm going to use them to implement \e{copy-on-write}. All of the tree-altering algorithms (insertion and deletion, plus the split and join algorithms described above) will now check the reference count of a node before attempting to modify it. If they find that they need to modify a node with a reference count greater than one, they will not modify it. Instead, they will make a copy of that node, and use the copy in place of the original. 
The copy links to all the same child nodes as the original, so the reference count in each child must be incremented; and the copied node's parent (or tree header structure) now links to the copy rather than to the original, so the reference count in the original must be decremented. Now we are looking at a node with a reference count of 1, which means nobody else is using it so we can modify it safely. The effect of this is that it is now a trivial - not merely log-time but \e{constant}-time - operation to \e{clone} an entire B-tree, no matter how large. We simply create a new tree header structure; we point its root field at the root node of the input tree; and we increment the reference count on that root node. Once we have cloned a tree like this, we can treat the original and the clone as if they were entirely independent. If you add an element to one of them, for example, then a single string of nodes from the root down to one leaf will be duplicated and modified, but the rest of the trees will still be held in common. You can split either tree into lots of little pieces, or join it into the middle of a larger one, and never affect the data stored in what was once its clone, because every time you touch a node that the other tree is depending on, you make your own copy rather than disturbing it. This allows us to support \e{really} efficient cut and paste in our hex editor. You select a 200Mb chunk and press Copy; the buffer tree is split in two places (in log time), the middle section is cloned (instantly), and the tree is joined back together. You'd hardly know anything was different - but the cut buffer now contains a clone of \e{part} of the original buffer, most of which consists of nodes that are still shared with it. And you can paste in as many copies as you like of that chunk, still in no worse than O(log N) time. 
The best bit is that by the time you've done this a few times and have a file that's 1600Mb longer than it started out, the hex editor isn't actually using up 1600Mb more memory, because most of it is in shared nodes! This technique naturally provides a form of compression as well as being fast. \C{lazy-loading} Lazy file loading In all of the above I have been tacitly assuming that the data elements stored in my tree are individual bytes. This would be hideously inefficient if I were using AVL or red-black trees, in which each node contains precisely one element: for every \e{byte} of the file being edited, there would be an overhead of two child pointers, a byte count and a reference count. On a normal 32-bit machine, that's 20 bytes per node, not counting overhead from the memory allocator. A factor of twenty is just ridiculous. B-trees are a bit more flexible, since they can be made to have a large minimum degree. A B-tree with a minimum node size of (say) 512 can contain up to 1023 bytes of data plus 1024 subtree pointers, and those 1023 bytes can be packed together in memory so the overhead is now more like a factor of five. Also, since no node in a B-tree ever changes its height above ground level, you can just not bother to allocate space for the 512 NULL child pointers in your leaf nodes, and since the vast majority of your nodes will \e{be} leaf nodes, the structure is now closer to being space-efficient. There are other improvements one could make. For example, there's no reason why a B-tree really needs to have the \e{same} minimum node degree at every level; so you could have low-degree nodes everywhere above the leaf level, and enormous leaf nodes containing 4-8Kb of file data. You could move to B+ trees in which no actual data elements were stored anywhere except in the leaf nodes, thus saving the tiny alignment overheads in the other nodes. However, there's a better direction to head in. 
In \k{simple} I mentioned the idea of using a linked list as the main data structure, and I said that each element of the linked list would be a smallish array of file bytes (between size K and 2K). There's no reason we couldn't do that in our B-tree-based approach: each element stored in the B-tree is no longer a single byte but a small block of bytes. It would mean that our element counts no longer allowed us to jump to the Nth byte, only to the Nth \e{block}; but we can fix that by replacing the element count with a byte count, summing the total \e{size} of all the blocks in or below a particular tree node. Now, given any byte position, we can do a single log-time search and return a data block plus an offset within that block. This technique adds work to all operations. Inserting a byte, for example, is now done by finding the block it needs to go into, inserting it in that block, and potentially splitting the block into two and doing an extra tree operation. Splitting and joining buffers involves splitting and joining blocks at each end, and checking to make sure undersized blocks are not created. So what does this technique buy us, that makes it worthwhile over just storing single bytes in the B-tree? The answer is: once we have a block data structure as our tree element, we can start having different \e{types} of block. In particular, we can have a type of block which is a placeholder, containing nothing but a file offset and length. A block of this type indicates \q{at this point in the tree we have N bytes from position P in the original file}. Blocks of this type are exempt from the normal maximum size for normal literal-data blocks. The effect of this is that we no longer need to read the entire file into memory when we start up. Instead, we just initialise our tree trivially, so that it contains nothing but a single placeholder block, with offset zero and size equal to the initial file size. 
Now whenever we need to read data from the tree, and it turns out the data in question is somewhere in a placeholder block, we must refer back to the original input file in order to find the data (and the placeholder block will tell us where in the file to read it from). So before we do any editing, our hex editor is suddenly a low-cost hex \e{file viewer}, which just pages back and forth and refers to the disk all the time. But as soon as we start altering parts of the file, the placeholder block gets broken up into smaller blocks, and literal-data blocks begin to be created in between them. If we cut and paste a section including a placeholder block, then the tree can end up containing placeholder blocks in a strange order; it might (for example) indicate something like \q{the first 192K of the input file; then the literal bytes 5B 49 A7; then 25K of the input file starting from position 12345; then 512K of the input file starting from position 204325}. Now the hex editor \e{looks} as if it's doing exactly the same thing as it did to begin with. I can page around the original file; I can insert, delete, overwrite, cut, copy and paste to my heart's content, and (provided no other process modifies the original file under our feet) the data I am manipulating will remain consistent at all times with the editing operations I have performed. But there wasn't a big delay at startup when the file was loaded in, because most of it \e{wasn't} loaded in; and if I list the running processes on my system, the hex editor will not be using memory proportional to the size of the file. It will only be using memory proportional to the \e{changes} I've made to the file. When I save the file, if there are any placeholder blocks remaining in the buffer tree, the hex editor must write out the new version by referring to the original. This is the \e{only} remaining operation, apart from searching, that takes time proportional to the size of the file. 
And there are \e{no} remaining operations which take \e{memory} proportional to the size of the file. (There is one thing you need to be careful of. Literal data blocks must be permitted to fall below the minimum size K if there is no literal block next to them to merge with; in particular, this is vital if you are writing a binary file from scratch or you would never be able to give it a size between zero and K. But this raises the possibility that given a pathological sequence of editing operations, your data structure might end up being an interleaving of one-byte literal blocks and one-byte placeholder blocks, giving a huge space overhead. The simplest solution to this is to impose a minimum size of 2K on \e{placeholder} blocks, below which you read the relevant piece of file data and convert them into literal blocks; then they can be merged with adjacent blocks and the worst case is no longer terrible.) We now have a data structure which does pretty much everything you could reasonably ask a hex editor to be able to do, and does it all at a reasonable memory cost and (apart from the two genuinely necessary operations of searching and saving) \e{all} in O(log N) time. \C{further} Further directions The data structure as I have presented it is suitable for use in a high-performance hex editor with an insert mode. There are a couple more points worth noting. \H{further-texted} Conventional text editing This structure would need only minor modifications to be an efficient basis for a conventional text editor. In order to do this, you would need to be able to jump quickly to a particular \e{line} of the file, which means you'd need a node annotation counting newlines. In fact, it's possible to do slightly better than that: we can devise a more complex node annotation which tracks the effect of an arbitrary byte sequence on the (line, column) position. 
Assuming that a physical tab character always advances the cursor to the next multiple of 8 spaces, there are three possibilities: \b A sequence of bytes containing no newlines or tabs simply adds some number A to the column number, and does not affect the line number. \b A sequence of bytes containing no newlines but at least one tab has the overall effect of adding some number A to the column, and rounding it up to the next number that is congruent to B mod 8. \b A sequence of bytes containing at least one newline has the effect of adding some number A to the \e{line} number, and setting the column number to a fixed value B. These three function schemas are closed under composition (i.e. combining any two of them gives another one). Storing one in each node of a buffer tree would provide the ability to search directly to \e{a particular (line, column) position} in a single log-time search. So the text editor could treat its buffer as a simple sequence of bytes (or possibly of Unicode characters). This is superior to treating the buffer as a sequence of lines, because it removes the distinction between inserting \e{within} a line and inserting data \e{between} lines. In particular, cut and paste in a line-based model is fiddly because lines must be spliced together at each end of the pasted region; but cut and paste in this model is as trivial as it was in the hex editor - you just cut a sequence of bytes, paste it somewhere else, and the line/column indexing automatically keeps up no matter what you do. The only snag is that if you did this, you would probably no longer be able to do the trick with placeholder blocks and lazy file loading; a text editor tends to need to know in advance where all the newlines are in its buffer, so there would probably be no alternative to physically loading the file. But in that, at least, this data structure is no worse than any other. 
\H{undo} Supporting undo An undo function in an editor \e{conceptually} stores a sequence of previous buffer states, and allows you to return to one of them when you need to. Usually, this is not actually implemented by storing copies of the entire buffer, since that would be ludicrously wasteful of space! Instead, a journal of changes is kept which allows previous buffer states to be \e{reconstructed} by reversing the precise changes made. One could do that using this data structure, if one wanted to. However, there's another intriguing option. Since cloning an arbitrarily large tree is a cheap operation, you could implement undo by \e{actually} storing a sequence of clones of previous buffer states! The cost of this would be nothing like as bad as it would na\u00EF{i}vely appear. It might still not be ideal, though. Every time you clone a tree and the two clones diverge, several nodes must be copied, and if each node contains several blocks of literal data then the cost of maintaining too many buffer clones might still become prohibitive. But it's an interesting possibility regardless. \C{summary} Summary I've presented a design for a data structure which implements practically every operation required for a hex editor in O(log N) time, apart from one or two which genuinely \e{need} to be O(N). The structure is: \b A B-tree, each of whose elements is either a small array of literal data bytes, or a placeholder block denoting a section of the unmodified input file. \b Each B-tree node is annotated with the total byte count of all the elements in or below that node, allowing a log-time search to pinpoint any numeric byte position. \b Those counts provide the only necessary means of navigating the tree, so there is no need for a sorting criterion. \b Split and join algorithms make it possible to link and unlink large chunks from the middle of a buffer at a time. \b Reference counts implementing copy-on-write allow cloning of chunks in constant time. 
As a result: \b Inserting or deleting bytes in the file is a log-time operation. \b Finding a particular byte position is a log-time operation. \b Cut and paste is always log-time, no matter how large or complex the chunk of data being moved around. \b Memory usage grows proportionally to the \e{changes} made to the file, not the overall file size. (However, memory usage is also \e{bounded} by a value proportional to the file size, even if you keep editing and re-editing for ever.) Searching must still be linear (there's no alternative to actually reading the data if you need to know anything about its contents), and saving the modified output file is linear (because you actually must physically write out that much data), but \e{everything} else can be done in log time. I've also sketched a means of converting this into a data structure for an ordinary text editor, and suggested interesting implications in the area of undo operations. \C{ref} References Donald Knuth's \q{The Art of Computer Programming} (\W{http://en.wikipedia.org/w/wiki.phtml?title=Special:Booksources&isbn=0201485419}{Addison-Wesley, ISBN 0201485419}) presents at least some of the same ideas as this article. Counted and unsorted trees are mentioned in volume 3; splitting and joining are also described (although Knuth does them on AVL trees, which are significantly more fiddly to split than B-trees; you have to cut the tree into lots of little pieces, and then put them all back together by using the join algorithm repeatedly). \q{Tweak}, a hex editor implementing this data structure, can be downloaded at \W{http://www.chiark.greenend.org.uk/~sgtatham/tweak/}\cw{http://www.chiark.greenend.org.uk/~sgtatham/tweak/}. 
\versionid $Id: btree.but 4828 2004-11-19 18:48:59Z simon $ tweak-3.01/manpage.but0000644000175300017530000002544110433027777015216 0ustar simonsimon00000000000000\cfg{man-identity}{tweak}{1}{2004-11-05}{Simon Tatham}{Simon Tatham} \cfg{man-mindepth}{1} \C{tweak-manpage} Man page for \cw{tweak} \H{tweak-manpage-name} NAME \cw{tweak} - efficient hex editor \H{tweak-manpage-synopsis} SYNOPSIS \c tweak [-l | -f] [-e] [-w width] [-o offset] filename \e bbbbb bb bb bb bb iiiii bb iiiiii iiiiiiii \H{tweak-manpage-description} DESCRIPTION \cw{tweak} is a hex editor. It allows you to edit a file at very low level, letting you see the full and exact binary contents of the file. It can be useful for modifying binary files such as executables, editing disk or CD images, debugging programs that generate binary file formats incorrectly, and many other things. Unlike simpler hex editors, \cw{tweak} possesses a fully functional insert mode. This is not useful when editing many of the types of file described above, but can be useful in other situations. Also, an insert mode makes it easy to use \cw{tweak} to construct new files from scratch. When you open a file in \cw{tweak}, you can expect to see the screen contents looking something like this: \c 00000000 7F 45 4C 46 01 01 01 00 .ELF.... \c 00000008 00 00 00 00 00 00 00 00 ........ \c 00000010 02 00 03 00 01 00 00 00 ........ \c 00000018 D0 8E 04 08 34 00 00 00 ....4... \c 00000020 2C EF 01 00 00 00 00 00 ,....... The central column shows you the hexadecimal value of each byte in the file you are editing. The column on the right shows the ASCII interpretation of those bytes, where applicable. In the example above, the sequence \c{45 4C 46} on the first line translates into the ASCII upper-case letters \q{ELF}, but the subsequent sequence \c{01 01 01 00} does not have any printable ASCII representation and so the right-hand column simply prints dots. 
The column on the left shows the position within the file of the start of each row. In fact, when you start \cw{tweak}, you will usually see 16 bytes of the file per row, not 8 as shown above. However, this is configurable if your screen is narrower - or wider - than the usual 80 columns, or if the file you are editing consists of fixed-size records of some other size. By default, \cw{tweak} does not load its entire input file into memory. Instead, it loads it \e{lazily}, reading from the file on disk when you request a view of a part of the file it doesn't have stored. When you modify the file, it stores your modifications in memory, but continues to refer to the original disk file for the parts you have not touched. This means you can edit extremely large files (for example, entire CD images) without difficulty; opening such a file is instantaneous, making modifications causes \cw{tweak}'s memory usage to grow with the size of the changes rather than the size of the whole file, and only when saving the altered version will \cw{tweak} have to read through the entire input file to write the output. However, this mode of operation has a disadvantage, which is that if the input file is modified by another program while \cw{tweak} is running, \cw{tweak}'s internal data structures will not be sufficient to keep track, and it is likely that the file written out will contain a mixture of the old and new contents of the input file. Therefore, you can disable this lazy loading if you need to; see the \cw{-e} option below. \H{tweak-manpage-options} OPTIONS This section lists the command-line options supported by \cw{tweak}. \dt \cw{-f} \dd Runs \cw{tweak} in \q{fix} mode, i.e. with the insert function entirely disabled. 
This might be useful if you are editing a file in which the insert mode is of no use (executables, for example, tend to have strong dependencies on precise file offsets which make it almost impossible to insert data in one without rendering it unusable) and you want to avoid turning it on by mistake. \dt \cw{-l} \dd Runs \cw{tweak} in \q{look} mode. In this mode \cw{tweak} does not allow you to modify the data at all; it becomes simply a tool for examining a file in detail. \dt \cw{-e} \dd Runs \cw{tweak} in \q{eager} mode. In this mode \cw{tweak} will read its entire input file when starting up. This causes it to take up more memory, but means that it has no dependency on the input file remaining unmodified, and other programs can alter it if they need to without causing trouble. \dt \cw{-w} \e{width} \dd Specifies the number of bytes \cw{tweak} will display per line. The default is 16, which fits neatly in an 80-column screen. \dt \cw{-o} \e{offset} \dd If this option is specified, \cw{tweak} will ensure that the given file offset occurs at the start of a line. For example, if you loaded a file using the options \cw{-w 8 -o 0x13}, you might see a display a bit like this: \lcont{ \c 00000000 7F 45 4C .EL \c 00000003 46 01 01 01 00 00 00 00 F....... \c 0000000B 00 00 00 00 00 02 00 03 ........ \c 00000013 00 01 00 00 00 D0 8E 04 ........ \c 0000001B 08 34 00 00 00 2C EF 01 .4...,.. By putting only three bytes of the file on the very first line, \cw{tweak} has arranged that the file offset 0x13 (19 in decimal) appears at the beginning of the fourth line. You might use this option if you knew you were editing a file in a particular format. For example, if your file contained a 53-byte header followed by a series of 22-byte records, you might find it useful to specify the options \cw{-w 22 -o 53}. This would arrange that after the header, each individual record of the file would appear on precisely one line of \cw{tweak}'s display. 
} \dt \cw{-D} \dd If this option is specified, \cw{tweak} will not attempt to load and edit a file at all, but will simply produce its default \cw{.tweakrc} file on standard output. This is a useful way to give yourself a starting point if you want to begin reconfiguring \cw{tweak}'s keyboard layout. \H{tweak-manpage-keys} KEYS This section describes all the editing keys supported by \cw{tweak} by default. The default key bindings for \cw{tweak} are basically Emacs-like. \S{tweak-manpage-keys-movement} Movement keys The Emacs cursor movement keys should all work, and their counterparts in ordinary function keys ought to work too: \b \cw{^P} and \cw{^N} go to the previous and next lines; Up and Down should do the same. \b \cw{^B} and \cw{^F} go back and forward one character; Left and Right should do the same. \b \cw{M-v} and \cw{^V} go up and down one screenful at a time; Page Up and Page Down should do the same. \b \cw{^A} and \cw{^E} go to the beginning and end of the line; Home and End should do the same. Press \cw{M-<} and \cw{M->} to go to the beginning and end of the file. Press \cw{^X g} to go to a particular byte position in the file; you will be asked to type in the position you want. You can enter it in decimal, or as a hex number with \cq{0x} before it. \S{tweak-manpage-keys-editing} Editing keys Press Return to move the cursor between the hex section of the screen and the ASCII section. When in the hex section, you can enter hexadecimal digits to alter data; when in the ASCII section, you can directly type ASCII text. In ASCII mode, you can also press \cw{^Q} to literally quote the next input character; for example, if you want to insert a Control-V, you can press \cw{^Q^V} and \cw{tweak} will automatically insert the byte value 0x16. Press \cw{^X^I}, or the Insert key if you have one, to toggle between overwrite mode and insert mode. In insert mode, typing hex or ASCII input will insert new bytes containing the values you provide. 
Also, you can then press Backspace to delete the byte to the left of the cursor, or \cw{^D} or Delete to delete the byte under the cursor. \S{tweak-manpage-keys-cnp} Cut and paste Press \cw{^@} (this character may be generated by the key combination Control-@, or Control-2, or Control-Space) to mark the end of a selection region. After you do this, the bytes between that mark and the cursor will be highlighted. Press \cw{^@} again to abandon the selection. Press \cw{M-w} while a selection is active to copy the selected region into \cw{tweak}'s cut buffer. In insert mode, you also have the option of pressing \cw{^W} to \e{cut} the selected region completely out of the file and place it in the cut buffer. Finally, press \cw{^Y} to paste the cut buffer contents back into the file (this will overwrite or insert depending on the current mode). \S{tweak-manpage-keys-search} Searching Press \cw{^S} to search for a byte sequence. You will be asked to enter some text to search for on the bottom line of the screen. You can type this text in ASCII, or as a sequence of hex byte values prefixed with backslashes (\cw{\\}). For example, if you wanted to search for the byte value 5 followed by the word \q{hello}, you might enter \cw{\\05hello}. If you want to specify a literal backslash character, you can either enter it in hex (as \cw{\\5C}), or simply double it on input (\cw{\\\\}). Press \cw{^R} to search backwards instead of forwards from the current cursor position. Since \cw{tweak} deals in pure binary data, searches are always case-sensitive. \S{tweak-manpage-keys-display} Controlling the display If you press \cw{^X w}, you will be asked to enter a new display width. This has the same effect as passing the \cw{-w} option on the command line. Similarly, pressing \cw{^X o} allows you to enter a new display offset, equivalent to the \cw{-o} option. By default, the current file position and file size are displayed on \cw{tweak}'s status line in hex. 
If you prefer them in decimal, you can press \cw{^X x} or \cw{^X h} to toggle them between hex and decimal. \S{tweak-manpage-keys-misc} Miscellaneous Press \cw{^L} to redraw the screen and recentre the cursor. Press \cw{^Z} to suspend \cw{tweak} and return temporarily to the shell. Press \cw{^X^S} to save the file you are editing. Press \cw{^X^C} to exit \cw{tweak}. (If you do this with changes unsaved, you will be asked whether you want to save them.) \H{tweak-manpage-cfg} CONFIGURATION FILE \cw{tweak}'s keyboard bindings are configurable. It will attempt to read a file from your home directory called \cw{.tweakrc}, and if it finds one it will use the keyboard bindings described in it. If it does not find one, it will use its internal default bindings. Most of the directives in \cw{.tweakrc} are of the form \cq{bind command-name key}. For example, \cq{bind exit ^X^C}. Additionally, there are two other directives, \cw{width} and \cw{offset}, which give the default display parameters if no \cw{-w} and \cw{-o} options are specified. The easiest way to learn about the \cw{.tweakrc} file is to begin by having \cw{tweak} output its internal default one: \c tweak -D > $HOME/.tweakrc Then you can read the default file, learn the \cw{tweak} internal command names, and edit the file to do what you want. \H{tweak-manpage-bugs} BUGS This man page probably ought to contain an explicit list of internal command names, rather than simply referring you to the default \cw{.tweakrc}. tweak-3.01/tweak.10000644000175300017530000002523210433030011014234 0ustar simonsimon00000000000000.TH "tweak" "1" "2004-11-05" "Simon Tatham" "Simon Tatham" .SH "NAME" .PP \fBtweak\fP - efficient hex editor .SH "SYNOPSIS" .PP .nf \fBtweak\fP [\fB-l\fP | \fB-f\fP] [\fB-e\fP] [\fB-w\fP \fIwidth\fP] [\fB-o\fP \fIoffset\fP] \fIfilename\fP .fi .SH "DESCRIPTION" .PP \fBtweak\fP is a hex editor. It allows you to edit a file at very low level, letting you see the full and exact binary contents of the file. 
It can be useful for modifying binary files such as executables, editing disk or CD images, debugging programs that generate binary file formats incorrectly, and many other things. .PP Unlike simpler hex editors, \fBtweak\fP possesses a fully functional insert mode. This is not useful when editing many of the types of file described above, but can be useful in other situations. Also, an insert mode makes it easy to use \fBtweak\fP to construct new files from scratch. .PP When you open a file in \fBtweak\fP, you can expect to see the screen contents looking something like this: .PP .nf 00000000 7F 45 4C 46 01 01 01 00 .ELF.... 00000008 00 00 00 00 00 00 00 00 ........ 00000010 02 00 03 00 01 00 00 00 ........ 00000018 D0 8E 04 08 34 00 00 00 ....4... 00000020 2C EF 01 00 00 00 00 00 ,....... .fi .PP The central column shows you the hexadecimal value of each byte in the file you are editing. The column on the right shows the ASCII interpretation of those bytes, where applicable. In the example above, the sequence \fB45 4C 46\fP on the first line translates into the ASCII upper-case letters "ELF", but the subsequent sequence \fB01 01 01 00\fP does not have any printable ASCII representation and so the right-hand column simply prints dots. .PP The column on the left shows the position within the file of the start of each row. .PP In fact, when you start \fBtweak\fP, you will usually see 16 bytes of the file per row, not 8 as shown above. However, this is configurable if your screen is narrower - or wider - than the usual 80 columns, or if the file you are editing consists of fixed-size records of some other size. .PP By default, \fBtweak\fP does not load its entire input file into memory. Instead, it loads it \fIlazily\fP, reading from the file on disk when you request a view of a part of the file it doesn't have stored. When you modify the file, it stores your modifications in memory, but continues to refer to the original disk file for the parts you have not touched. 
This means you can edit extremely large files (for example, entire CD images) without difficulty; opening such a file is instantaneous, making modifications causes \fBtweak\fP's memory usage to grow with the size of the changes rather than the size of the whole file, and only when saving the altered version will \fBtweak\fP have to read through the entire input file to write the output. .PP However, this mode of operation has a disadvantage, which is that if the input file is modified by another program while \fBtweak\fP is running, \fBtweak\fP's internal data structures will not be sufficient to keep track, and it is likely that the file written out will contain a mixture of the old and new contents of the input file. Therefore, you can disable this lazy loading if you need to; see the \fB-e\fP option below. .SH "OPTIONS" .PP This section lists the command-line options supported by \fBtweak\fP. .IP "\fB-f\fP" Runs \fBtweak\fP in "fix" mode, i.e. with the insert function entirely disabled. This might be useful if you are editing a file in which the insert mode is of no use (executables, for example, tend to have strong dependencies on precise file offsets which make it almost impossible to insert data in one without rendering it unusable) and you want to avoid turning it on by mistake. .IP "\fB-l\fP" Runs \fBtweak\fP in "look" mode. In this mode \fBtweak\fP does not allow you to modify the data at all; it becomes simply a tool for examining a file in detail. .IP "\fB-e\fP" Runs \fBtweak\fP in "eager" mode. In this mode \fBtweak\fP will read its entire input file when starting up. This causes it to take up more memory, but means that it has no dependency on the input file remaining unmodified, and other programs can alter it if they need to without causing trouble. .IP "\fB-w\fP \fIwidth\fP" Specifies the number of bytes \fBtweak\fP will display per line. The default is 16, which fits neatly in an 80-column screen. 
.IP "\fB-o\fP \fIoffset\fP" If this option is specified, \fBtweak\fP will ensure that the given file offset occurs at the start of a line. For example, if you loaded a file using the options \fB-w 8 -o 0x13\fP, you might see a display a bit like this: .RS .PP .nf 00000000 7F 45 4C .EL 00000003 46 01 01 01 00 00 00 00 F....... 0000000B 00 00 00 00 00 02 00 03 ........ 00000013 00 01 00 00 00 D0 8E 04 ........ 0000001B 08 34 00 00 00 2C EF 01 .4...,.. .fi .PP By putting only three bytes of the file on the very first line, \fBtweak\fP has arranged that the file offset 0x13 (19 in decimal) appears at the beginning of the fourth line. .PP You might use this option if you knew you were editing a file in a particular format. For example, if your file contained a 53-byte header followed by a series of 22-byte records, you might find it useful to specify the options \fB-w 22 -o 53\fP. This would arrange that after the header, each individual record of the file would appear on precisely one line of \fBtweak\fP's display. .RE .IP "\fB-D\fP" If this option is specified, \fBtweak\fP will not attempt to load and edit a file at all, but will simply produce its default \fB.tweakrc\fP file on standard output. This is a useful way to give yourself a starting point if you want to begin reconfiguring \fBtweak\fP's keyboard layout. .SH "KEYS" .PP This section describes all the editing keys supported by \fBtweak\fP by default. The default key bindings for \fBtweak\fP are basically Emacs-like. .SS "Movement keys" .PP The Emacs cursor movement keys should all work, and their counterparts in ordinary function keys ought to work too: .IP "\fBo\fP" \fB^P\fP and \fB^N\fP go to the previous and next lines; Up and Down should do the same. .IP "\fBo\fP" \fB^B\fP and \fB^F\fP go back and forward one character; Left and Right should do the same. .IP "\fBo\fP" \fBM-v\fP and \fB^V\fP go up and down one screenful at a time; Page Up and Page Down should do the same. 
.IP "\fBo\fP" \fB^A\fP and \fB^E\fP go to the beginning and end of the line; Home and End should do the same. .PP Press \fBM-<\fP and \fBM->\fP to go to the beginning and end of the file. .PP Press \fB^X g\fP to go to a particular byte position in the file; you will be asked to type in the position you want. You can enter it in decimal, or as a hex number with "\fB0x\fP" before it. .SS "Editing keys" .PP Press Return to move the cursor between the hex section of the screen and the ASCII section. .PP When in the hex section, you can enter hexadecimal digits to alter data; when in the ASCII section, you can directly type ASCII text. .PP In ASCII mode, you can also press \fB^Q\fP to literally quote the next input character; for example, if you want to insert a Control-V, you can press \fB^Q^V\fP and \fBtweak\fP will automatically insert the byte value 0x16. .PP Press \fB^X^I\fP, or the Insert key if you have one, to toggle between overwrite mode and insert mode. In insert mode, typing hex or ASCII input will insert new bytes containing the values you provide. Also, you can then press Backspace to delete the byte to the left of the cursor, or \fB^D\fP or Delete to delete the byte under the cursor. .SS "Cut and paste" .PP Press \fB^@\fP (this character may be generated by the key combination Control-@, or Control-2, or Control-Space) to mark the end of a selection region. After you do this, the bytes between that mark and the cursor will be highlighted. Press \fB^@\fP again to abandon the selection. .PP Press \fBM-w\fP while a selection is active to copy the selected region into \fBtweak\fP's cut buffer. .PP In insert mode, you also have the option of pressing \fB^W\fP to \fIcut\fP the selected region completely out of the file and place it in the cut buffer. .PP Finally, press \fB^Y\fP to paste the cut buffer contents back into the file (this will overwrite or insert depending on the current mode). .SS "Searching" .PP Press \fB^S\fP to search for a byte sequence. 
You will be asked to enter some text to search for on the bottom line of the screen. You can type this text in ASCII, or as a sequence of hex byte values prefixed with backslashes (\fB\\\fP). For example, if you wanted to search for the byte value 5 followed by the word "hello", you might enter \fB\\05hello\fP. If you want to specify a literal backslash character, you can either enter it in hex (as \fB\\5C\fP), or simply double it on input (\fB\\\\\fP). .PP Press \fB^R\fP to search backwards instead of forwards from the current cursor position. .PP Since \fBtweak\fP deals in pure binary data, searches are always case-sensitive. .SS "Controlling the display" .PP If you press \fB^X w\fP, you will be asked to enter a new display width. This has the same effect as passing the \fB-w\fP option on the command line. Similarly, pressing \fB^X o\fP allows you to enter a new display offset, equivalent to the \fB-o\fP option. .PP By default, the current file position and file size are displayed on \fBtweak\fP's status line in hex. If you prefer them in decimal, you can press \fB^X x\fP or \fB^X h\fP to toggle them between hex and decimal. .SS "Miscellaneous" .PP Press \fB^L\fP to redraw the screen and recentre the cursor. Press \fB^Z\fP to suspend \fBtweak\fP and return temporarily to the shell. .PP Press \fB^X^S\fP to save the file you are editing. .PP Press \fB^X^C\fP to exit \fBtweak\fP. (If you do this with changes unsaved, you will be asked whether you want to save them.) .SH "CONFIGURATION FILE" .PP \fBtweak\fP's keyboard bindings are configurable. It will attempt to read a file from your home directory called \fB.tweakrc\fP, and if it finds one it will use the keyboard bindings described in it. If it does not find one, it will use its internal default bindings. .PP Most of the directives in \fB.tweakrc\fP are of the form "\fBbind command-name key\fP". For example, "\fBbind exit ^X^C\fP". 
Additionally, there are two other directives, \fBwidth\fP and \fBoffset\fP, which give the default display parameters if no \fB-w\fP and \fB-o\fP options are specified. .PP The easiest way to learn about the \fB.tweakrc\fP file is to begin by having \fBtweak\fP output its internal default one: .PP .nf tweak -D > $HOME/.tweakrc .fi .PP Then you can read the default file, learn the \fBtweak\fP internal command names, and edit the file to do what you want. .SH "BUGS" .PP This man page probably ought to contain an explicit list of internal command names, rather than simply referring you to the default \fB.tweakrc\fP. tweak-3.01/btree.html0000644000175300017530000010202510433030011015022 0ustar simonsimon00000000000000 An Efficient Data Structure For A Hex Editor

An Efficient Data Structure For A Hex Editor

by Simon Tatham

1. Introduction

Hex editors have been around for a long time, and at the very basic level they are very simple to write. Since they are mostly used for editing files such as executables, which contain a lot of cross-references to particular byte positions in the file, a hex editor need not have an insert mode in order to be useful. And a hex editor without an insert mode is very easy to implement: you simply allocate a large enough array for the input file, and use that as your data structure. The only operation you really need to be able to do efficiently is to jump to a particular byte position, and that's precisely what an array makes easy.

On the other hand, an insert mode can be useful in other circumstances. Not all types of file you might want to edit have the same restrictions as an executable. And as soon as you want your hex editor to have an insert mode, the data structure question becomes much more interesting.

In this article I present an efficient and scalable data structure which supports all the operations needed by a hex editor.

2. Simple options

One technique used to support insert mode in editors is to use an array larger than the file size, with a gap in it. The file contents up to the current cursor position are stored at the start of the array; the file contents from the current cursor position to the end are stored at the end of the array; and the gap in the middle moves about as the cursor does.

This makes insertion easy. When the user inserts an extra character, you just add it to one end or other of the gap. On the other hand, moving through the file now becomes a slow operation; it's not noticeable when you're moving by a byte, by a line, or even by a screenful at a time, but as soon as you try to jump to the start or end of the file, or jump to a particular specified file offset, suddenly the editor has to bodily shift enormous amounts of file data from one end of the gap to the other.

Another slightly better option is to use a linked list of small arrays, and to let the arrays vary in size between K and 2K bytes, for some fixed minimum block size K. Inserting a single byte in the middle of a block doesn't cost too much; occasionally the block will grow beyond size 2K and have to be split into two smaller ones, but even that isn't too slow.

Jumping to a particular position, however, is still an O(N) operation using this structure. In practice it isn't too bad, since the length of the linked list is at worst 1/K times the size of the file; but once the file size becomes seriously big, this approach does not scale well.

The common problem in both these methods is that as soon as you make insertion a constant-time operation, seeking to a given byte position becomes linear-time, whereas in the original array format, of course, seeking was constant-time but insertion was linear-time.

3. Using balanced trees

This is where trees come in. Balanced tree structures (any of AVL trees, red-black trees and B-trees) all solve this sort of problem for sorted lists. You can insert an element into a balanced tree in log time, and you can search for a particular element in log time as well. This sounds like the kind of compromise we want: if making insertion constant-time forces seeking to be linear and vice versa, we would prefer to arrange for both to be log-time.

The conventional use of a balanced tree to store a sorted list, however, is not immediately helpful to us. The only criterion we could reasonably sort on would be byte position in the file; and as soon as we store our data as a set of (position, data) pairs, we're back to insertion being linear again, because we would have to alter the position field of every tree element after the insertion point.

Is there anything we can do to our balanced trees to make this work better?

4. Counted trees

Yes, there is.

Suppose you add an additional field to every node of a balanced tree. In that field, you store a count of the number of elements in or below that node.

Operations which alter the tree (insertion and deletion) now have to make sure these counts remain accurate. This can be done without sacrificing the log-time characteristics of the operations. For example, when you add an element, you increment the count of the node containing it, and then work back up the tree to the root incrementing the counts in all the nodes you go past. Since the height of the tree is O(log N), this only takes you O(log N) time.

So we can add counts to a tree and still maintain it efficiently. What have the counts bought us?

Once we have counts in a tree, they introduce an entirely new way to search the tree. Starting at the root, we can search down the tree by examining the count fields rather than comparing elements as usual; and this allows us to find the Nth item in the tree, for any N, in a single log-time search. For example, suppose the root tree node contains a child with count 54, then an actual element, then a child with count 73. Then:

  • If you are trying to get to a position less than 54, then you descend straight to the leftmost child.
  • If you are trying to get to exactly position 54, you return the element out of the root node.
  • If you are trying to get to position 55 or greater, you descend to the rightmost child, and subtract 55 from your desired position. (If you want element 57 of the tree, then you know there are 55 elements in the tree before the right-hand subtree, so you know you want element 2 of the right-hand subtree.)

So now we have a means of finding the Nth item in a tree in a log-time search. This is starting to look promising.

The trouble is, we're still stuck with having some sort of sorting order on the tree. Now we need to deal with that.

5. Unsorted trees

The simple answer to the sorting problem is to do away with sorting the tree at all!

Conventional balanced trees have a sorting order because it's used to find elements in the tree, and to know where to add an element. But we don't need a sorting order to find things any more, because we can use a count-based search to jump to the Nth position. Can we also use counts during the tree add operation, to allow us to specify where we want to add our new element?

We can. Tree add algorithms start by searching down the tree to find the position where the new element will be inserted. If we do this search using counts, in exactly the same way described in section 4, then we can add any element we like at any position in the tree. Once we do this, of course, we have to throw out the sorting order completely, and never do another order-based search or insertion again, because they won't work. But that's OK, because we didn't need them anyway.

Now we have exactly what we were after in the first place. We have a data structure which stores an arbitrary, unsorted sequence of items, in such a way that we can insert or delete an item in log time and find the Nth element in log time.

6. Splitting and joining trees

Now we can begin to get more ambitious. One issue we have not addressed yet is cut and paste.

So far I have discussed tree insertion on the assumption that you only ever insert one character at a time into your tree. In fact hex editors need cut and paste just as much as normal text editors do; so we must think about how to insert or remove a larger block of data at a time.

One obvious way is to process each byte individually. A ten-byte cut operation is ten individual deletions, and a ten-byte paste is ten individual insertions. This is fine if you only ever use cut and paste to move tiny chunks of data around a large file, but if you need to move half the file from one place to another, things get more interesting.

The linked-list structure discussed in section 2 would have helped a lot with this problem. Linked lists don't just make it easy to insert or delete one item: they make it just as easy to unlink an enormous chunk of a list once you've found both ends of the chunk, and you can link that chunk in somewhere else easily as well.

It turns out that you can do the same thing with balanced trees. At this point it starts to make a difference what kind of balanced tree you use: all three of AVL, red-black and B-trees support these operations, but the precise methods vary between them. I'm going to use B-trees from here on, because the algorithms are slightly simpler.

What we need are two basic operations. Given a counted, unsorted B-tree containing an arbitrary sequence of items, we need to be able to:

  • Split the tree down the middle, giving two valid B-trees as output.
  • Take two valid B-trees and join them together end-to-end, giving one B-tree containing all the data from tree A followed by the data from tree B.

This will provide all the operations we need. To unlink a large section from the middle of a tree, we split it in two places and then join the outer two parts back together; to link a large section into the middle of a tree, we split it at the insertion point, join the left half on to the left side of the inserted section, and join the right half on to the right side of the inserted section.

6.1. Joining two B-trees together

When you add an element to a B-tree, sometimes it ends up increasing the size of a leaf node beyond the size limit. When that happens, you deal with it by splitting the node in two, and transforming the parent node so that where it previously had a single child pointer, it now has two child pointers with an element between them. If that makes the parent node too big as well, you do the same thing again, and so on until you reach the tree root.

Joining two B-trees is therefore reasonably simple, if you have an additional separating element to place in between them. Position the two trees so that their leaf nodes are at the same level; now (usually) one tree will be shorter than the other. So you can add the root of the shorter tree as a sibling of the node next to it in the taller tree; their common parent gains one extra child pointer (pointing at the root of the shorter tree), separated from its neighbour by the additional separating element. If this causes the node to increase beyond the maximum size, just split it in two and propagate up to its parent, just as in the ordinary insertion process.

If the trees were originally the same height, just combine their root nodes into a single larger root node. You need an extra element to go in between the rightmost child pointer of the left-hand root node, and the leftmost child pointer of the right-hand root node; and again, this is where your separating element comes in. Again, if the new root is too big to be a single node, split it in two and create a new root above it.

So it turns out that it's very easy to join two trees together, but the algorithm requires a spare element to go in the middle. However, we normally don't have such a spare element: we just have two trees. This is easily solved, though: we simply start by removing the leftmost element of the right-hand tree using the ordinary tree deletion algorithm. Then we just do the join algorithm, as described above, using the element we just removed as our separator.

6.2. Splitting a B-tree in two

To split a B-tree in two: we are given a tree, and a means of searching down the tree to find the split point. (In this application, that will be a numeric position, which we check against the node counts on the way down; in other situations, we might perfectly well want to split an ordinary sorted B-tree in half, so we might have an ordering-based search criterion. It makes no difference.)

We start in the simplest possible way. Start at the root node; decide which of its subtree pointers you are going to descend down; and saw the node in half at that subtree pointer. The two half-nodes thus created will each need a subtree pointer to go on the cut end, but that's OK because we're about to saw the next node down in half as well and they can have half each. So descend to the next node, decide on a split point again, saw that node in half, and put a pointer to each half in the two halves of the parent node.

Once we finish this searching-and-cutting pass, we will have successfully separated our tree into two parts at the required point. However, the result will almost certainly not be a pair of valid B-trees; the chances are that many of the nodes on the cut edges will be below the minimum allowed node size. In fact, if at any point our search criterion made us descend through the endmost subtree pointer in any node, some of those nodes will have no elements in them whatsoever, just a single subtree pointer!

So now we must make a healing pass down the cut edge of each tree, to turn it back into a valid B-tree. We can start by throwing away the root node if it has nothing but a single subtree pointer (which will happen quite often if we split near one end of the original tree, since in that case the output trees will almost certainly need to be of different heights). Keep doing that until we find a real root node.

One child of that node is on the cut edge, so it may be below the minimum size. If it is, we solve this using its (valid) neighbour node. If the neighbour is large, we can move some subtrees over into the undersized node to make two correctly sized nodes; if the neighbour is too small and does not have that many subtrees to spare, we can instead combine the undersized node with its neighbour. (And it turns out you can always do at least one of these: if the neighbour is too large to combine with the undersized node, then it must have enough subtrees for redistribution to give two viable nodes.)

The only interesting case is that combining an undersized node with its neighbour reduces the number of subtrees of their common parent by one. Therefore:

  • As we go down, we arrange for each node on the cut edge to be at least one more than minimum size, in case its size must drop by one when we process its child. (This still just about works in all cases.)
  • If the first non-trivial root node had only two children (recall that the root node in a B-tree is the only node exempt from the minimum size limit), and those two children end up having to be combined, then the root node must be thrown away again and the combined node is the new root.

Once we have sorted out each node, we descend to its child on the cut edge, and do the same thing again. Eventually we reach the bottom of the tree and every node is of valid size. Then we do the same thing to the cut edge of the other tree, and we're done.

7. Cloning trees

The splitting and joining algorithms look as if they make cut and paste pretty much trivial. You can split a big chunk out of your editing buffer into a separate cut buffer easily enough; and then you can ‘paste’ it somewhere else by joining it back into the middle of the editing buffer at a different position.

However, in real life, cut and paste isn't that simple. People often want to paste the same data more than once; so you can't just link the cut buffer straight into the editing buffer, because then you don't still have it to link in again next time. You need to copy the cut buffer and link in the copy. Equally, users often want to press Copy rather than Cut, in which case you have to split the buffer tree in two places, copy the middle section, and join all three back together.

Copying a tree, it would seem, is inherently an O(N) operation; there's no way you can copy a tree containing megabytes of data without actually copying all that data.

Or is there?

It turns out that we can do better than this, by adding another annotation field to each tree node. This time, the annotation is a reference count: it counts the number of pointers to the node, either from other tree nodes or from the ‘root’ field in a tree header structure. To begin with, of course, all reference counts are 1.

Reference counts are normally used for garbage collection. In this case, though, I'm going to use them to implement copy-on-write. All of the tree-altering algorithms (insertion and deletion, plus the split and join algorithms described above) will now check the reference count of a node before attempting to modify it. If they find that they need to modify a node with a reference count greater than one, they will not modify it. Instead, they will make a copy of that node, and use the copy in place of the original. The copy links to all the same child nodes as the original, so the reference count in each child must be incremented; and the copied node's parent (or tree header structure) now links to the copy rather than to the original, so the reference count in the original must be decremented. Now we are looking at a node with a reference count of 1, which means nobody else is using it so we can modify it safely.

The effect of this is that it is now a trivial - not merely log-time but constant-time - operation to clone an entire B-tree, no matter how large. We simply create a new tree header structure; we point its root field at the root node of the input tree; and we increment the reference count on that root node.

Once we have cloned a tree like this, we can treat the original and the clone as if they were entirely independent. If you add an element to one of them, for example, then a single string of nodes from the root down to one leaf will be duplicated and modified, but the rest of the trees will still be held in common. You can split either tree into lots of little pieces, or join it into the middle of a larger one, and never affect the data stored in what was once its clone, because every time you touch a node that the other tree is depending on, you make your own copy rather than disturbing it.

This allows us to support really efficient cut and paste in our hex editor. You select a 200Mb chunk and press Copy; the buffer tree is split in two places (in log time), the middle section is cloned (instantly), and the tree is joined back together. You'd hardly know anything was different - but the cut buffer now contains a clone of part of the original buffer, most of which consists of nodes that are still shared with it. And you can paste in as many copies as you like of that chunk, still in no worse than O(log N) time. The best bit is that by the time you've done this a few times and have a file that's 1600Mb longer than it started out, the hex editor isn't actually using up 1600Mb more memory, because most of it is in shared nodes! This technique naturally provides a form of compression as well as being fast.

8. Lazy file loading

In all of the above I have been tacitly assuming that the data elements stored in my tree are individual bytes. This would be hideously inefficient if I were using AVL or red-black trees, in which each node contains precisely one element: for every byte of the file being edited, there would be an overhead of two child pointers, a byte count and a reference count. On a normal 32-bit machine, that's 20 bytes per node, not counting overhead from the memory allocator. A factor of twenty is just ridiculous.

B-trees are a bit more flexible, since they can be made to have a large minimum degree. A B-tree with a minimum node size of (say) 512 can contain up to 1023 bytes of data plus 1024 subtree pointers, and those 1023 bytes can be packed together in memory so the overhead is now more like a factor of five. Also, since no node in a B-tree ever changes its height above ground level, you can just not bother to allocate space for the 1024 NULL child pointers in your leaf nodes, and since the vast majority of your nodes will be leaf nodes, the structure is now closer to being space-efficient.

There are other improvements one could make. For example, there's no reason why a B-tree really needs to have the same minimum node degree at every level; so you could have low-degree nodes everywhere above the leaf level, and enormous leaf nodes containing 4-8Kb of file data. You could move to B+ trees in which no actual data elements were stored anywhere except in the leaf nodes, thus saving the tiny alignment overheads in the other nodes.

However, there's a better direction to head in. In section 2 I mentioned the idea of using a linked list as the main data structure, and I said that each element of the linked list would be a smallish array of file bytes (between size K and 2K). There's no reason we couldn't do that in our B-tree-based approach: each element stored in the B-tree is no longer a single byte but a small block of bytes. It would mean that our element counts no longer allowed us to jump to the Nth byte, only to the Nth block; but we can fix that by replacing the element count with a byte count, summing the total size of all the blocks in or below a particular tree node. Now, given any byte position, we can do a single log-time search and return a data block plus an offset within that block.

This technique adds work to all operations. Inserting a byte, for example, is now done by finding the block it needs to go into, inserting it in that block, and potentially splitting the block into two and doing an extra tree operation. Splitting and joining buffers involves splitting and joining blocks at each end, and checking to make sure undersized blocks are not created. So what does this technique buy us, that makes it worthwhile over just storing single bytes in the B-tree?

The answer is: once we have a block data structure as our tree element, we can start having different types of block. In particular, we can have a type of block which is a placeholder, containing nothing but a file offset and length. A block of this type indicates ‘at this point in the tree we have N bytes from position P in the original file’. Blocks of this type are exempt from the normal maximum size for normal literal-data blocks.

The effect of this is that we no longer need to read the entire file into memory when we start up. Instead, we just initialise our tree trivially, so that it contains nothing but a single placeholder block, with offset zero and size equal to the initial file size.

Now whenever we need to read data from the tree, and it turns out the data in question is somewhere in a placeholder block, we must refer back to the original input file in order to find the data (and the placeholder block will tell us where in the file to read it from). So before we do any editing, our hex editor is suddenly a low-cost hex file viewer, which just pages back and forth and refers to the disk all the time.

But as soon as we start altering parts of the file, the placeholder block gets broken up into smaller blocks, and literal-data blocks begin to be created in between them. If we cut and paste a section including a placeholder block, then the tree can end up containing placeholder blocks in a strange order; it might (for example) indicate something like ‘the first 192K of the input file; then the literal bytes 5B 49 A7; then 25K of the input file starting from position 12345; then 512K of the input file starting from position 204325’.

Now the hex editor looks as if it's doing exactly the same thing as it did to begin with. I can page around the original file; I can insert, delete, overwrite, cut, copy and paste to my heart's content, and (provided no other process modifies the original file under our feet) the data I am manipulating will remain consistent at all times with the editing operations I have performed. But there wasn't a big delay at startup when the file was loaded in, because most of it wasn't loaded in; and if I list the running processes on my system, the hex editor will not be using memory proportional to the size of the file. It will only be using memory proportional to the changes I've made to the file.

When I save the file, if there are any placeholder blocks remaining in the buffer tree, the hex editor must write out the new version by referring to the original. This is the only remaining operation, apart from searching, that takes time proportional to the size of the file. And there are no remaining operations which take memory proportional to the size of the file.

(There is one thing you need to be careful of. Literal data blocks must be permitted to fall below the minimum size K if there is no literal block next to them to merge with; in particular, this is vital if you are writing a binary file from scratch or you would never be able to give it a size between zero and K. But this raises the possibility that given a pathological sequence of editing operations, your data structure might end up being an interleaving of one-byte literal blocks and one-byte placeholder blocks, giving a huge space overhead. The simplest solution to this is to impose a minimum size of 2K on placeholder blocks, below which you read the relevant piece of file data and convert them into literal blocks; then they can be merged with adjacent blocks and the worst case is no longer terrible.)

We now have a data structure which does pretty much everything you could reasonably ask a hex editor to be able to do, and does it all at a reasonable memory cost and (apart from searching and saving, the two operations which genuinely have to be linear) all in O(log N) time.

9. Further directions

The data structure as I have presented it is suitable for use in a high-performance hex editor with an insert mode.

There are a couple more points worth noting.

9.1. Conventional text editing

This structure would need only minor modifications to be an efficient basis for a conventional text editor. In order to do this, you would need to be able to jump quickly to a particular line of the file, which means you'd need a node annotation counting newlines.

In fact, it's possible to do slightly better than that: we can devise a more complex node annotation which tracks the effect of an arbitrary byte sequence on the (line, column) position. Assuming that a physical tab character always advances the cursor to the next multiple of 8 spaces, there are three possibilities:

  • A sequence of bytes containing no newlines or tabs simply adds some number A to the column number, and does not affect the line number.
  • A sequence of bytes containing no newlines but at least one tab has the overall effect of adding some number A to the column, and rounding it up to the next number that is congruent to B mod 8.
  • A sequence of bytes containing at least one newline has the effect of adding some number A to the line number, and setting the column number to a fixed value B.

These three function schemas are closed under composition (i.e. combining any two of them gives another one). Storing one in each node of a buffer tree would provide the ability to search directly to a particular (line, column) position in a single log-time search. So the text editor could treat its buffer as a simple sequence of bytes (or possibly of Unicode characters). This is superior to treating the buffer as a sequence of lines, because it removes the distinction between inserting within a line and inserting data between lines. In particular, cut and paste in a line-based model is fiddly because lines must be spliced together at each end of the pasted region; but cut and paste in this model is as trivial as it was in the hex editor - you just cut a sequence of bytes, paste it somewhere else, and the line/column indexing automatically keeps up no matter what you do.

The only snag is that if you did this, you would probably no longer be able to do the trick with placeholder blocks and lazy file loading; a text editor tends to need to know in advance where all the newlines are in its buffer, so there would probably be no alternative to physically loading the file. But in that, at least, this data structure is no worse than any other.

9.2. Supporting undo

An undo function in an editor conceptually stores a sequence of previous buffer states, and allows you to return to one of them when you need to.

Usually, this is not actually implemented by storing copies of the entire buffer, since that would be ludicrously wasteful of space! Instead, a journal of changes is kept which allows previous buffer states to be reconstructed by reversing the precise changes made.

One could do that using this data structure, if one wanted to. However, there's another intriguing option. Since cloning an arbitrarily large tree is a cheap operation, you could implement undo by actually storing a sequence of clones of previous buffer states! The cost of this would be nothing like as bad as it would naïvely appear.

It might still not be ideal, though. Every time you clone a tree and the two clones diverge, several nodes must be copied, and if each node contains several blocks of literal data then the cost of maintaining too many buffer clones might still become prohibitive. But it's an interesting possibility regardless.

10. Summary

I've presented a design for a data structure which implements practically every operation required for a hex editor in O(log N) time, apart from one or two which genuinely need to be O(N).

The structure is:

  • A B-tree, each of whose elements is either a small array of literal data bytes, or a placeholder block denoting a section of the unmodified input file.
  • Each B-tree node is annotated with the total byte count of all the elements in or below that node, allowing a log-time search to pinpoint any numeric byte position.
  • Those counts provide the only necessary means of navigating the tree, so there is no need for a sorting criterion.
  • Split and join algorithms make it possible to link and unlink large chunks from the middle of a buffer in a single operation.
  • Reference counts implementing copy-on-write allow cloning of chunks in constant time.

As a result:

  • Inserting or deleting bytes in the file is a log-time operation.
  • Finding a particular byte position is a log-time operation.
  • Cut and paste is always log-time, no matter how large or complex the chunk of data being moved around.
  • Memory usage grows proportionally to the changes made to the file, not the overall file size. (However, memory usage is also bounded by a value proportional to the file size, even if you keep editing and re-editing for ever.)

Searching must still be linear (there's no alternative to actually reading the data if you need to know anything about its contents), and saving the modified output file is linear (because you actually must physically write out that much data), but everything else can be done in log time.

I've also sketched a means of converting this into a data structure for an ordinary text editor, and suggested interesting implications in the area of undo operations.

11. References

Donald Knuth's ‘The Art of Computer Programming’ (Addison-Wesley, ISBN 0201485419) presents at least some of the same ideas as this article. Counted and unsorted trees are mentioned in volume 3; splitting and joining are also described (although Knuth does them on AVL trees, which are significantly more fiddly to split than B-trees; you have to cut the tree into lots of little pieces, and then put them all back together by using the join algorithm repeatedly).

‘Tweak’, a hex editor implementing this data structure, can be downloaded at http://www.chiark.greenend.org.uk/~sgtatham/tweak/.


[$Id: btree.but 4828 2004-11-19 18:48:59Z simon $]
tweak-3.01/Makefile0000644000175300017530000000431310433027777014525 0ustar simonsimon00000000000000# Useful options you might want to put on the make command line: # # - `SLANG=yes' to build against libslang instead of libncurses # (libncurses is better and more reliable, but libslang might be # all you have on a particular platform if you're unlucky). # # - `XFLAGS=-DNO_LARGE_FILES' to leave out the 64-bit file access # support (restricts Tweak to editing files under 2Gb, but # should cause it to compile successfully on platforms without # fseeko and ftello and/or long long support). # # - `VERSION=X.XX' (for whatever X.XX you like) to cause the `make # release' target to build a release tarball called # `tweak-X.XX.tar.gz' which unpacks into a directory # `tweak-X.XX'. Note that you also need to modify the version # number in tweak.h, or else the resulting binary won't match # the version number on the archive. CC := gcc CFLAGS := -g -c -Wall $(XFLAGS) LINK := gcc LFLAGS := LIBS := PREFIX=/usr/local BINDIR=$(PREFIX)/bin MANDIR=$(PREFIX)/man/man1 TWEAK := main.o keytab.o actions.o search.o rcfile.o buffer.o btree.o ifeq ($(SLANG),yes) # INCLUDE += -I/path/to/slang/include # LIBS += -L/path/to/slang/lib LIBS += -lslang TWEAK += slang.o else LIBS += -lncurses TWEAK += curses.o endif .c.o: $(CC) $(CFLAGS) $*.c all: tweak tweak.1 btree.html tweak: $(TWEAK) $(LINK) -o tweak $(TWEAK) $(LIBS) tweak.1: manpage.but halibut --man=$@ $< btree.html: btree.but halibut --html=$@ $< # Ensure tweak.h reflects this version number, and then run a # command like `make release VERSION=3.00'. 
release: tweak.1 btree.html mkdir -p reltmp/tweak-$(VERSION) for i in LICENCE *.c *.h *.but tweak.1 btree.html Makefile; do \ ln -s ../../$$i reltmp/tweak-$(VERSION); \ done (cd reltmp; tar chzvf ../tweak-$(VERSION).tar.gz tweak-$(VERSION)) rm -rf reltmp install: tweak tweak.1 mkdir -p $(BINDIR) install tweak $(BINDIR)/tweak mkdir -p $(MANDIR) install -m 0644 tweak.1 $(MANDIR)/tweak.1 clean: rm -f *.o tweak tweak.1 btree.html main.o: main.c tweak.h keytab.o: keytab.c tweak.h actions.o: actions.c tweak.h search.o: search.c tweak.h rcfile.o: rcfile.c tweak.h buffer.o: buffer.c tweak.h btree.h slang.o: slang.c tweak.h curses.o: curses.c tweak.h btree.o: btree.c btree.h