Hibari Overview, Part II

Scott Lystig Fritchie, slfritchie@snookles.com

August 25, 2010









































Check Out This URL

Architecture Layers


Bricks: Physical and logical


Bottom Layer: basic brick API


Basic Brick

  • Knows nothing about other bricks
  • Manages its own RAM and disk data structures
  • Uses binary tree to preserve keys' lexicographic sorting order
  • ... consistent hashing breaks cluster-wide sort order ...

    Basic Brick API: Basic UBF Data Types

    table()           = atom(nonempty,nonundefined);
    key()             = binary();
    %% ts = timestamp = usually an int64: time_t * 1000000 + usecs
    ts()              = integer();
    val()             = binary();
    exp_time()        = time_t();
    flags_list()      = [do_op_flag()];
    do_op_flag()      = {testset, ts()} |
                        witness |
                        get_all_attribs |
                        %% Flags for get_many
                        {max_num, integer()} |
                        {binary_prefix, binary()} |
                        must_exist |
                        must_not_exist |
                        value_in_ram |
                        ....
    

    Basic Brick API: modify ops

    add()             = {add,     key(), ts(), val(), exp_time(), flags_list()};
    replace()         = {replace, key(), ts(), val(), exp_time(), flags_list()};
    set()             = {set,     key(), ts(), val(), exp_time(), flags_list()};
    
    add_res()         = do1_res_ok() | do1_res_fail();
    
    do1_res_ok()      = ok |
                        key_not_exist |
                        {ok, ts()} |
                        {ok, ts(), val()} |
                        ....
    
    do1_res_fail()    = {key_exists, ts()} |
                        key_not_exist |
                        {ts_error, ts()} |
                        invalid_flag_present |
                        %% The responsible brick is unavailable/crashed/whatever.
                        brick_not_available;
    

    Basic Brick API: read-only ops

    get()             = {get,      key(), flags_list()};
    get_many()        = {get_many, key(), flags_list()};
    
    get_res()         = key_not_exist |
                        {ok, ts()} |
                        {ok, ts(), val()} |
                        {ok, ts(), val(), time_t(), flags_list()} |
                        ....
    get_many_res() = {ok, {[{key(), ts()}], boolean()}} |
                     {ok, {[{key(), ts(), flags_list()}], boolean()}} |
                     {ok, {[{key(), ts(), val(), time_t(), flags_list()}], boolean()}} |
                     ....
    

    Micro-Transactions

    Valid micro-transaction: all keys managed by same chain

        [txn,
         {op = replace, key = "string1", value = "Hello, world!"},
         {op = delete, key = "string4"}
        ]
    

    Invalid micro-transaction: keys managed by different chains

        [txn,
         {op = replace, key = "string1", value = "Hello, world!"},
         {op = delete, key = "string2"}
        ]
    

    Actually Useful Micro-transactions


    Example Micro-Transaction

    Imagine a table called 'posts'


    Sample key     Data stored in value blob
    /42/1          Text of post #1
    /42/1/1        Text of comment #1 on post #1
    /42/1/2        Text of comment #2 on post #1
    /42/2          Text of post #2
    /42/summary    Next post number, number of active posts, number of deleted posts, ...

    %% Add a new post for UserID to the 'posts' table.
    %%
    %% Fetches the per-user summary stored under "/<UserID>/summary", takes
    %% the next post number from it, then submits a single micro-transaction
    %% that both rewrites the summary and adds the post itself.
    %% Crashes (badmatch) unless the get returns {ok, TS, Val} and the
    %% transaction returns [ok, ok].
    add_new_post(UserID, PostText) ->
      UserPrefix = "/" ++ integer_to_list(UserID) ++ "/",
      SummaryKey = UserPrefix ++ "summary",
      {ok, SummaryTS, SummaryBin} = brick_simple:get(posts, SummaryKey),

      Summary = binary_to_term(SummaryBin),
      #post{next_id = PostNum, active = ActiveCount} = Summary,
      NewPostKey = UserPrefix ++ integer_to_list(PostNum),
      UpdatedSummary = Summary#post{next_id = PostNum + 1,
                                    active = ActiveCount + 1},

      TxnMarker = brick_server:make_txn(),
      %% replace op: aborts if the key does not exist
      %%             or if its current timestamp /= SummaryTS.
      ReplaceSummary = brick_server:make_replace(SummaryKey,
                                                 term_to_binary(UpdatedSummary),
                                                 0, [{testset, SummaryTS}]),
      %% add op: aborts if the key already exists.
      AddPost = brick_server:make_add(NewPostKey, PostText),
      [ok, ok] = brick_simple:do(posts, [TxnMarker, ReplaceSummary, AddPost]).
    

    The Admin Server