riak / src / chash.erl

%% @copyright 2007-2008 Basho Technologies

%% @reference Karger, D.; Lehman, E.; Leighton, T.; Panigrahy, R.; Levine, M.; Lewin, D. (1997). "Consistent hashing and random trees". Proceedings of the twenty-ninth annual ACM symposium on Theory of computing: 654~663. ACM Press New York, NY, USA

% @author Justin Sheehy <justin@basho.com>
% @author Andy Gross <andy@basho.com>

% @doc A consistent hashing implementation.
%      The space described by the ring coincides with SHA-1 hashes,
%      and so any two keys producing the same SHA-1 hash are
%      considered identical within the ring.

%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at

%% http://www.apache.org/licenses/LICENSE-2.0

%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% See the License for the specific language governing permissions and
%% limitations under the License.

-author('Justin Sheehy <justin@basho.com>').
-author('Andy Gross <andy@basho.com>').


-define(RINGTOP, trunc(math:pow(2,160)-1)).  % SHA-1 space

% @type chash() = {NumPartitions, [NodeEntry]}
%  NumPartitions = integer()
%  NodeEntry = {IndexAsInt, Node}
%  IndexAsInt = integer()
%  Node = node().
% It is not recommended that code outside this module make use
% of the structure of a chash.

% @type index() = binary().
% Indices into the ring, used as keys for object location, are binary
% representations of 160-bit integers.

% @type node() = term().
% A Node is the unique identifier for the owner of a given partition.
% An Erlang Pid works well here, but the chash module allows it to
% be any term.

% @doc Create a brand new ring.  The size and seednode are specified;
%      initially all partitions are owned by the seednode.  If NumPartitions
%      is not much larger than the intended eventual number of
%       participating nodes, then performance will suffer.
% @spec fresh(NumPartitions :: integer(), SeedNode :: node()) -> chash()
fresh(NumPartitions, SeedNode) ->
    Inc = ?RINGTOP div NumPartitions,
    {NumPartitions, [{IndexAsInt, SeedNode} ||
		   IndexAsInt <- lists:seq(0,(?RINGTOP-1),Inc)]}.

% @doc Find the Node that owns the partition identified by IndexAsInt.
% @spec lookup(IndexAsInt :: integer(), CHash :: chash()) -> node()
lookup(IndexAsInt, CHash) ->
    {_NumPartitions, Nodes} = CHash,
    {IndexAsInt, X} = proplists:lookup(IndexAsInt, Nodes),

% @doc Return true if named Node owns any partitions in the ring, else false.
% @spec contains_name(Name :: node(), CHash :: chash()) -> bool()
contains_name(Name, CHash) ->
    {_NumPartitions, Nodes} = CHash,
    case [X || {_,X} <- Nodes, X == Name] of
	[] ->
	_ ->

% @doc Make the partition beginning at IndexAsInt owned by Name'd node.
% @spec update(IndexAsInt :: integer(), Name :: node(), CHash :: chash())
%                -> chash()
update(IndexAsInt, Name, CHash) ->
    {NumPartitions, Nodes} = CHash,
    GapList = [{I,N} || {I,N} <- Nodes, I /= IndexAsInt],
    {A, B} = lists:partition(fun({K1,_}) -> K1 < IndexAsInt end, GapList),
    {NumPartitions, A ++ [{IndexAsInt,Name}] ++ B}.

% @doc Given an object key, return all NodeEntries in order starting at Index.
% @spec successors(Index :: index(), CHash :: chash()) -> [NodeEntry]
successors(Index, CHash) ->
    {NumPartitions, _Nodes} = CHash,
    successors(Index, CHash, NumPartitions).
% @doc Given an object key, return the next N NodeEntries in order
%      starting at Index.
% @spec successors(Index :: index(), CHash :: chash(), N :: integer())
%                                                             -> [NodeEntry]
successors(Index, CHash, N) ->
    Num = max_n(N, CHash),
    {Res, _} = lists:split(Num, ordered_from(Index, CHash)),

% @doc Given an object key, return all NodeEntries in reverse order
%      starting at Index.
% @spec predecessors(Index :: index(), CHash :: chash()) -> [NodeEntry]
predecessors(Index, CHash) ->
    predecessors(Index, CHash, length(CHash)).
% @doc Given an object key, return the next N NodeEntries in reverse order
%      starting at Index.
% @spec predecessors(Index :: index(), CHash :: chash(), N :: integer())
%                                                             -> [NodeEntry]
predecessors(Index, CHash, N) ->
    Num = max_n(N, CHash),
    {Res, _} = lists:split(Num, lists:reverse(ordered_from(Index,CHash))),

% @doc Return either N or the number of partitions in the ring, whichever
%      is greater.
% @spec max_n(N :: integer(), CHash :: chash()) -> integer()
max_n(N, {NumPartitions, _Nodes}) ->
	N > NumPartitions ->
	true ->

% @doc Given an object key, return all NodeEntries in order starting at Index.
% @spec ordered_from(Index :: index(), CHash :: chash()) -> [NodeEntry]
ordered_from(Index, {NumPartitions, Nodes}) ->
    <<IndexAsInt:160/integer>> = Index,
    Inc = ?RINGTOP div NumPartitions,
    {A, B} = lists:split((IndexAsInt div Inc)+1, Nodes),
    B ++ A.

% @doc Given any term used to name an object, produce that object's key
%      into the ring.  Two names with the same SHA-1 hash value are
%      considered the same name.
% @spec key_of(ObjectName :: term()) -> index()
key_of(ObjectName) ->    

% @doc Return all Nodes that own any partitions in the ring.
% @spec members(CHash :: chash()) -> [Node]
members(CHash) ->
    {_NumPartitions, Nodes} = CHash,
    lists:usort([X || {_Idx,X} <- Nodes]).

% @doc Return the entire set of NodeEntries in the ring.
% @spec nodes(CHash :: chash()) -> [NodeEntry]
nodes(CHash) ->
    {_NumPartitions, Nodes} = CHash,

% @doc Return the best possible merge of two rings based on a vclock.
%      If multiple nodes are actively claiming nodes in the same
%      time period, some churn can occur.
% @spec merge_rings(VClock :: vclock:vclock(),
%                   CHashA :: chash(), CHashB :: chash()) -> chash()
merge_rings(VClock,CHashA,CHashB) ->
    {NumPartitions, NodesA} = CHashA,
    {NumPartitions, NodesB} = CHashB,
    {NumPartitions, [{I,recentnode(VClock,A,B)} || 
		   {{I,A},{I,B}} <- lists:zip(NodesA,NodesB)]}.

% @spec recentnode(VClock :: vclock:vclock(), NodeA :: node(),
%                   NodeB :: node()) -> node()
recentnode(VClock,NodeA,NodeB) ->
    % note: this can result in node "theft" -- this is okay!
    % the idea is to gossip often, leading to convergence.
    CtrA = vclock:get_counter(NodeA,VClock),
    CtrB = vclock:get_counter(NodeB,VClock),
	CtrA > CtrB ->
	true ->

% @doc Return the number of partitions in the ring.
% @spec size(CHash :: chash()) -> integer()
size(CHash) ->
    {NumPartitions,_Nodes} = CHash,
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.