+{-# LANGUAGE RecordWildCards #-}

+{-# LANGUAGE TupleSections #-}

+-- | Attempt to implement the community detection algorithm of

+-- Ahn, Bagrow and Lehmann,

+-- Link communities reveal multiscale complexity in networks,

+-- Nature (doi:10.1038/nature09182),

+-- <http://www.nature.com/nature/journal/v466/n7307/abs/nature09182.html>.

+-- See <http://barabasilab.neu.edu/projects/linkcommunities/>.

+-- TODO: convert to a more-standard Graph representation.

+-- TODO: support weights

+import qualified Data.Map as M

+import qualified Data.Set as S

+import Control.Monad.State.Strict

+import Data.List (foldl', sortBy)

+import Data.Maybe (fromMaybe)

+import Data.Ratio (Ratio, (%))

+eLookup :: (Ord k, Show k) => String -> k -> M.Map k a -> a

+eLookup emsg k = fromMaybe (error ("*Internal error* " ++ emsg ++ " " ++ show k)) . M.lookup k

+-- | Represent a node. Create and extract with

+-- `toNodeLabel` and `fromNodeLabel` respectively.

+data NodeLabel = NL { fromNodeLabel :: Int }

+ deriving (Eq, Ord, Show)

+-- | Create a `NodeLabel`.

+toNodeLabel :: Int -> NodeLabel

+-- | A pair of nodes representing an undirected connection.

+-- Use `toNodePair` and `fromNodePair` to convert

+-- to and from a @Nodepair@.

+-- Note that @fromNodePair (toNodePair n1 n2)@ is

+-- @(min n1 n2, max n1 n2)@.

+-- Since the connections are undirected we

+-- ensure that the first node is "before"

+-- the second, and we do not allow loops (i.e.

+-- a connection between the same node).

+data NodePair = NP { fromNodePair :: (NodeLabel, NodeLabel) }

+ deriving (Eq, Ord, Show)

+-- | Create a connection between two nodes.

+-- The routine will fail if the two nodes are the same.

+toNodePair :: NodeLabel -> NodeLabel -> Maybe NodePair

+toNodePair n1 n2 = case compare n1 n2 of

+ LT -> Just $ NP (n1,n2)

+ GT -> Just $ NP (n2,n1)

+unsafeToNP :: NodeLabel -> NodeLabel -> NodePair

+ (error ("*Internal error* unable to convert nodes " ++ show n1 ++ "," ++ show n2 ++ " into a pair"))

+-- | The edges in an undirected graph with no loops (i.e. no node

+-- connected to itself). Create with

+-- `toEdges` and extract with `fromEdges`.

+data Edges = EG { fromEdges :: S.Set NodePair }

+toEdges :: S.Set NodePair -> Edges

+-- | Connections between nodes. If @n2@ is a member of the nodes

+-- connected to @n1@ then @n1@ will be a member of the nodes

+type AdjacencyList = M.Map NodeLabel (S.Set NodeLabel)

+toAL :: Edges -> AdjacencyList

+ let addToMap omap (NP (u1,u2)) = M.insertWith' S.union u1 (S.singleton u2) $ M.insertWith' S.union u2 (S.singleton u1) omap

+ in S.foldl' addToMap M.empty edges

+-- | Return a map of the neighbors of a node, including itself.

+getNeighbours :: AdjacencyList -> M.Map NodeLabel (S.Set NodeLabel)

+getNeighbours = M.mapWithKey S.insert

+-- | Return all pairs from the set (assuming an undirected graph).

+-- The output is not guaranteed to be ordered.

+getPairs :: S.Set NodeLabel -> [NodePair]

+getPairs = map (uncurry unsafeToNP) . pairUp . S.toAscList

+-- | Return pairs of elements, so that

+-- @pairUp [1,2,3,4] == [(1,2), (1,3), (1,4), (2,3), (2,4), (3,4)]@.

+-- If (1,2) is in the output then we do not want (2,1); we

+-- assume that the input list is ordered and unique.

+pairUp :: [a] -> [(a, a)]

+pairUp (x:xs) = map (x,) xs ++ pairUp xs

+-- It is posible that the density calculation can overflow when using Int

+-- so switch to Integer. Perhaps I should just switch to Double instead?

+-- Since the similarity is calculated as a ratio of set sizes, which are

+-- both Ints, we leave this ratio as is.

+type Similarity = Ratio Int

+type Density = Ratio Integer

+-- | It seems like this should be a Max Heap, since we are interested

+-- in those edges with higher similarity. For now just use a map.

+type SimMap = M.Map Similarity (S.Set (NodePair, NodePair))

+-- | Calculate the edge similarites, where the output is

+-- ordered by similarity. Since there are no weights

+-- we use the Jaccard index as the similarity measure.

+-- See section 2.1.1 of the Suplementary Information of

+-- Ahn, Bagrow, & Lehman.

+edgeSimilarities :: AdjacencyList -> SimMap

+ let ns = getNeighbours amap

+ -- remove those nodes that are only connected to one other node,

+ -- so they can not be a keystone node

+ filtMap = M.filter (\s -> S.size s > 1) amap

+ -- get all the impost nodes for each keystone node

+ pairsMap = M.map getPairs filtMap

+ -- calculate the similarity of this node triple

+ -- (two impost nodes which are both connected to

+ NodeLabel -- ^ the keystone node

+ -> SimMap -- ^ input similarity map

+ -> NodePair -- ^ the impost nodes

+ -> SimMap -- ^ updated similarity map

+ procPair n orig (NP (u1,u2)) =

+ let orderPair :: (Ord a) => (a, a) -> (a, a)

+ orderPair o@(a,b) = if a > b then (b,a) else o

+ edge = orderPair (unsafeToNP u1 n, unsafeToNP u2 n)

+ emsg = "missing node connection for node " ++ show n

+ getSet u = eLookup emsg u ns

+ n1 = getSet u1 -- ^ inclusive neighbors (n_+(i))

+ n2 = getSet u2 -- ^ inclusive neighbors (n_+(j))

+ -- calculate the Jaccard index

+ sim = S.size (n1 `S.intersection` n2) % S.size (n1 `S.union` n2)

+ in M.insertWith' S.union sim (S.singleton edge) orig

+ -- process all pairs of impost nodes for the given keystone node.

+ proc :: SimMap -> NodeLabel -> [NodePair] -> SimMap

+ proc orig n = foldl' (procPair n) orig

+ in M.foldlWithKey' proc M.empty pairsMap

+ -- ^ this is the partition density (equation 3)

+ -- *without* the leading 2/M term

+ , psCommunity :: M.Map NodePair Community

+ -- ^ what community is this edge in?

+ , psEdges :: M.Map Community (S.Set NodePair)

+ -- ^ what edges are in the community?

+ , psNodes :: M.Map Community (S.Set NodeLabel)

+ -- ^ what nodes are in the community?

+-- | What community does the node pair belong to? We assume that

+-- the edge exists in the graph.

+getCommunity :: PartitionState -> NodePair -> Community

+getCommunity PartitionState {..} edge =

+ eLookup "unknown edge" edge psCommunity

+-- | What edges belong to the community? We assume that the

+getCommunityEdges :: PartitionState -> Community -> S.Set NodePair

+getCommunityEdges PartitionState {..} c =

+ eLookup "unknown community (edges)" c psEdges

+-- | What nodes belong to the community? We assume that the

+getCommunityNodes :: PartitionState -> Community -> S.Set NodeLabel

+getCommunityNodes PartitionState {..} c =

+ eLookup "unknown community (nodes)" c psNodes

+-- | What is the link density for the community (given by the

+-- edges in the community and the nodes in the community).

+-- We calculate the per-community part of Equation 3, rather

+-- than equation 2, since we wish to calculate the partition

+-- density. The normalisation by 2/M is done elsewhere.

+calcCommunityDensity :: S.Set NodePair -> S.Set NodeLabel -> Density

+calcCommunityDensity es ns =

+ let m = fromIntegral $ S.size es -- number of links in the community

+ n = fromIntegral $ S.size ns -- number of induced nodes

+ in if n > 2 then m * (m - n + 1) / ((n - 2) * (n - 1)) else 0

+-- Merge the two communities, attempting to merge smaller into larger

+-- as that is more efficient for S.union (as it uses the

+-- hedge-union algorithm).

+ PartitionState -- ^ input state

+ -> Community -- ^ community to merge

+ -> Community -- ^ community to merge

+ -> PartitionState -- ^ merged communities (and the id of the smaller one removed)

+doCommunityMerge ps@(PartitionState {..}) c1 c2 =

+ -- x1 and x2 refer to properties of community 1 and 2.

+ -- x1' and x2' are the same but this time we have ensured that

+ -- community 1' is larger than 2' (for merging)

+ -- values without a suffix or ' are intended to be the

+ let e1 = getCommunityEdges ps c1

+ e2 = getCommunityEdges ps c2

+ -- to ensure that the unions are efficient, we want the

+ (c1', c2', e1', e2') = if S.size e2 > S.size e1 then (c2, c1, e2, e1) else (c1, c2, e1, e2)

+ edges = e1' `S.union` e2'

+ -- merge the nodes from the newly-added edges into the node set for this

+ n1' = getCommunityNodes ps c1'

+ n2' = getCommunityNodes ps c2' -- only needed for the density calculation

+ nodeList2' = S.foldl' (\s (NP (n1,n2)) -> S.insert n1 $ S.insert n2 s) S.empty e2'

+ nodes = if S.size n1' >= S.size nodeList2' then n1' `S.union` nodeList2' else nodeList2' `S.union` n1'

+ -- update the community map to move all the edges in c2' into c1'

+ moveCommunity :: M.Map NodePair Community -> NodePair -> M.Map NodePair Community

+ moveCommunity orig e = M.adjust (const c1') e orig

+ cmap = S.foldl' moveCommunity psCommunity e2'

+ emap = M.delete c2' $ M.adjust (const edges) c1' psEdges

+ nmap = M.delete c2' $ M.adjust (const nodes) c1' psNodes

+ -- add in the density for the new community and remove the

+ -- contribution from the old communities.

+ deltaDensity = calcCommunityDensity edges nodes

+ - calcCommunityDensity e1' n1'

+ - calcCommunityDensity e2' n2'

+ in PartitionState (psDensity + deltaDensity) cmap emap nmap

+type CommunityState = State PartitionState

+-- | If the two edges are not in the same community, merge the two

+-- communities, otherwise do nothing.

+mergeEdgeCommunities :: (NodePair, NodePair) -> CommunityState ()

+mergeEdgeCommunities (e1,e2) = do

+ let c1 = getCommunity ps e1

+ c2 = getCommunity ps e2

+ unless (c1 == c2) $ put $ doCommunityMerge ps c1 c2

+-- | Merge the edges at the given similarity level.

+ Density -- ^ normalization term (2/M) of equation 3

+ -> (Similarity, S.Set (NodePair, NodePair)) -- ^ the node triples for the given similarity level

+ -> CommunityState (Similarity, Density, M.Map Community (S.Set NodePair))

+ -- ^ The similarity, density *before* the merge, and community map

+mergePairs norm (sim, pairs) = do

+ PartitionState {..} <- get

+ mapM_ mergeEdgeCommunities $ S.toList pairs

+ return (sim, norm * psDensity, psEdges)

+-- | Create the starting state, where each edge belongs to its own

+startEdges :: S.Set NodePair -> (M.Map NodePair Community, M.Map Community (S.Set NodePair), M.Map Community (S.Set NodeLabel))

+ let cmap = M.fromAscList $ zip (S.toAscList es) [1..]

+ nodepair om np comm = M.insertWith' const comm (S.singleton np) om

+ nodelabel om (NP (n1,n2)) comm = M.insertWith' const comm (S.fromList [n1,n2]) om

+ f g = M.foldlWithKey' g M.empty cmap

+ in (cmap, f nodepair, f nodelabel)

+-- | Return (a,b) pairs and the values at the peak.

+-- This is very ugly. Is it really worth it?

+ -> Maybe ([(a, b)], (a,b), c)

+findPeak ((a1,b1,c1):xs) =

+ let go [] rs amax bmax cmax = (reverse rs, (amax,bmax), cmax)

+ go ((a,b,c):os) rs amax bmax cmax | b >= bmax = go os ((a,b):rs) a b c

+ | otherwise = go os ((a,b):rs) amax bmax cmax

+ {- If we want to find the first "peak"

+ let go2 [] rs' = (reverse rs', (amax,bmax), cmax)

+ go2 ((a',b',_):os') rs' = go2 os' ((a',b'):rs')

+ in Just $ go xs [(a1,b1)] a1 b1 c1

+-- | Create the partitions given the similarities. To restrict to a given

+-- threshold then @takeWhile ((>=threshold). fst)@ the input list.

+-- This may not handle all the edge cases you can get.

+-- The results do not quite match the Python code from

+-- http://barabasilab.neu.edu/projects/linkcommunities/, primarily in that

+-- their results seem to include an extra (1.0,0.0) at the head of the

+-- Similarity/Density list.

+-- Note that the community labels (here integers) are not guaranteed to

+-- match that of the Ahn et al. code.

+ -> SimMap -- ^ similarity values for all node triples in the graph

+ -> Maybe ([(Similarity, Density)], (Similarity,Density), M.Map Community (S.Set NodePair))

+ -- ^ returns Nothing for an empty graph, otherwise a list of the density values

+ -- as a function of similarity, the best (Similarity, Density), and the

+ -- community map at this point.

+mergeSimilarities (EG edges) simMap =

+ -- I think I am missing a trick here, in that there is a "shift" between the work done

+ -- in a loop and the values that are returned from the loop (e.g. note the *before*

+ -- caveats in mergePairs) and the need to augment the return values with a 0-similarity

+ let (icoms, iedges, inodes) = startEdges edges

+ s0 = PartitionState 0 icoms iedges inodes

+ norm = 2 % toInteger (S.size edges)

+ (xs, s1) = runState (mapM (mergePairs norm) (M.toDescList simMap)) s0

+ in findPeak $ xs ++ [(0.0, norm * psDensity s1, psEdges s1)]

+-- Reverse the order of the sort (at present I do not have a modern

+-- enough base to have Data.Ord.Down).

+rcomparing :: Ord b => (a,b) -> (a,b) -> Ordering

+rcomparing (_,b1) (_,b2) | b1 > b2 = LT

+-- Renumber the communities so that

+-- a) go from 1 to ncommunities

+-- b) they are ordered by number of edges in the community (in

+renumber :: M.Map Community (S.Set b) -> M.Map Community (S.Set b)

+ let oldcomms = map fst $ sortBy rcomparing $ M.assocs $ M.map S.size orig

+ -- for now assume the number of communities is small enough that

+ -- an association list is sufficient

+ conv = zip oldcomms [1..]

+ keyConv k = fromMaybe (error ("*Internal error* missing key " ++ show k)) $ lookup k conv

+ in M.mapKeys keyConv orig

+-- | Given a set of edges, return the community information at the optimal theshold.

+-- This is an implementation of the algorithm described in

+-- Ahn, Bagrow and Lehmann,

+-- Link communities reveal multiscale complexity in networks,

+-- Nature (doi:10.1038/nature09182).

+-- At present only unweighted and undirected graphs are supported.

+ Edges -- ^ The input graph.

+ -> Maybe ([(Similarity, Density)], (Similarity,Density), M.Map Community (S.Set NodePair))

+ -- ^ returns Nothing for an empty graph, otherwise a list of the density values

+ -- as a function of similarity, the best (Similarity, Density), and the

+ -- community map at this point. The communities are numbered by descending

+ -- number of edges in the community (with community nnumbers starting at 1).

+ let sim = edgeSimilarities $ toAL es

+ in case mergeSimilarities es sim of

+ Just (a,b,c) -> Just (a, b, renumber c)

+hack :: Int -> Int -> NodePair

+hack a b = unsafeToNP (NL a) (NL b)

+edges1, edges2, edges3, edges4 :: Edges

+edges1 = EG $ S.fromList [p12, p13, p24, p310, p41, p42]

+edges2 = EG $ S.fromList [p12, p13, p24, p310, p41, p42, p410, p32]

+edges3 = EG $ S.fromList [p12, p13, p54, p67]

+edges4 = EG $ S.fromList [p12, p54, p67]

+-- al, al2, al3 :: AdjacencyList