-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/


-- | Fast algorithms for single, average/UPGMA and complete linkage clustering.
--   
--   This package provides a function to create a dendrogram from a list of
--   items and a distance function between them. Initially a singleton
--   cluster is created for each item, and then new, bigger clusters are
--   created by merging the two clusters with least distance between them.
--   The distance between two clusters is calculated according to the
--   linkage type. The dendrogram represents not only the clusters but also
--   the order on which they were created.
--   
--   This package has many implementations with different performance
--   characteristics. There are SLINK and CLINK algorithm implementations
--   that are optimal in both space and time. There are also naive
--   implementations using a distance matrix. Using the <tt>dendrogram</tt>
--   function from <tt>Data.Clustering.Hierarchical</tt> automatically
--   chooses the best implementation we have.
--   
--   Changes in version 0.4:
--   
--   <ul>
--   <li>Specialize the distance type to Double for efficiency reasons.
--   It's uncommon to use distances other than Double.</li>
--   <li>Implement SLINK and CLINK. These are optimal algorithms in both
--   space and time for single and complete linkage, respectively, running
--   in <i>O(n^2)</i> time and <i>O(n)</i> space.</li>
--   <li>Reorganized internal implementation.</li>
--   <li>Some performance improvements for the naive implementation.</li>
--   <li>Better test coverage. Also, performance improvements for the test
--   suite, now running in 3 seconds (instead of one minute).</li>
--   </ul>
--   
--   Changes in version 0.3.1.2 (version 0.3.1.1 was skipped):
--   
--   <ul>
--   <li>Added tests for many things. Use <tt>cabal test</tt> =).</li>
--   </ul>
--   
--   Changes in version 0.3.1:
--   
--   <ul>
--   <li>Works with containers 0.4 (thanks, Doug Beardsley).</li>
--   <li>Removed some internal unnecessary overheads and added some
--   strictness.</li>
--   </ul>
--   
--   Changes in version 0.3.0.1:
--   
--   <ul>
--   <li>Listed changes of unreleased version 0.2.</li>
--   </ul>
--   
--   Changes in version 0.3:
--   
--   <ul>
--   <li>Added function <tt>cutAt</tt>.</li>
--   <li>Fixed complexity in Haddock comments.</li>
--   </ul>
--   
--   Changes in version 0.2:
--   
--   <ul>
--   <li>Added function <tt>elements</tt>.</li>
--   <li>Added separate functions for each linkage type. This may be useful
--   if you want to create a dendrogram and your distance data type isn't
--   an instance of <tt>Floating</tt>.</li>
--   </ul>
@package hierarchical-clustering
@version 0.4.7

module Data.Clustering.Hierarchical.Internal.Types

-- | Data structure for storing hierarchical clusters. The distance between
--   clusters is stored on the branches. Distances between leafs are the
--   distances between the elements on those leafs, while distances between
--   branches are defined by the linkage used (see <a>Linkage</a>).
data Dendrogram a

-- | The leaf contains the item <tt>a</tt> itself.
Leaf :: a -> Dendrogram a

-- | Each branch connects two clusters/dendrograms that are <tt>d</tt>
--   distance apart.
Branch :: {-# UNPACK #-} !Distance -> Dendrogram a -> Dendrogram a -> Dendrogram a

-- | The linkage type determines how the distance between clusters will be
--   calculated. These are the linkage types currently available on this
--   library.
data Linkage

-- | The distance between two clusters <tt>a</tt> and <tt>b</tt> is the
--   <i>minimum</i> distance between an element of <tt>a</tt> and an
--   element of <tt>b</tt>.
SingleLinkage :: Linkage

-- | The distance between two clusters <tt>a</tt> and <tt>b</tt> is the
--   <i>maximum</i> distance between an element of <tt>a</tt> and an
--   element of <tt>b</tt>.
CompleteLinkage :: Linkage

-- | The same as <a>CompleteLinkage</a>, but using the CLINK algorithm.
--   It's much faster however doesn't always give the best complete linkage
--   dendrogram.
CLINK :: Linkage

-- | Unweighted Pair Group Method with Arithmetic mean, also called
--   "average linkage". The distance between two clusters <tt>a</tt> and
--   <tt>b</tt> is the <i>arithmetic average</i> between the distances of
--   all elements in <tt>a</tt> to all elements in <tt>b</tt>.
UPGMA :: Linkage

-- | This method is usually wrongly called "average linkage". The distance
--   between cluster <tt>a = a1 U a2</tt> (that is, cluster <tt>a</tt> was
--   formed by the linkage of clusters <tt>a1</tt> and <tt>a2</tt>) and an
--   old cluster <tt>b</tt> is <tt>(d(a1,b) + d(a2,b)) / 2</tt>. So when
--   clustering two elements to create a cluster, this method is the same
--   as UPGMA. However, in general when joining two clusters this method
--   assigns equal weights to <tt>a1</tt> and <tt>a2</tt>, while UPGMA
--   assigns weights proportional to the number of elements in each
--   cluster. See, for example:
--   
--   <ul>
--   
--   <li><a>http://www.cs.tau.ac.il/~rshamir/algmb/00/scribe00/html/lec08/node21.html</a>,
--   which defines the real UPGMA and gives the equation to calculate the
--   distance between an old and a new cluster.</li>
--   
--   <li><a>http://github.com/JadeFerret/ai4r/blob/master/lib/ai4r/clusterers/average_linkage.rb</a>,
--   code for "average linkage" on ai4r library implementing what we call
--   here <tt>FakeAverageLinkage</tt> and not UPGMA.</li>
--   </ul>
FakeAverageLinkage :: Linkage

-- | A distance is simply a synonym of <a>Double</a> for efficiency.
type Distance = Double
instance GHC.Show.Show a => GHC.Show.Show (Data.Clustering.Hierarchical.Internal.Types.Dendrogram a)
instance GHC.Classes.Ord a => GHC.Classes.Ord (Data.Clustering.Hierarchical.Internal.Types.Dendrogram a)
instance GHC.Classes.Eq a => GHC.Classes.Eq (Data.Clustering.Hierarchical.Internal.Types.Dendrogram a)
instance GHC.Enum.Enum Data.Clustering.Hierarchical.Internal.Types.Linkage
instance GHC.Show.Show Data.Clustering.Hierarchical.Internal.Types.Linkage
instance GHC.Classes.Ord Data.Clustering.Hierarchical.Internal.Types.Linkage
instance GHC.Classes.Eq Data.Clustering.Hierarchical.Internal.Types.Linkage
instance GHC.Base.Functor Data.Clustering.Hierarchical.Internal.Types.Dendrogram
instance Data.Foldable.Foldable Data.Clustering.Hierarchical.Internal.Types.Dendrogram
instance Data.Traversable.Traversable Data.Clustering.Hierarchical.Internal.Types.Dendrogram


-- | Implementations that are optimal in space and time.
module Data.Clustering.Hierarchical.Internal.Optimal

-- | <i>O(n^2)</i> time and <i>O(n)</i> space. Calculates a complete,
--   rooted dendrogram for a list of items using single linkage with the
--   SLINK algorithm. This algorithm is optimal in space and time.
--   
--   <ul>
--   <li><i>Reference</i> R. Sibson (1973). "SLINK: an optimally efficient
--   algorithm for the single-link cluster method". <i>The</i> <i>Computer
--   Journal</i> (British Computer Society) 16 (1): 30-34.</li>
--   </ul>
singleLinkage :: [a] -> (a -> a -> Distance) -> Dendrogram a

-- | <i>O(n^2)</i> time and <i>O(n)</i> space. Calculates a complete,
--   rooted dendrogram for a list of items using complete linkage with the
--   CLINK algorithm. This algorithm is optimal in space and time.
--   
--   <ul>
--   <li><i>Reference</i> D. Defays (1977). "An efficient algorithm for a
--   complete link method". <i>The Computer Journal</i> (British Computer
--   Society) 20 (4): 364-366.</li>
--   </ul>
completeLinkage :: [a] -> (a -> a -> Distance) -> Dendrogram a

module Data.Clustering.Hierarchical.Internal.DistanceMatrix

-- | <i>O(n^3)</i> time and <i>O(n^2)</i> space. Calculates a complete,
--   rooted dendrogram for a list of items using single linkage with the
--   naïve algorithm using a distance matrix.
singleLinkage :: [a] -> (a -> a -> Distance) -> Dendrogram a

-- | <i>O(n^3)</i> time and <i>O(n^2)</i> space. Calculates a complete,
--   rooted dendrogram for a list of items using complete linkage with the
--   naïve algorithm using a distance matrix.
completeLinkage :: [a] -> (a -> a -> Distance) -> Dendrogram a

-- | <i>O(n^3)</i> time and <i>O(n^2)</i> space. Calculates a complete,
--   rooted dendrogram for a list of items using UPGMA with the naïve
--   algorithm using a distance matrix.
upgma :: [a] -> (a -> a -> Distance) -> Dendrogram a

-- | <i>O(n^3)</i> time and <i>O(n^2)</i> space. Calculates a complete,
--   rooted dendrogram for a list of items using fake average linkage with
--   the naïve algorithm using a distance matrix.
fakeAverageLinkage :: [a] -> (a -> a -> Distance) -> Dendrogram a
instance GHC.Show.Show Data.Clustering.Hierarchical.Internal.DistanceMatrix.Cluster
instance GHC.Classes.Ord Data.Clustering.Hierarchical.Internal.DistanceMatrix.Cluster
instance GHC.Classes.Eq Data.Clustering.Hierarchical.Internal.DistanceMatrix.Cluster

module Data.Clustering.Hierarchical

-- | Data structure for storing hierarchical clusters. The distance between
--   clusters is stored on the branches. Distances between leafs are the
--   distances between the elements on those leafs, while distances between
--   branches are defined by the linkage used (see <a>Linkage</a>).
data Dendrogram a

-- | The leaf contains the item <tt>a</tt> itself.
Leaf :: a -> Dendrogram a

-- | Each branch connects two clusters/dendrograms that are <tt>d</tt>
--   distance apart.
Branch :: {-# UNPACK #-} !Distance -> Dendrogram a -> Dendrogram a -> Dendrogram a

-- | A distance is simply a synonym of <a>Double</a> for efficiency.
type Distance = Double

-- | List of elements in a dendrogram.
elements :: Dendrogram a -> [a]

-- | <tt>dendro `cutAt` threshold</tt> cuts the dendrogram <tt>dendro</tt>
--   at all branches which have distances strictly greater than
--   <tt>threshold</tt>.
--   
--   For example, suppose we have
--   
--   <pre>
--   dendro = Branch 0.8
--              (Branch 0.5
--                (Branch 0.2
--                  (Leaf 'A')
--                  (Leaf 'B'))
--                (Leaf 'C'))
--              (Leaf 'D')
--   </pre>
--   
--   Then:
--   
--   <pre>
--   dendro `cutAt` 0.9 == dendro `cutAt` 0.8 == [dendro] -- no changes
--   dendro `cutAt` 0.7 == dendro `cutAt` 0.5 == [Branch 0.5 (Branch 0.2 (Leaf 'A') (Leaf 'B')) (Leaf 'C'), Leaf 'D']
--   dendro `cutAt` 0.4 == dendro `cutAt` 0.2 == [Branch 0.2 (Leaf 'A') (Leaf 'B'), Leaf 'C', Leaf 'D']
--   dendro `cutAt` 0.1 == [Leaf 'A', Leaf 'B', Leaf 'C', Leaf 'D'] -- no branches at all
--   </pre>
cutAt :: Dendrogram a -> Distance -> [Dendrogram a]

-- | The linkage type determines how the distance between clusters will be
--   calculated. These are the linkage types currently available on this
--   library.
data Linkage

-- | The distance between two clusters <tt>a</tt> and <tt>b</tt> is the
--   <i>minimum</i> distance between an element of <tt>a</tt> and an
--   element of <tt>b</tt>.
SingleLinkage :: Linkage

-- | The distance between two clusters <tt>a</tt> and <tt>b</tt> is the
--   <i>maximum</i> distance between an element of <tt>a</tt> and an
--   element of <tt>b</tt>.
CompleteLinkage :: Linkage

-- | The same as <a>CompleteLinkage</a>, but using the CLINK algorithm.
--   It's much faster however doesn't always give the best complete linkage
--   dendrogram.
CLINK :: Linkage

-- | Unweighted Pair Group Method with Arithmetic mean, also called
--   "average linkage". The distance between two clusters <tt>a</tt> and
--   <tt>b</tt> is the <i>arithmetic average</i> between the distances of
--   all elements in <tt>a</tt> to all elements in <tt>b</tt>.
UPGMA :: Linkage

-- | This method is usually wrongly called "average linkage". The distance
--   between cluster <tt>a = a1 U a2</tt> (that is, cluster <tt>a</tt> was
--   formed by the linkage of clusters <tt>a1</tt> and <tt>a2</tt>) and an
--   old cluster <tt>b</tt> is <tt>(d(a1,b) + d(a2,b)) / 2</tt>. So when
--   clustering two elements to create a cluster, this method is the same
--   as UPGMA. However, in general when joining two clusters this method
--   assigns equal weights to <tt>a1</tt> and <tt>a2</tt>, while UPGMA
--   assigns weights proportional to the number of elements in each
--   cluster. See, for example:
--   
--   <ul>
--   
--   <li><a>http://www.cs.tau.ac.il/~rshamir/algmb/00/scribe00/html/lec08/node21.html</a>,
--   which defines the real UPGMA and gives the equation to calculate the
--   distance between an old and a new cluster.</li>
--   
--   <li><a>http://github.com/JadeFerret/ai4r/blob/master/lib/ai4r/clusterers/average_linkage.rb</a>,
--   code for "average linkage" on ai4r library implementing what we call
--   here <tt>FakeAverageLinkage</tt> and not UPGMA.</li>
--   </ul>
FakeAverageLinkage :: Linkage

-- | Calculates a complete, rooted dendrogram for a list of items and a
--   linkage type. The following are the time and space complexities for
--   each linkage:
--   
--   <ul>
--   <li><i><a>SingleLinkage</a></i> <i>O(n^2)</i> time and <i>O(n)</i>
--   space, using the SLINK algorithm. This algorithm is optimal in both
--   space and time and gives the same answer as the naive algorithm using
--   a distance matrix.</li>
--   <li><i><a>CompleteLinkage</a></i> <i>O(n^3)</i> time and <i>O(n^2)</i>
--   space, using the naive algorithm with a distance matrix. Use
--   <a>CLINK</a> if you need more performance.</li>
--   <li><i>Complete linkage with <a>CLINK</a></i> <i>O(n^2)</i> time and
--   <i>O(n)</i> space, using the CLINK algorithm. Note that this algorithm
--   doesn't always give the same answer as the naive algorithm using a
--   distance matrix, but it's much faster.</li>
--   <li><i><a>UPGMA</a></i> <i>O(n^3)</i> time and <i>O(n^2)</i> space,
--   using the naive algorithm with a distance matrix.</li>
--   <li><i><a>FakeAverageLinkage</a></i> <i>O(n^3)</i> time and
--   <i>O(n^2)</i> space, using the naive algorithm with a distance
--   matrix.</li>
--   </ul>
dendrogram :: Linkage -> [a] -> (a -> a -> Distance) -> Dendrogram a
