Alternative KMeansModel Construction

How to create K-Means models using the KMeansModel companion object

Training a K-Means model from a set of points using KMeans.train is one way to create a KMeansModel, but it is not the only one. The KMeansModel companion object provides several factory methods that construct a model in other ways; their signatures are listed below.

package com.massivedatascience.clusterer

/**
 * Factory methods for building a [[KMeansModel]] by means other than `KMeans.train`.
 *
 * Only the signatures are shown here; every body is elided with `???`.
 * All factories declare `KMeansModel` as their result type so that the
 * public API is explicit and uniform across the object.
 */
object KMeansModel {

  /**
   * Create a K-means model from given cluster centers and weights.
   *
   * @param ops distance function
   * @param centers initial cluster centers in homogeneous coordinates
   * @param weights initial cluster weights
   * @return k-means model
   */
  def fromVectorsAndWeights(
    ops: BregmanPointOps,
    centers: IndexedSeq[Vector],
    weights: IndexedSeq[Double]): KMeansModel = ???

  /**
   * Create a K-means model from given weighted vectors.
   *
   * @tparam T type of weighted vector supplied as centers
   * @param ops distance function
   * @param centers initial cluster centers as weighted vectors
   * @return k-means model
   */
  def fromWeightedVectors[T <: WeightedVector : ClassTag](
    ops: BregmanPointOps,
    centers: IndexedSeq[T]): KMeansModel = ???

  /**
   * Create a K-means model by selecting a set of k points at random.
   *
   * @param ops distance function
   * @param k number of centers desired
   * @param dim dimension of space
   * @param weight initial weight of points
   * @param seed random number seed; defaults to a value drawn from `XORShiftRandom.random`
   * @return k-means model
   */
  def usingRandomGenerator(
    ops: BregmanPointOps,
    k: Int,
    dim: Int,
    weight: Double,
    seed: Long = XORShiftRandom.random.nextLong()): KMeansModel = ???

  /**
   * Create a K-means model using the KMeans++ algorithm on an initial set of candidate centers.
   *
   * @tparam T type of weighted vector supplied as candidates
   * @param ops distance function
   * @param data initial candidate centers
   * @param weights initial weights
   * @param k number of clusters desired
   * @param perRound number of candidates to add per round
   * @param numPreselected initial sub-sequence of candidates to always select
   * @param seed random number seed; defaults to a value drawn from `XORShiftRandom.random`
   * @return k-means model
   */
  def fromCenters[T <: WeightedVector : ClassTag](
    ops: BregmanPointOps,
    data: IndexedSeq[T],
    weights: IndexedSeq[Double],
    k: Int,
    perRound: Int,
    numPreselected: Int,
    seed: Long = XORShiftRandom.random.nextLong()): KMeansModel = ???

  /**
   * Create a K-means model from a streaming k-means model.
   *
   * @param streamingKMeansModel mutable streaming model
   * @return immutable k-means model
   */
  def fromStreamingModel(streamingKMeansModel: StreamingKMeansModel): KMeansModel = ???

  /**
   * Create a K-means model from a set of assignments of points to clusters.
   *
   * @tparam T type of weighted vector held in `points`
   * @param ops distance function
   * @param points initial Bregman points
   * @param assignments assignments of points to clusters
   * @return k-means model
   */
  def fromAssignments[T <: WeightedVector : ClassTag](
    ops: BregmanPointOps,
    points: RDD[T],
    assignments: RDD[Int]): KMeansModel = ???

  /**
   * Create a K-means model using the K-Means || algorithm from an RDD of Bregman points.
   *
   * @tparam T type of weighted vector held in `data`
   * @param ops distance function
   * @param data initial points
   * @param k number of cluster centers desired
   * @param numSteps number of iterations of K-Means || (default 2)
   * @param sampleRate fraction of points to use in weighting clusters (default 1.0)
   * @param seed random number seed; defaults to a value drawn from `XORShiftRandom.random`
   * @return k-means model
   */
  def usingKMeansParallel[T <: WeightedVector : ClassTag](
    ops: BregmanPointOps,
    data: RDD[T],
    k: Int,
    numSteps: Int = 2,
    sampleRate: Double = 1.0,
    seed: Long = XORShiftRandom.random.nextLong()): KMeansModel = ???

  /**
   * Construct a K-means model using Lloyd's algorithm given a set of initial
   * K-means models.
   *
   * @tparam T type of weighted vector held in `data`
   * @param ops distance function
   * @param data points to fit
   * @param initialModels initial k-means models
   * @param clusterer k-means clusterer to use (default: a new `ColumnTrackingKMeans`)
   * @param seed random number seed; defaults to a value drawn from `XORShiftRandom.random`
   * @return the best k-means model found
   */
  def usingLloyds[T <: WeightedVector : ClassTag](
    ops: BregmanPointOps,
    data: RDD[T],
    initialModels: Seq[KMeansModel],
    clusterer: MultiKMeansClusterer = new ColumnTrackingKMeans(),
    seed: Long = XORShiftRandom.random.nextLong()): KMeansModel = ???
}

Previous: Iterative Clustering | Next: Customizing

Last updated 1 year ago

package com.massivedatascience.clusterer

/**
 * Factory methods for building a [[KMeansModel]] by means other than `KMeans.train`.
 *
 * Only the signatures are shown here; every body is elided with `???`.
 * All factories declare `KMeansModel` as their result type so that the
 * public API is explicit and uniform across the object.
 */
object KMeansModel {

  /**
   * Create a K-means model from given cluster centers and weights.
   *
   * @param ops distance function
   * @param centers initial cluster centers in homogeneous coordinates
   * @param weights initial cluster weights
   * @return k-means model
   */
  def fromVectorsAndWeights(
    ops: BregmanPointOps,
    centers: IndexedSeq[Vector],
    weights: IndexedSeq[Double]): KMeansModel = ???

  /**
   * Create a K-means model from given weighted vectors.
   *
   * @tparam T type of weighted vector supplied as centers
   * @param ops distance function
   * @param centers initial cluster centers as weighted vectors
   * @return k-means model
   */
  def fromWeightedVectors[T <: WeightedVector : ClassTag](
    ops: BregmanPointOps,
    centers: IndexedSeq[T]): KMeansModel = ???

  /**
   * Create a K-means model by selecting a set of k points at random.
   *
   * @param ops distance function
   * @param k number of centers desired
   * @param dim dimension of space
   * @param weight initial weight of points
   * @param seed random number seed; defaults to a value drawn from `XORShiftRandom.random`
   * @return k-means model
   */
  def usingRandomGenerator(
    ops: BregmanPointOps,
    k: Int,
    dim: Int,
    weight: Double,
    seed: Long = XORShiftRandom.random.nextLong()): KMeansModel = ???

  /**
   * Create a K-means model using the KMeans++ algorithm on an initial set of candidate centers.
   *
   * @tparam T type of weighted vector supplied as candidates
   * @param ops distance function
   * @param data initial candidate centers
   * @param weights initial weights
   * @param k number of clusters desired
   * @param perRound number of candidates to add per round
   * @param numPreselected initial sub-sequence of candidates to always select
   * @param seed random number seed; defaults to a value drawn from `XORShiftRandom.random`
   * @return k-means model
   */
  def fromCenters[T <: WeightedVector : ClassTag](
    ops: BregmanPointOps,
    data: IndexedSeq[T],
    weights: IndexedSeq[Double],
    k: Int,
    perRound: Int,
    numPreselected: Int,
    seed: Long = XORShiftRandom.random.nextLong()): KMeansModel = ???

  /**
   * Create a K-means model from a streaming k-means model.
   *
   * @param streamingKMeansModel mutable streaming model
   * @return immutable k-means model
   */
  def fromStreamingModel(streamingKMeansModel: StreamingKMeansModel): KMeansModel = ???

  /**
   * Create a K-means model from a set of assignments of points to clusters.
   *
   * @tparam T type of weighted vector held in `points`
   * @param ops distance function
   * @param points initial Bregman points
   * @param assignments assignments of points to clusters
   * @return k-means model
   */
  def fromAssignments[T <: WeightedVector : ClassTag](
    ops: BregmanPointOps,
    points: RDD[T],
    assignments: RDD[Int]): KMeansModel = ???

  /**
   * Create a K-means model using the K-Means || algorithm from an RDD of Bregman points.
   *
   * @tparam T type of weighted vector held in `data`
   * @param ops distance function
   * @param data initial points
   * @param k number of cluster centers desired
   * @param numSteps number of iterations of K-Means || (default 2)
   * @param sampleRate fraction of points to use in weighting clusters (default 1.0)
   * @param seed random number seed; defaults to a value drawn from `XORShiftRandom.random`
   * @return k-means model
   */
  def usingKMeansParallel[T <: WeightedVector : ClassTag](
    ops: BregmanPointOps,
    data: RDD[T],
    k: Int,
    numSteps: Int = 2,
    sampleRate: Double = 1.0,
    seed: Long = XORShiftRandom.random.nextLong()): KMeansModel = ???

  /**
   * Construct a K-means model using Lloyd's algorithm given a set of initial
   * K-means models.
   *
   * @tparam T type of weighted vector held in `data`
   * @param ops distance function
   * @param data points to fit
   * @param initialModels initial k-means models
   * @param clusterer k-means clusterer to use (default: a new `ColumnTrackingKMeans`)
   * @param seed random number seed; defaults to a value drawn from `XORShiftRandom.random`
   * @return the best k-means model found
   */
  def usingLloyds[T <: WeightedVector : ClassTag](
    ops: BregmanPointOps,
    data: RDD[T],
    initialModels: Seq[KMeansModel],
    clusterer: MultiKMeansClusterer = new ColumnTrackingKMeans(),
    seed: Long = XORShiftRandom.random.nextLong()): KMeansModel = ???
}