K-means

Principles learned

  • Create a set decision variable
  • Use the operator “count” to get the number of elements in a set
  • Use a lambda expression to compute a sum on a set
  • Use ternary conditions

Problem

../_images/kmeans.png

Given a sample of observations along some dimensions, the goal is to partition these observations into k clusters. Clusters are defined by their center of gravity. Each observation belongs to the cluster with the nearest center of gravity. For more details, see Wikipedia.

Download the example

Data

The format of the data is as follows:

  • 1st line: number of observations and number of dimensions
  • For each observation: coordinate along each dimension, and the actual cluster it belongs to

Program

The model implemented here makes use of set variables. For every cluster, we define a set which describes the observations assigned to that cluster. Those sets are constrained to form a partition, which means that an observation must be assigned to exactly one cluster. For each cluster, we compute the centroid of the observations in the cluster, from which we can obtain the variance of the cluster. The variance of a cluster is defined as the sum of the respective squared euclidian distances between the centroid and every element of the cluster. The objective is to minimize the sum of these variances.

Execution:
localsolver kmeans.lsp inFileName=instances/iris.dat [lsTimeLimit=] [solFileName=] [k=val]
/********** kmeans.lsp **********/
use io;

/* Reads instance data */
function input() {
    usage = "\nUsage: localsolver kmeans.lsp "
        + "inFileName=inputFile [solFileName=outputFile] [lsTimeLimit=timeLimit] [k=value]\n";

    if (inFileName == nil) throw usage;
    local f = io.openRead(inFileName);
    nbObservations = f.readInt();
    nbDimensions = f.readInt();

    if (k == nil) {
        k = 2;
    }

    for [o in 0..nbObservations-1] {
        for [d in 0..nbDimensions-1] {
            coordinates[o][d] = f.readDouble();
        }
        initialClusters[o] = f.readString();
    }
}

/* Declares the optimization model. */
function model() {
    // Set decisions: clusters[c] represents the points in cluster c
    clusters[1..k] <- set(nbObservations);

    // Each point must be in one cluster and one cluster only
    constraint partition[c in 1..k](clusters[c]);

    // Compute variances
    for [c in 1..k] {
        local cluster <- clusters[c];
        local size <- count(cluster);

        // Compute the centroid of the cluster
        centroid[d in 0..nbDimensions-1] <- size == 0 ? 0 :
                sum(cluster, o => coordinates[o][d]) / size;

        // Compute the variance of the cluster
        squares[d in 0..nbDimensions-1] <- sum(cluster,
            o => pow(coordinates[o][d] - centroid[d], 2));
        variances[c] <- sum[d in 0..nbDimensions-1](squares[d]);
    }

    // Minimize the total variance
    obj <- sum[c in 1..k](variances[c]);
    minimize obj;
}

/* Parameterizes the solver. */
function param() {
    if (lsTimeLimit == nil) lsTimeLimit = 5;
}

/* Writes the solution in a file in the following format:
 *  - objective value
 *  - k
 *  - for each cluster, a line with the elements in the cluster (separated by spaces)
 */
function output() {
    if (solFileName == nil) return;
    local solFile = io.openWrite(solFileName);
    solFile.println(obj.value);
    solFile.println(k);
    for [c in 1..k] {
        for [o in clusters[c].value]
            solFile.print(o + " ");
        solFile.println();
    }
}
Execution (Windows)
set PYTHONPATH=%LS_HOME%\bin\python
python kmeans.py instances\iris.dat
Execution (Linux)
export PYTHONPATH=/opt/localsolver_10_0/bin/python
python kmeans.py instances/iris.dat
########## kmeans.py ##########

import localsolver
import sys

if len(sys.argv) < 2:
    print("Usage: python kmeans.py inputFile [outputFile] [timeLimit] [k value]")
    sys.exit(1)

def read_elem(filename):
    with open(filename) as f:
        return [str(elem) for elem in f.read().split()]

with localsolver.LocalSolver() as ls:

    #
    # Reads instance data
    #
    if len(sys.argv) > 4: k = int(sys.argv[4])
    else: k = 2

    file_it = iter(read_elem(sys.argv[1]))
    
    # Data properties
    nb_observations = int(next(file_it))
    nb_dimensions = int(next(file_it))

    coordinates = [None]*nb_observations
    initial_clusters = [None]*nb_observations
    for o in range(nb_observations):
        coordinates[o] = [None]*(nb_dimensions)
        for d in range(nb_dimensions):
            coordinates[o][d] = float(next(file_it))
        initial_clusters[o] = next(file_it)

    #
    # Declares the optimization model
    #
    model = ls.model

    # clusters[c] represents the points in cluster c
    clusters = [model.set(nb_observations) for c in range(k)]

    # Each point must be in one cluster and one cluster only
    model.constraint(model.partition(clusters))

    # Coordinates of points
    coordinates_array = model.array(coordinates)

    # Compute variances
    variances = []
    for cluster in clusters:
        size = model.count(cluster)

        # Compute centroid of cluster
        centroid = [0 for d in range(nb_dimensions)]
        for d in range(nb_dimensions):
            coordinate_selector = model.lambda_function(
                    lambda i: model.at(coordinates_array, i, d))
            centroid[d] = model.iif(size == 0, 0,
                    model.sum(cluster, coordinate_selector) / size)

        # Compute variance of cluster
        variance = model.sum()
        for d in range(nb_dimensions):
            dimension_variance_selector = model.lambda_function(lambda i: model.sum(
                    model.pow(model.at(coordinates_array, i, d) - centroid[d], 2)))
            dimension_variance = model.sum(cluster, dimension_variance_selector)
            variance.add_operand(dimension_variance)
        variances.append(variance)

    # Minimize the total variance
    obj = model.sum(variances)
    model.minimize(obj)

    model.close()

    #
    # Parameterizes the solver
    #
    if len(sys.argv) > 3: ls.param.time_limit = int(sys.argv[3])
    else: ls.param.time_limit = 5

    ls.solve()

    #
    # Writes the solution in a file in the following format:
    #  - objective value
    #  - k
    #  - for each cluster, a line with the elements in the cluster (separated by spaces)
    #
    if len(sys.argv) > 2:
        with open(sys.argv[2], 'w') as f:
            f.write("%f\n" % obj.value)
            f.write("%d\n" % k)
            for c in range(k):
                for o in clusters[c].value:
                    f.write("%d " % o)
                f.write("\n")
Compilation / Execution (Windows)
cl /EHsc kmeans.cpp -I%LS_HOME%\include /link %LS_HOME%\bin\localsolver100.lib
kmeans instances\iris.dat
Compilation / Execution (Linux)
g++ kmeans.cpp -I/opt/localsolver_10_0/include -llocalsolver100 -lpthread -o kmeans
./kmeans instances/iris.dat
//********* kmeans.cpp *********

#include <iostream>
#include <fstream>
#include <vector>
#include <limits>
#include "localsolver.h"

using namespace localsolver;
using namespace std;

class Kmeans {
public:
    // Data properties 
    int nbObservations;
    int nbDimensions;
    int k;

    vector<vector<lsdouble> > coordinates;
    vector<string> initialClusters;

    // Solver. 
    LocalSolver localsolver;

    // Decisions 
    vector<LSExpression> clusters;

    // Objective 
    LSExpression obj;

    // Constructor 
    Kmeans(int k) : k(k) { }

    // Reads instance data 
    void readInstance(const string& fileName) {
        ifstream infile;
        infile.exceptions(ifstream::failbit | ifstream::badbit);
        infile.open(fileName.c_str());

        infile >> nbObservations;
        infile >> nbDimensions;

        coordinates.resize(nbObservations);
        initialClusters.resize(nbObservations);
        for (int o = 0; o < nbObservations; o++) {
            coordinates[o].resize(nbDimensions);
            for (int d = 0; d < nbDimensions; d++) {
                infile >> coordinates[o][d];
            }
            infile >> initialClusters[o];
        }
    }

    void solve(int limit) {
        // Declares the optimization model. 
        LSModel model = localsolver.getModel();

        // Set decisions: clusters[c] represents the points in cluster c
        clusters.resize(k);
        for (int c = 0; c < k; ++c) {
            clusters[c] = model.setVar(nbObservations);
        }

        // Each point must be in one cluster and one cluster only
        model.constraint(model.partition(clusters.begin(), clusters.end()));

        // Coordinates of points
        LSExpression coordinatesArray = model.array();
        for (int o = 0; o < nbObservations; o++) {
            coordinatesArray.addOperand(model.array(coordinates[o].begin(),
                                                    coordinates[o].end()));
        }

        // Compute variances
        vector<LSExpression> variances;
        variances.resize(k);
        for (int c = 0; c < k; c++) {
            LSExpression cluster = clusters[c];
            LSExpression size = model.count(cluster);

            // Compute the centroid of the cluster
            LSExpression centroid = model.array();
            for (int d = 0; d < nbDimensions; d++) {
                LSExpression coordinateSelector = model.createLambdaFunction(
                    [&](LSExpression o) { return model.at(coordinatesArray, o, d); });
                centroid.addOperand(model.iif(size == 0, 0,
                    model.sum(cluster, coordinateSelector) / size));
            }

            // Compute the variance of the cluster
            LSExpression variance = model.sum();
            for (int d = 0; d < nbDimensions; d++) {
                LSExpression dimensionVarianceSelector = model.createLambdaFunction(
                    [&](LSExpression o) { 
                        return model.pow(model.at(coordinatesArray, o, d) 
                                         - model.at(centroid, d), 2);
                    });
                LSExpression dimensionVariance = model.sum(cluster,
                    dimensionVarianceSelector);
                variance.addOperand(dimensionVariance);
            }
            variances[c] = variance;
        }

        // Minimize the total variance
        obj = model.sum(variances.begin(), variances.end());
        model.minimize(obj);

        model.close();

        // Parameterizes the solver. 
        localsolver.getParam().setTimeLimit(limit);

        localsolver.solve(); 
    }

    // Writes the solution in a file in the following format:
    //  - objective value
    //  - k
    //  - for each cluster, a line with the elements in the cluster (separated by spaces)
    void writeSolution(const string& fileName) {
        ofstream outfile;
        outfile.exceptions(ofstream::failbit | ofstream::badbit);
        outfile.open(fileName.c_str());

        outfile << obj.getDoubleValue() << endl;
        outfile << k << endl;
        for (int c = 0; c < k; c++) {
            LSCollection clusterCollection = clusters[c].getCollectionValue();
            for (int i = 0; i < clusterCollection.count(); i++) {
                    outfile << clusterCollection[i] << " ";
                }
            outfile << endl;
        }
    }
};
    
int main(int argc, char** argv) {
    if (argc < 2) {
        cerr << "Usage: kmeans inputFile [outputFile] [timeLimit] [k value]" << endl;
        return 1;
    }

    const char* instanceFile = argv[1];
    const char* solFile = argc > 2 ? argv[2] : NULL;
    const char* strTimeLimit = argc > 3 ? argv[3] : "5";
    const char* k = argc > 4 ? argv[4] : "2";

    try {
        Kmeans model(atoi(k));
        model.readInstance(instanceFile);
        model.solve(atoi(strTimeLimit));
        if (solFile != NULL) model.writeSolution(solFile);
        return 0;
    } catch (const exception& e) {
        cerr << "An error occurred: " << e.what() << endl;
        return 1;
    }
}
Compilation / Execution (Windows)
copy %LS_HOME%\bin\localsolvernet.dll .
csc Kmeans.cs /reference:localsolvernet.dll
Kmeans instances\iris.dat
/********** kmeans.cs **********/

using System;
using System.IO;
using System.Globalization;
using localsolver;

public class Kmeans : IDisposable
{
    // Data properties
    int nbObservations;
    int nbDimensions;
    int k;

    double[][] coordinates;
    string[] initialClusters;

    // Solver.
    LocalSolver localsolver;

    // Decisions.
    LSExpression[] clusters;

    // Objective.
    LSExpression obj;

    public Kmeans(int k)
    {
        localsolver = new LocalSolver();
        this.k = k;
    }

    // Reads instance data 
    public void ReadInstance(string fileName)
    {
        using (StreamReader input = new StreamReader(fileName))
        {
            string[] splittedLine = input.ReadLine().Split();

            nbObservations = int.Parse(splittedLine[0]);
            nbDimensions = int.Parse(splittedLine[1]);

            coordinates = new double[nbObservations][];
            initialClusters = new string[nbObservations];
            for (int o = 0; o < nbObservations; o++)
            {
                splittedLine = input.ReadLine().Split();
                coordinates[o] = new double[nbDimensions];
                for (int d = 0; d < nbDimensions; d++)
                    coordinates[o][d] = double.Parse(splittedLine[d], 
                        CultureInfo.InvariantCulture);
                initialClusters[o] = splittedLine[nbDimensions];
            }
        }
    }

    public void Dispose()
    {
        if (localsolver != null)
            localsolver.Dispose();
    }

    public void Solve(int limit)
    {
        // Declares the optimization model
        LSModel model = localsolver.GetModel();

        // Set decisions: clusters[c] represents the points in cluster c
        clusters = new LSExpression[k];
        for (int c = 0; c < k; c++)
            clusters[c] = model.Set(nbObservations);

        // Each point must be in one cluster and one cluster only
        model.Constraint(model.Partition(clusters));

        // Coordinates of points
        LSExpression coordinatesArray = model.Array(coordinates);

        // Compute variances
        LSExpression[] variances = new LSExpression[k];
        for (int c = 0; c < k; c++) 
        {
            LSExpression cluster =  clusters[c];
            LSExpression size = model.Count(cluster);

            // Compute the centroid of the cluster
            LSExpression centroid = model.Array();
            for (int d = 0; d < nbDimensions; d++)
            {
                LSExpression coordinateSelector = model.LambdaFunction(
                    o => model.At(coordinatesArray, o, model.CreateConstant(d)));
                centroid.AddOperand(model.If(size == 0, 0,
                    model.Sum(cluster, coordinateSelector) / size));
            }

            // Compute the variance of the cluster
            LSExpression variance = model.Sum();
            for (int d = 0; d < nbDimensions; d++)
            {
                LSExpression dimensionVarianceSelector = model.LambdaFunction(
                    o => model.Pow(model.At(coordinatesArray, o, model.CreateConstant(d))
                                   - model.At(centroid, model.CreateConstant(d)), 2)
                    );
                LSExpression dimensionVariance = model.Sum(cluster, 
                    dimensionVarianceSelector);
                variance.AddOperand(dimensionVariance);
            }
            variances[c] = variance;
        }

        // Minimize the total variance
        obj = model.Sum(variances);
        model.Minimize(obj);

        model.Close();

        // Parameterizes the solver.
        localsolver.GetParam().SetTimeLimit(limit);

        localsolver.Solve();
    }

    // Writes the solution in a file in the following format:
    //  - objective value
    //  - k
    //  - for each cluster, a line with the elements in the cluster (separated by spaces)
    public void WriteSolution(string fileName)
    {
        using (StreamWriter output = new StreamWriter(fileName))
        {
            output.WriteLine(obj.GetDoubleValue());
            output.WriteLine(k);
            for (int c = 0; c < k; c++)
            {
                LSCollection clusterCollection = clusters[c].GetCollectionValue();
                for (int i = 0; i < clusterCollection.Count(); i++)
                    output.Write(clusterCollection[i] + " ");
                output.WriteLine();
            }
            output.Close();
        }
    }

    public static void Main(string[] args)
    {
        if (args.Length < 1)
        {
            Console.WriteLine("Usage: Kmeans inputFile [outputFile] [timeLimit] [k value]");
            Environment.Exit(1);
        }
        
        string instanceFile = args[0];
        string outputFile = args.Length > 1 ? args[1] : null;
        string strTimeLimit = args.Length > 2 ? args[2] : "5";
        string k = args.Length > 3 ? args[3] : "2";
        using (Kmeans model = new Kmeans(int.Parse(k)))
        {
            model.ReadInstance(instanceFile);
            model.Solve(int.Parse(strTimeLimit));
            if (outputFile != null)
            {
                model.WriteSolution(outputFile);
            }
        }
    }
}
Compilation / Execution (Windows)
javac Kmeans.java -cp %LS_HOME%\bin\localsolver.jar
java -cp %LS_HOME%\bin\localsolver.jar;. Kmeans instances\iris.dat
Compilation / Execution (Linux)
javac Kmeans.java -cp /opt/localsolver_10_0/bin/localsolver.jar
java -cp /opt/localsolver_10_0/bin/localsolver.jar:. Kmeans instances/iris.dat
/********** Kmeans.java **********/

import java.util.*;
import java.io.*;
import localsolver.*;

public class Kmeans {
    // Data properties
    private int nbObservations;
    private int nbDimensions;
    private int k;

    private double[][] coordinates;
    private String[] initialClusters;

    // Solver.
    private final LocalSolver localsolver;

    // Decisions.
    private LSExpression[] clusters;

    // Objective.
    private LSExpression obj;

    private Kmeans(LocalSolver localsolver) {
        this.localsolver = localsolver;
    }

    // Reads instance data
    private void readInstance(int k, String fileName) throws IOException {
        try (Scanner input = new Scanner(new File(fileName))) {
            input.useLocale(Locale.ROOT);

            nbObservations = input.nextInt();
            nbDimensions = input.nextInt();

            this.k = k;
            coordinates = new double[nbObservations][nbDimensions];
            initialClusters = new String[nbObservations];
            for (int o = 0; o < nbObservations; o++) {
                for (int d = 0; d < nbDimensions; d++) {
                    coordinates[o][d] = input.nextDouble();
                }
                initialClusters[o] = input.next();
            }
        }
    }

    private void solve(int limit) {
        // Declares the optimization model.
        LSModel model = localsolver.getModel();

        // Set decisions: clusters[c] represents the points in cluster c
        clusters = new LSExpression[k];
        for (int c = 0; c < k; c++) {
            clusters[c] = model.setVar(nbObservations);
        }

        // Each point must be in one cluster and one cluster only
        model.constraint(model.partition(clusters));

        // Coordinates of points
        LSExpression coordinatesArray = model.array(coordinates);
        
        // Compute variances
        LSExpression[] variances = new LSExpression[k];
        for (int c = 0; c < k; c++) {
            LSExpression cluster = clusters[c];
            LSExpression size = model.count(cluster);

            // Compute the centroid of the cluster
            LSExpression centroid = model.array();
            for (int d = 0; d < nbDimensions; d++) {
                LSExpression vExpr = model.createConstant(d);
                LSExpression coordinateSelector = model.lambdaFunction(
                    o -> model.at(coordinatesArray, o, vExpr));
                centroid.addOperand(model.iif(model.eq(size, 0), 0,
                    model.div(model.sum(cluster, coordinateSelector), size)));
            }

            // Compute the variance of the cluster
            LSExpression variance = model.sum();
            for (int d = 0; d < nbDimensions; d++) {
                LSExpression vExpr = model.createConstant(d);
                LSExpression dimensionVarianceSelector = model.lambdaFunction(
                    o -> model.pow(model.sub(model.at(coordinatesArray, o, vExpr),
                                   model.at(centroid, vExpr)), 2)
                    );
                LSExpression dimensionVariance = model.sum(cluster,
                    dimensionVarianceSelector);
                variance.addOperand(dimensionVariance);
            }
            variances[c] = variance;
        }

        // Minimize the total variance
        obj = model.sum(variances);
        model.minimize(obj);

        model.close();

        // Parameterizes the solver.
        localsolver.getParam().setTimeLimit(limit);

        localsolver.solve();
    }

    // Writes the solution in a file in the following format:
    // - objective value
    // - k
    // - for each cluster, a line with the elements in the cluster (separated by spaces)
    private void writeSolution(String fileName) throws IOException {
        try (PrintWriter output = new PrintWriter(fileName)) {
            output.println(obj.getDoubleValue());
            output.println(k);
            for (int c = 0; c < k; c++) {
                LSCollection clusterCollection = clusters[c].getCollectionValue();
                for (int i = 0; i < clusterCollection.count(); i++) {
                    output.print(clusterCollection.get(i) + " ");
                }
                output.println();
            }
        }
    }

    public static void main(String[] args) {
        if (args.length < 1) {
            System.err.println("Usage: java Kmeans inputFile [outputFile] [timeLimit] [k value]");
            System.exit(1);
        }

        String instanceFile = args[0];
        String outputFile = args.length > 1 ? args[1] : null;
        String strTimeLimit = args.length > 2 ? args[2] : "5";
        String k = args.length > 3 ? args[3] : "2";

        try (LocalSolver localsolver = new LocalSolver()) {
            Kmeans model = new Kmeans(localsolver);
            model.readInstance(Integer.parseInt(k), instanceFile);
            model.solve(Integer.parseInt(strTimeLimit));
            if (outputFile != null) {
                model.writeSolution(outputFile);
            }
        } catch (Exception ex) {
            System.err.println(ex);
            ex.printStackTrace();
            System.exit(1);
        }
    }
}