K-means

Principles learned

  • Define the actual set of decision variables
  • Define the float bounds from the data
  • Use of non-linearities: minimizing a sum of “min”

Problem

../_images/kmeans.png

Given a sample of n observations and a set of quantitative variables, the goal is to partition these observations into k clusters. Clusters are defined by their center of gravity. Each observation belongs to the cluster with the nearest center of gravity. For more details, see Wikipedia.

Download the example

Data

The format of the data is as follows:

  • 1st line: number of observations and number of variables
  • For each observation: coordinate along each variable, and the actual cluster it belongs to

Program

The only decision variables are x[c][v] to define the coordinate of the center of gravity of cluster c along variable v. There is no need for variables to determine which cluster an observation belongs to: the distance with the closest cluster is computed thanks to the min operator. The objective is to minimize the sum of these distances.

Execution:
localsolver kmeans.lsp inFileName=instances/irit.dat [lsTimeLimit=] [solFileName=] [k=val]
/********** kmeans.lsp **********/
use io;

/* Reads instance data */
function input(){
    usage = "\nUsage: localsolver kmeans.lsp "
        + "inFileName=inputFile [solFileName=outputFile] [lsTimeLimit=timeLimit] [k=value]\n";

    if (inFileName == nil) throw usage;
    local f = io.openRead(inFileName);
    nbObservations = f.readInt();
    nbVariables = f.readInt();
    
    if (k == nil){
        k = 2;
    }
    
    for[o in 1..nbObservations]{
        for[v in 1..nbVariables]{
	        M[o][v] = f.readDouble();
	    }
        clusters[o] = f.readString();
    }
    
    // mini[v] and maxi[v] are respectively the minimum and maximum of the vth variable
    for[v in 1..nbVariables]{
        mini[v] = min[o in 1..nbObservations](M[o][v]);
        maxi[v] = max[o in 1..nbObservations](M[o][v]);
    }
}

/* Declares the optimization model. */
function model(){
    // x[c][v] is the coordinate for the center of gravity of the cluster c along the variable v
    x[1..k][v in 1..nbVariables] <- float(mini[v], maxi[v]);

    // d[o] is the distance between observation o and the nearest cluster (its center of gravity)
    d[o in 1..nbObservations] <- min[c in 1..k](sum[v in 1..nbVariables](pow(x[c][v] - M[o][v], 2)));

    // Minimize the total distance
    obj <- sum[o in 1..nbObservations](d[o]);
    minimize obj;
}

/* Parameterizes the solver. */
function param(){
    if(lsTimeLimit == nil) lsTimeLimit=5;
}

/* Writes the solution in a file in the following format:
 *  - objective value
 *  - k
 *  - for each cluster, the coordinates along each variable
 */
function output(){
    if(solFileName == nil) return;
	local solFile = io.openWrite(solFileName);
    solFile.println(obj.value);
    solFile.println(k);
    for[c in 1..k] {
        for[v in 1..nbVariables] {
            solFile.print(x[c][v].value + " ");
        }
        solFile.println();
    }
}
Execution (Windows)
set PYTHONPATH=%LS_HOME%\bin\python27\
python kmeans.py instances\irit.dat
Execution (Linux)
export PYTHONPATH=/opt/localsolver_XXX/bin/python27/
python kmeans.py instances/irit.dat
########## kmeans.py ##########

import localsolver
import sys

if len(sys.argv) < 2:
    print ("Usage: python kmeans.py inputFile [outputFile] [timeLimit] [k value]")
    sys.exit(1)

def read_elem(filename):
    with open(filename) as f:
        return [str(elem) for elem in f.read().split()]

with localsolver.LocalSolver() as ls:

    #
    # Reads instance data
    #
    if len(sys.argv) > 4: k = int(sys.argv[4])
    else: k = 2

    file_it = iter(read_elem(sys.argv[1]))
    # Data properties 
    nbObservations = int(file_it.next())
    nbVariables = int(file_it.next())

    M = [None]*nbObservations
    qualitative = [None]*nbObservations
    for o in range(nbObservations):
        M[o] = [None]*(nbVariables)
        for v in range(nbVariables):
            M[o][v] = float(file_it.next())
        qualitative[o] = file_it.next()

    # mini[v] and maxi[v] are respectively the minimum and maximum of the vth variable
    mini = [min(M[o][v] for o in range(nbObservations)) for v in range(nbVariables)]
    maxi = [max(M[o][v] for o in range(nbObservations)) for v in range(nbVariables)]

    #
    # Declares the optimization model
    #
    model = ls.model

    # x[o][v] are decision variables equal to the center of gravity of the kth group for the jit variable
    x = [[model.float(mini[v], maxi[v]) for v in range(nbVariables)] for c in range(k)]

    # d[o] is the distance between observation o and the nearest cluster (its center of gravity)
    d = [model.min(model.sum((x[c][v] - M[o][v])**2 for v in range(nbVariables)) for c in range(k))
        for o in range(nbObservations)]

    # Minimize the total distance
    obj = model.sum(d)
    model.minimize(obj)

    model.close()

    #
    # Parameterizes the solver
    #
    ls.param.nb_threads = 2
    if len(sys.argv) > 3: ls.create_phase().time_limit = int(sys.argv[3])
    else: ls.create_phase().time_limit = 5
  
    ls.solve()

    #
    # Writes the solution in a file in the following format:
    #  - objective value
    #  - k
    #  - for each cluster, the coordinates along each variable
    #
    if len(sys.argv) > 2:
        with open(sys.argv[2], 'w') as f:
            f.write("%f\n" % obj.value)
            f.write("%d\n" % k)
            for c in range(k):
                for v in range(nbVariables):
                    f.write("%f " % x[c][v].value)
                f.write("\n")
Compilation / Execution (Windows)
cl /EHsc kmeans.cpp -I%LS_HOME%\include /link %LS_HOME%\bin\localsolver.dll.lib
kmeans instances\irit.dat
Compilation / Execution (Linux)
g++ kmeans.cpp -I/opt/localsolver_XXX/include -llocalsolver -lpthread -o kmeans
./kmeans instances/irit.dat
//********* kmeans.cpp *********

#include <iostream>
#include <fstream>
#include <vector>
#include <limits>
#include "localsolver.h"

using namespace localsolver;
using namespace std;

class Kmeans {
public:
    // Data properties 
    int nbObservations;
    int nbVariables;
    int k;

    vector<vector<lsdouble> > M;
    vector<string> clusters;

    // mini[v] and maxi[v] are respectively the minimum and maximum of the vth variable
    vector<lsdouble> mini;
    vector<lsdouble> maxi;

    // Solver. 
    LocalSolver localsolver;

    // Decisions 
    vector<vector<LSExpression> > x;

    // Objective 
    LSExpression obj;

    // Constructor 
    Kmeans(int k) : k(k) { }

    // Reads instance data 
    void readInstance(const string& fileName) {
        ifstream infile;
        infile.exceptions(ifstream::failbit | ifstream::badbit);
        infile.open(fileName.c_str());

        infile >> nbObservations;
        infile >> nbVariables;

        M.resize(nbObservations);
        clusters.resize(nbObservations);
        mini.resize(nbVariables, numeric_limits<lsdouble>::infinity());
        maxi.resize(nbVariables, -numeric_limits<lsdouble>::infinity());
        for (int o = 0; o < nbObservations; o++) {
            M[o].resize(nbVariables);
            for (int v = 0; v < nbVariables; v++) {
                infile >> M[o][v];
                if (M[o][v] < mini[v]) mini[v] = M[o][v];
                if (M[o][v] > maxi[v]) maxi[v] = M[o][v];
            }
            infile >> clusters[o];
        }
    }

    void solve(int limit) {
        // Declares the optimization model. 
        LSModel model = localsolver.getModel();

        // x[c][v] is the coordinate for the center of gravity of the cluster c along the variable v
        x.resize(k);
        for (int c = 0; c < k; c++) {
            x[c].resize(nbVariables);
            for (int v = 0; v < nbVariables; v++) {
                x[c][v] = model.floatVar(mini[v], maxi[v]);
            }
        }

        // d[o] is the distance between observation o and the nearest cluster (its center of gravity)
        vector<LSExpression> d(nbObservations);            
        for(int o = 0; o < nbObservations; o++) {
            d[o] = model.min();
            for(int c = 0; c < k; c++) {
                LSExpression distanceCluster = model.sum();
                for(int v = 0; v < nbVariables; v++) {
                    distanceCluster += model.pow(x[c][v] - M[o][v], 2);
                }
                d[o].addOperand(distanceCluster);
            }
        }

        // Minimize the total distance
        obj = model.sum(d.begin(), d.end());
        model.minimize(obj);

        model.close();

        // Parameterizes the solver. 
        LSPhase phase = localsolver.createPhase();
        phase.setTimeLimit(limit);
        localsolver.solve(); 
    }

    // Writes the solution in a file in the following format:
    //  - objective value
    //  - k
    //  - for each cluster, the coordinates along each variable
    void writeSolution(const string& fileName) {
        ofstream outfile;
        outfile.exceptions(ofstream::failbit | ofstream::badbit);
        outfile.open(fileName.c_str());

        outfile << obj.getDoubleValue() << endl;
        outfile << k << endl;
        for(int c = 0; c < k; c++) {
            for(int v = 0; v < nbVariables; v++) {
                outfile << x[c][v].getDoubleValue() << " ";
            }
            outfile << endl;
        }
    }
};
    
int main(int argc, char** argv) {
    if (argc < 2) {
        cerr << "Usage: kmeans inputFile [outputFile] [timeLimit] [k value]" << endl;
        return 1;
    }

    const char* instanceFile = argv[1];
    const char* solFile = argc > 2 ? argv[2] : NULL;
    const char* strTimeLimit = argc > 3 ? argv[3] : "5";
    const char* k = argc > 4 ? argv[4] : "2";

    try {
        Kmeans model(atoi(k));
        model.readInstance(instanceFile);
        model.solve(atoi(strTimeLimit));
        if(solFile != NULL) model.writeSolution(solFile);
        return 0;
    } catch (const exception& e){
        cerr << "Error occurred: " << e.what() << endl;
        return 1;
    }
}
Compilation/Execution (Windows)
copy %LS_HOME%\bin\*net.dll .
csc Kmeans.cs /reference:localsolvernet.dll
Kmeans instances\irit.dat
/********** kmeans.cs **********/

using System;
using System.IO;
using System.Globalization;
using localsolver;

public class Kmeans : IDisposable
{
    // Data properties
    int nbObservations;
    int nbVariables;
    int k;

    double[][] M;
    string[] clusters;

    // mini[v] and maxi[v] are respectively the minimum and maximum of the vth variable
    double[] mini;
    double[] maxi;

    // Solver.
    LocalSolver localsolver;

    // Decisions.
    LSExpression[][] x;

    // Objective.
    LSExpression obj;

    public Kmeans(int k)
    {
        localsolver = new LocalSolver();
        this.k = k;
    }

    // Reads instance data 
    public void ReadInstance(string fileName)
    {
        using (StreamReader input = new StreamReader(fileName))
        {
            string[] splittedLine = input.ReadLine().Split();

            nbObservations = int.Parse(splittedLine[0]);
            nbVariables = int.Parse(splittedLine[1]);

            mini = new double[nbVariables];
            maxi = new double[nbVariables];
            for (int v = 0; v < nbVariables; v++)
            {
                mini[v] = double.PositiveInfinity;
                maxi[v] = double.NegativeInfinity;
            }
            M = new double[nbObservations][];;
            clusters = new string[nbObservations];
            for (int o = 0; o < nbObservations; o++)
            {
                splittedLine = input.ReadLine().Split();
                M[o] = new double[nbVariables];
                for (int v = 0; v < nbVariables; v++)
                {
                    M[o][v] = double.Parse(splittedLine[v], CultureInfo.InvariantCulture);
                    mini[v] = Math.Min(M[o][v], mini[v]);
                    maxi[v] = Math.Max(M[o][v], maxi[v]);
                }
                clusters[o] = splittedLine[nbVariables];
            }
        }
    }

    public void Dispose()
    {
        if (localsolver != null)
            localsolver.Dispose();
    }

    public void Solve(int limit)
    {
        // Declares the optimization model
        LSModel model = localsolver.GetModel();

        // x[c][v] is the coordinate for the center of gravity of the cluster c along the variable v
        x = new LSExpression[k][];
        for (int o = 0; o < k; o++)
        {
            x[o] = new LSExpression[nbVariables];
            for (int v = 0; v < nbVariables; v++)
            {
                x[o][v] = model.Float(mini[v], maxi[v]);
            }
        }

        // d[o] is the distance between observation o and the nearest cluster (its center of gravity)
        LSExpression[] d = new LSExpression[nbObservations];
        for (int o = 0; o < nbObservations; o++)
        {
            d[o] = model.Min();
            for (int c = 0; c < k; c++)
            {
                LSExpression distanceCluster = model.Sum();
                for (int v = 0; v < nbVariables; v++)
                {
                    distanceCluster.AddOperand(model.Pow(x[c][v] - M[o][v], 2));
                }
                d[o].AddOperand(distanceCluster);
            }
        }

        // Minimize the total distance
        obj = model.Sum(d);
        model.Minimize(obj);

        model.Close();

        // Parameterizes the solver.
        LSPhase phase = localsolver.CreatePhase();
        phase.SetTimeLimit(limit);

        localsolver.Solve();
    }

    // Writes the solution in a file in the following format:
    //  - objective value
    //  - k
    //  - for each cluster, the coordinates along each variable
    public void WriteSolution(string fileName)
    {
        using (StreamWriter output = new StreamWriter(fileName))
        {
            output.WriteLine(obj.GetDoubleValue());
            output.WriteLine(k);
            for (int c = 0; c < k; c++)
            {
                for (int v = 0; v < nbVariables; v++)
                {
                    output.Write(x[c][v].GetDoubleValue() + " ");
                }
                output.WriteLine();
            }
            output.Close();
        }
    }

    public static void Main(string[] args)
    {
        if (args.Length < 1)
        {
            Console.WriteLine("Usage: Kmeans inputFile [outputFile] [timeLimit] [k value]");
            Environment.Exit(1);
        }
        
        string instanceFile = args[0];
        string outputFile = args.Length > 1 ? args[1] : null;
        string strTimeLimit = args.Length > 2 ? args[2] : "5";
        string k = args.Length > 3 ? args[3] : "2";
        using (Kmeans model = new Kmeans(int.Parse(k)))
        {
            model.ReadInstance(instanceFile);
            model.Solve(int.Parse(strTimeLimit));
            if (outputFile != null)
            {
                model.WriteSolution(outputFile);
            }
        }
    }
}
Compilation / Execution (Windows)
javac Kmeans.java -cp %LS_HOME%\bin\localsolver.jar
java -cp %LS_HOME%\bin\localsolver.jar;. Kmeans instances\irit.dat
Compilation/Execution (Linux)
javac Kmeans.java -cp /opt/localsolver_XXX/bin/localsolver.jar
java -cp /opt/localsolver_XXX/bin/localsolver.jar:. Kmeans instances/irit.dat
/********** Kmeans.java **********/

import java.util.*;
import java.io.*;
import localsolver.*;

public class Kmeans {
    // Data properties
    private int nbObservations;
    private int nbVariables;
    private int k;

    private double[][] M;
    private String[] clusters;

    // mini[v] and maxi[v] are respectively the minimum and maximum of the vth variable
    private double[] mini;
    private double[] maxi;

    // Solver.
    private LocalSolver localsolver;

    // Decisions.
    private LSExpression[][] x;

    // Objective.
    private LSExpression obj;

    private Kmeans(int k) {
        this.k = k;
    }

    // Reads instance data
    private void readInstance(String fileName) throws IOException {
        try (Scanner input = new Scanner(new File(fileName))) {
            input.useLocale(Locale.ROOT);

            nbObservations = input.nextInt();
            nbVariables = input.nextInt();

            M = new double[nbObservations][nbVariables];
            clusters = new String[nbObservations];
            mini = new double[nbVariables];
            maxi = new double[nbVariables];
            Arrays.fill(mini, Double.POSITIVE_INFINITY);
            Arrays.fill(maxi, Double.NEGATIVE_INFINITY);
            for (int o = 0; o < nbObservations; o++) {
                for (int v = 0; v < nbVariables; v++) {
                    M[o][v] = input.nextDouble();
                    mini[v] = Math.min(M[o][v], mini[v]);
                    maxi[v] = Math.max(M[o][v], maxi[v]);
                }
                clusters[o] = input.next();
            }
        }
    }

    private void solve(int limit) {
        // Declares the optimization model.
        localsolver = new LocalSolver();
        LSModel model = localsolver.getModel();

        // x[c][v] is the coordinate for the center of gravity of the cluster c along the variable v
        x = new LSExpression[k][nbVariables];
        for (int c = 0; c < k; c++) {
            for (int v = 0; v < nbVariables; v++) {
                x[c][v] = model.floatVar(mini[v], maxi[v]);
            }
        }

        // d[o] is the distance between observation o and the nearest cluster (its center of gravity)
        LSExpression[] d = new LSExpression[nbObservations];
        for (int o = 0; o < nbObservations; o++) {
            d[o] = model.min();
            for (int c = 0; c < k; c++) {
                LSExpression distanceCluster = model.sum();
                for (int v = 0; v < nbVariables; v++) {
                    distanceCluster.addOperand(model.pow(model.sub(x[c][v], M[o][v]), 2));
                }
                d[o].addOperand(distanceCluster);
            }
        }

        // Minimize the total distance
        obj = model.sum(d);
        model.minimize(obj);

        model.close();

        // Parameterizes the solver.
        LSPhase phase = localsolver.createPhase();
        phase.setTimeLimit(limit);

        localsolver.solve();
    }

    // Writes the solution in a file in the following format:
    // - objective value
    // - k
    // - for each cluster, the coordinates along each variable
    private void writeSolution(String fileName) throws IOException {
        try (PrintWriter output = new PrintWriter(fileName)) {
            output.println(obj.getDoubleValue());
            output.println(k);
            for (int c = 0; c < k; c++) {
                for (int v = 0; v < nbVariables; v++) {
                    output.print(x[c][v].getDoubleValue() + " ");
                }
                output.println();
            }
        }
    }

    public static void main(String[] args) {
        if (args.length < 1) {
            System.err.println("Usage: java Kmeans inputFile [outputFile] [timeLimit] [k value]");
            System.exit(1);
        }

        String instanceFile = args[0];
        String outputFile = args.length > 1 ? args[1] : null;
        String strTimeLimit = args.length > 2 ? args[2] : "5";
        String k = args.length > 3 ? args[3] : "2";

        try {
            Kmeans model = new Kmeans(Integer.parseInt(k));
            model.readInstance(instanceFile);
            model.solve(Integer.parseInt(strTimeLimit));
            if (outputFile != null) {
                model.writeSolution(outputFile);
            }
        } catch (Exception ex) {
            System.err.println(ex);
            ex.printStackTrace();
            System.exit(1);
        }
    }
}