# K-means¶

## Principles learned¶

• Define the actual set of decision variables
• Define the float bounds from the data
• Use of non-linearities: minimizing a sum of “min”

## Problem¶

Given a sample of n observations and a set of quantitative variables, the goal is to partition these observations into k clusters. Clusters are defined by their center of gravity. Each observation belongs to the cluster with the nearest center of gravity. For more details, see Wikipedia.

## Data¶

The format of the data is as follows:

• 1st line: number of observations and number of variables
• For each observation: coordinate along each variable, and the actual cluster it belongs to

## Program¶

The only decision variables are `x[c][v]` to define the coordinate of the center of gravity of cluster c along variable v. There is no need for variables to determine which cluster an observation belongs to: the distance with the closest cluster is computed thanks to the min operator. The objective is to minimize the sum of these distances.

Execution:
localsolver kmeans.lsp inFileName=instances/irit.dat [lsTimeLimit=] [solFileName=] [k=val]
```/********** kmeans.lsp **********/
use io;

function input(){
usage = "\nUsage: localsolver kmeans.lsp "
+ "inFileName=inputFile [solFileName=outputFile] [lsTimeLimit=timeLimit] [k=value]\n";

if (inFileName == nil) throw usage;

if (k == nil){
k = 2;
}

for[o in 1..nbObservations]{
for[v in 1..nbVariables]{
}
}

// mini[v] and maxi[v] are respectively the minimum and maximum of the vth variable
for[v in 1..nbVariables]{
mini[v] = min[o in 1..nbObservations](M[o][v]);
maxi[v] = max[o in 1..nbObservations](M[o][v]);
}
}

/* Declares the optimization model. */
function model(){
// x[c][v] is the coordinate for the center of gravity of the cluster c along the variable v
x[1..k][v in 1..nbVariables] <- float(mini[v], maxi[v]);

// d[o] is the distance between observation o and the nearest cluster (its center of gravity)
d[o in 1..nbObservations] <- min[c in 1..k](sum[v in 1..nbVariables](pow(x[c][v] - M[o][v], 2)));

// Minimize the total distance
obj <- sum[o in 1..nbObservations](d[o]);
minimize obj;
}

/* Parameterizes the solver. */
function param(){
if(lsTimeLimit == nil) lsTimeLimit=5;
}

/* Writes the solution in a file in the following format:
*  - objective value
*  - k
*  - for each cluster, the coordinates along each variable
*/
function output(){
if(solFileName == nil) return;
local solFile = io.openWrite(solFileName);
solFile.println(obj.value);
solFile.println(k);
for[c in 1..k] {
for[v in 1..nbVariables] {
solFile.print(x[c][v].value + " ");
}
solFile.println();
}
}
```
Execution (Windows)
set PYTHONPATH=%LS_HOME%\bin\python27\
python kmeans.py instances\irit.dat
Execution (Linux)
export PYTHONPATH=/opt/localsolver_XXX/bin/python27/
python kmeans.py instances/irit.dat
```########## kmeans.py ##########

import localsolver
import sys

if len(sys.argv) < 2:
print ("Usage: python kmeans.py inputFile [outputFile] [timeLimit] [k value]")
sys.exit(1)

with open(filename) as f:
return [str(elem) for elem in f.read().split()]

with localsolver.LocalSolver() as ls:

#
#
if len(sys.argv) > 4: k = int(sys.argv[4])
else: k = 2

# Data properties
nbObservations = int(file_it.next())
nbVariables = int(file_it.next())

M = [None]*nbObservations
qualitative = [None]*nbObservations
for o in range(nbObservations):
M[o] = [None]*(nbVariables)
for v in range(nbVariables):
M[o][v] = float(file_it.next())
qualitative[o] = file_it.next()

# mini[v] and maxi[v] are respectively the minimum and maximum of the vth variable
mini = [min(M[o][v] for o in range(nbObservations)) for v in range(nbVariables)]
maxi = [max(M[o][v] for o in range(nbObservations)) for v in range(nbVariables)]

#
# Declares the optimization model
#
model = ls.model

# x[o][v] are decision variables equal to the center of gravity of the kth group for the jit variable
x = [[model.float(mini[v], maxi[v]) for v in range(nbVariables)] for c in range(k)]

# d[o] is the distance between observation o and the nearest cluster (its center of gravity)
d = [model.min(model.sum((x[c][v] - M[o][v])**2 for v in range(nbVariables)) for c in range(k))
for o in range(nbObservations)]

# Minimize the total distance
obj = model.sum(d)
model.minimize(obj)

model.close()

#
# Parameterizes the solver
#
if len(sys.argv) > 3: ls.create_phase().time_limit = int(sys.argv[3])
else: ls.create_phase().time_limit = 5

ls.solve()

#
# Writes the solution in a file in the following format:
#  - objective value
#  - k
#  - for each cluster, the coordinates along each variable
#
if len(sys.argv) > 2:
with open(sys.argv[2], 'w') as f:
f.write("%f\n" % obj.value)
f.write("%d\n" % k)
for c in range(k):
for v in range(nbVariables):
f.write("%f " % x[c][v].value)
f.write("\n")
```
Compilation / Execution (Windows)
cl /EHsc kmeans.cpp -I%LS_HOME%\include /link %LS_HOME%\bin\localsolver.dll.lib
kmeans instances\irit.dat
Compilation / Execution (Linux)
g++ kmeans.cpp -I/opt/localsolver_XXX/include -llocalsolver -lpthread -o kmeans
./kmeans instances/irit.dat
```//********* kmeans.cpp *********

#include <iostream>
#include <fstream>
#include <vector>
#include <limits>
#include "localsolver.h"

using namespace localsolver;
using namespace std;

class Kmeans {
public:
// Data properties
int nbObservations;
int nbVariables;
int k;

vector<vector<lsdouble> > M;
vector<string> clusters;

// mini[v] and maxi[v] are respectively the minimum and maximum of the vth variable
vector<lsdouble> mini;
vector<lsdouble> maxi;

// Solver.
LocalSolver localsolver;

// Decisions
vector<vector<LSExpression> > x;

// Objective
LSExpression obj;

// Constructor
Kmeans(int k) : k(k) { }

ifstream infile;
infile.open(fileName.c_str());

infile >> nbObservations;
infile >> nbVariables;

M.resize(nbObservations);
clusters.resize(nbObservations);
mini.resize(nbVariables, numeric_limits<lsdouble>::infinity());
maxi.resize(nbVariables, -numeric_limits<lsdouble>::infinity());
for (int o = 0; o < nbObservations; o++) {
M[o].resize(nbVariables);
for (int v = 0; v < nbVariables; v++) {
infile >> M[o][v];
if (M[o][v] < mini[v]) mini[v] = M[o][v];
if (M[o][v] > maxi[v]) maxi[v] = M[o][v];
}
infile >> clusters[o];
}
}

void solve(int limit) {
// Declares the optimization model.
LSModel model = localsolver.getModel();

// x[c][v] is the coordinate for the center of gravity of the cluster c along the variable v
x.resize(k);
for (int c = 0; c < k; c++) {
x[c].resize(nbVariables);
for (int v = 0; v < nbVariables; v++) {
x[c][v] = model.floatVar(mini[v], maxi[v]);
}
}

// d[o] is the distance between observation o and the nearest cluster (its center of gravity)
vector<LSExpression> d(nbObservations);
for(int o = 0; o < nbObservations; o++) {
d[o] = model.min();
for(int c = 0; c < k; c++) {
LSExpression distanceCluster = model.sum();
for(int v = 0; v < nbVariables; v++) {
distanceCluster += model.pow(x[c][v] - M[o][v], 2);
}
}
}

// Minimize the total distance
obj = model.sum(d.begin(), d.end());
model.minimize(obj);

model.close();

// Parameterizes the solver.
LSPhase phase = localsolver.createPhase();
phase.setTimeLimit(limit);
localsolver.solve();
}

// Writes the solution in a file in the following format:
//  - objective value
//  - k
//  - for each cluster, the coordinates along each variable
void writeSolution(const string& fileName) {
ofstream outfile;
outfile.open(fileName.c_str());

outfile << obj.getDoubleValue() << endl;
outfile << k << endl;
for(int c = 0; c < k; c++) {
for(int v = 0; v < nbVariables; v++) {
outfile << x[c][v].getDoubleValue() << " ";
}
outfile << endl;
}
}
};

int main(int argc, char** argv) {
if (argc < 2) {
cerr << "Usage: kmeans inputFile [outputFile] [timeLimit] [k value]" << endl;
return 1;
}

const char* instanceFile = argv[1];
const char* solFile = argc > 2 ? argv[2] : NULL;
const char* strTimeLimit = argc > 3 ? argv[3] : "5";
const char* k = argc > 4 ? argv[4] : "2";

try {
Kmeans model(atoi(k));
model.solve(atoi(strTimeLimit));
if(solFile != NULL) model.writeSolution(solFile);
return 0;
} catch (const exception& e){
cerr << "Error occurred: " << e.what() << endl;
return 1;
}
}
```
Compilation/Execution (Windows)
copy %LS_HOME%\bin\*net.dll .
csc Kmeans.cs /reference:localsolvernet.dll
Kmeans instances\irit.dat
```/********** kmeans.cs **********/

using System;
using System.IO;
using System.Globalization;
using localsolver;

public class Kmeans : IDisposable
{
// Data properties
int nbObservations;
int nbVariables;
int k;

double[][] M;
string[] clusters;

// mini[v] and maxi[v] are respectively the minimum and maximum of the vth variable
double[] mini;
double[] maxi;

// Solver.
LocalSolver localsolver;

// Decisions.
LSExpression[][] x;

// Objective.
LSExpression obj;

public Kmeans(int k)
{
localsolver = new LocalSolver();
this.k = k;
}

{
{

nbObservations = int.Parse(splittedLine[0]);
nbVariables = int.Parse(splittedLine[1]);

mini = new double[nbVariables];
maxi = new double[nbVariables];
for (int v = 0; v < nbVariables; v++)
{
mini[v] = double.PositiveInfinity;
maxi[v] = double.NegativeInfinity;
}
M = new double[nbObservations][];;
clusters = new string[nbObservations];
for (int o = 0; o < nbObservations; o++)
{
M[o] = new double[nbVariables];
for (int v = 0; v < nbVariables; v++)
{
M[o][v] = double.Parse(splittedLine[v], CultureInfo.InvariantCulture);
mini[v] = Math.Min(M[o][v], mini[v]);
maxi[v] = Math.Max(M[o][v], maxi[v]);
}
clusters[o] = splittedLine[nbVariables];
}
}
}

public void Dispose()
{
if (localsolver != null)
localsolver.Dispose();
}

public void Solve(int limit)
{
// Declares the optimization model
LSModel model = localsolver.GetModel();

// x[c][v] is the coordinate for the center of gravity of the cluster c along the variable v
x = new LSExpression[k][];
for (int o = 0; o < k; o++)
{
x[o] = new LSExpression[nbVariables];
for (int v = 0; v < nbVariables; v++)
{
x[o][v] = model.Float(mini[v], maxi[v]);
}
}

// d[o] is the distance between observation o and the nearest cluster (its center of gravity)
LSExpression[] d = new LSExpression[nbObservations];
for (int o = 0; o < nbObservations; o++)
{
d[o] = model.Min();
for (int c = 0; c < k; c++)
{
LSExpression distanceCluster = model.Sum();
for (int v = 0; v < nbVariables; v++)
{
}
}
}

// Minimize the total distance
obj = model.Sum(d);
model.Minimize(obj);

model.Close();

// Parameterizes the solver.
LSPhase phase = localsolver.CreatePhase();
phase.SetTimeLimit(limit);

localsolver.Solve();
}

// Writes the solution in a file in the following format:
//  - objective value
//  - k
//  - for each cluster, the coordinates along each variable
public void WriteSolution(string fileName)
{
using (StreamWriter output = new StreamWriter(fileName))
{
output.WriteLine(obj.GetDoubleValue());
output.WriteLine(k);
for (int c = 0; c < k; c++)
{
for (int v = 0; v < nbVariables; v++)
{
output.Write(x[c][v].GetDoubleValue() + " ");
}
output.WriteLine();
}
output.Close();
}
}

public static void Main(string[] args)
{
if (args.Length < 1)
{
Console.WriteLine("Usage: Kmeans inputFile [outputFile] [timeLimit] [k value]");
Environment.Exit(1);
}

string instanceFile = args[0];
string outputFile = args.Length > 1 ? args[1] : null;
string strTimeLimit = args.Length > 2 ? args[2] : "5";
string k = args.Length > 3 ? args[3] : "2";
using (Kmeans model = new Kmeans(int.Parse(k)))
{
model.Solve(int.Parse(strTimeLimit));
if (outputFile != null)
{
model.WriteSolution(outputFile);
}
}
}
}
```
Compilation / Execution (Windows)
javac Kmeans.java -cp %LS_HOME%\bin\localsolver.jar
java -cp %LS_HOME%\bin\localsolver.jar;. Kmeans instances\irit.dat
Compilation/Execution (Linux)
javac Kmeans.java -cp /opt/localsolver_XXX/bin/localsolver.jar
java -cp /opt/localsolver_XXX/bin/localsolver.jar:. Kmeans instances/irit.dat
```/********** Kmeans.java **********/

import java.util.*;
import java.io.*;
import localsolver.*;

public class Kmeans {
// Data properties
private int nbObservations;
private int nbVariables;
private int k;

private double[][] M;
private String[] clusters;

// mini[v] and maxi[v] are respectively the minimum and maximum of the vth variable
private double[] mini;
private double[] maxi;

// Solver.
private LocalSolver localsolver;

// Decisions.
private LSExpression[][] x;

// Objective.
private LSExpression obj;

private Kmeans(int k) {
this.k = k;
}

private void readInstance(String fileName) throws IOException {
try (Scanner input = new Scanner(new File(fileName))) {
input.useLocale(Locale.ROOT);

nbObservations = input.nextInt();
nbVariables = input.nextInt();

M = new double[nbObservations][nbVariables];
clusters = new String[nbObservations];
mini = new double[nbVariables];
maxi = new double[nbVariables];
Arrays.fill(mini, Double.POSITIVE_INFINITY);
Arrays.fill(maxi, Double.NEGATIVE_INFINITY);
for (int o = 0; o < nbObservations; o++) {
for (int v = 0; v < nbVariables; v++) {
M[o][v] = input.nextDouble();
mini[v] = Math.min(M[o][v], mini[v]);
maxi[v] = Math.max(M[o][v], maxi[v]);
}
clusters[o] = input.next();
}
}
}

private void solve(int limit) {
// Declares the optimization model.
localsolver = new LocalSolver();
LSModel model = localsolver.getModel();

// x[c][v] is the coordinate for the center of gravity of the cluster c along the variable v
x = new LSExpression[k][nbVariables];
for (int c = 0; c < k; c++) {
for (int v = 0; v < nbVariables; v++) {
x[c][v] = model.floatVar(mini[v], maxi[v]);
}
}

// d[o] is the distance between observation o and the nearest cluster (its center of gravity)
LSExpression[] d = new LSExpression[nbObservations];
for (int o = 0; o < nbObservations; o++) {
d[o] = model.min();
for (int c = 0; c < k; c++) {
LSExpression distanceCluster = model.sum();
for (int v = 0; v < nbVariables; v++) {
}
}
}

// Minimize the total distance
obj = model.sum(d);
model.minimize(obj);

model.close();

// Parameterizes the solver.
LSPhase phase = localsolver.createPhase();
phase.setTimeLimit(limit);

localsolver.solve();
}

// Writes the solution in a file in the following format:
// - objective value
// - k
// - for each cluster, the coordinates along each variable
private void writeSolution(String fileName) throws IOException {
try (PrintWriter output = new PrintWriter(fileName)) {
output.println(obj.getDoubleValue());
output.println(k);
for (int c = 0; c < k; c++) {
for (int v = 0; v < nbVariables; v++) {
output.print(x[c][v].getDoubleValue() + " ");
}
output.println();
}
}
}

public static void main(String[] args) {
if (args.length < 1) {
System.err.println("Usage: java Kmeans inputFile [outputFile] [timeLimit] [k value]");
System.exit(1);
}

String instanceFile = args[0];
String outputFile = args.length > 1 ? args[1] : null;
String strTimeLimit = args.length > 2 ? args[2] : "5";
String k = args.length > 3 ? args[3] : "2";

try {
Kmeans model = new Kmeans(Integer.parseInt(k));