1 K-均值聚類算法的基本思想 K-均值聚類算法是著名的劃分聚類分割方法。劃分方法的基本思想是:給定一個有N個元組或者紀錄的數據集,分裂法將構造K個分組,每一個分組就代表一個聚類,K<N。而且這K個分組滿足下列條件:(1) 每一個分組至少包含一個數據紀錄;(2) 每一個數據紀錄屬于且僅屬于一個分組。
K-均值聚類算法是著名的劃分聚類分割方法。劃分方法的基本思想是:給定一個有N個元組或者紀錄的數據集,分裂法將構造K個分組,每一個分組就代表一個聚類,K<N。 K-means算法的工作原理:算法首先隨機從數據集中選取 K個點作為初始聚類中心,然后計算各個樣本到聚類中心的距離,把樣本歸到離它最近的那個聚類中心所在的類。計算新形成的每一個聚類的數據對象的平均值來得到新的聚類中心,如果相鄰兩次的聚類中心沒有任何變化,說明樣本調整結束,聚類準則函數已經收斂。本算法的一個特點是在每次迭代中都要考察每個樣本的分類是否正確。若不正確,就要調整,在全部樣本調整完后,再修改聚類中心,進入下一次迭代。這個過程將不斷重復直到滿足某個終止條件,終止條件可以是以下任何一個: (1)沒有對象被重新分配給不同的聚類。 (2)聚類中心不再發(fā)生變化。 (3)誤差平方和局部最小。 K-means聚類算法的一般步驟: (1)從 n個數據對象任意選擇 k 個對象作為初始聚類中心; (2)循環(huán)(3)到(4)直到每個聚類不再發(fā)生變化為止; (3)根據每個聚類對象的均值(中心對象),計算每個對象與這些中心對象的距離;并根據最小距離重新對相應對象進行劃分; (4)重新計算每個(有變化)聚類的均值(中心對象),直到聚類中心不再變化。這種劃分使得誤差平方和準則函數最小。 K-均值聚類法的缺點: (1)在 K-means 算法中 K 是事先給定的,這個 K 值的選定是非常難以估計的。 (2)在 K-means 算法中,首先需要根據初始聚類中心來確定一個初始劃分,然后對初始劃分進行優(yōu)化。 (3) K-means算法需要不斷地進行樣本分類調整,不斷地計算調整后的新的聚類中心,因此當數據量非常大時,算法的時間開銷是非常大的。 (4)K-means算法對一些離散點和初始k值敏感,不同的初始值對同樣的數據樣本可能得到不同的結果。
CV_IMPL int
cvKMeans2( const CvArr* _samples, int cluster_count, CvArr* _labels,
           CvTermCriteria termcrit, int attempts, CvRNG* rng,
           int flags, CvArr* _centers, double* _compactness )
_samples:輸入樣本的浮點矩陣,每個樣本一行,如對彩色圖像進行聚類,每個通道一行,CV_32FC3
cluster_count:所給定的聚類數目
_labels:輸出整數向量:每個樣本對應的類別標識,其范圍為0- (cluster_count-1),必須滿足以下條件:
cv::Mat data = cv::cvarrToMat(_samples),labels = cv::cvarrToMat(_labels);
CV_Assert(labels.isContinuous() && labels.type() == CV_32S &&
(labels.cols == 1 || labels.rows == 1)&&
labels.cols + labels.rows - 1 ==data.rows );
termcrit:指定聚類的最大迭代次數和/或精度(兩次迭代引起的聚類中心的移動距離),其執(zhí)行 k-means 算法搜索 cluster_count 個類別的中心并對樣本進行分類,輸出 labels(i) 為樣本i的類別標識。其中CvTermCriteria為OpenCV中的迭代算法的終止準則,其結構如下:
#define CV_TERMCRIT_ITER 1
#define CV_TERMCRIT_NUMBER CV_TERMCRIT_ITER
#define CV_TERMCRIT_EPS 2
typedef struct CvTermCriteria
{
int type; int max_iter; double epsilon;
} CvTermCriteria;
max_iter:最大迭代次數。 epsilon:結果的精確性 。
attempts: 指定使用不同初始聚類中心重復運行算法的次數,函數返回緊湊度(誤差平方和)最小的那一次的結果。
flags: 與labels和centers相關
_centers: 輸出聚類中心,可以不用設置輸出聚類中心,但如果想輸出聚類中心必須滿足以下條件:
CV_Assert(!centers.empty() );
CV_Assert( centers.rows == cluster_count );
CV_Assert( centers.cols ==data.cols );
CV_Assert( centers.depth() == data.depth() );
聚類中心的獲得方式:(以三類為例)
double cent0 = centers->data.fl[0];
double cent1 = centers->data.fl[1];
double cent2 = centers->data.fl[2];
// Legacy C-API wrapper around cv::kmeans.
// _samples:     input sample matrix, one sample per row (CV_32F, possibly multi-channel).
// cluster_count: number of clusters to search for.
// _labels:      output CV_32S vector, one cluster index per sample.
// termcrit:     iteration-count and/or epsilon termination criteria.
// attempts:     number of restarts with different initial centers; best result is kept.
// flags:        KMEANS_* initialization flags.
// _centers:     optional output matrix of cluster centers (may be NULL).
// _compactness: optional output for the sum of squared distances (may be NULL).
// Returns 1 on success (asserts on invalid arguments).
CV_IMPL int
cvKMeans2( const CvArr* _samples, int cluster_count, CvArr* _labels,
           CvTermCriteria termcrit, int attempts, CvRNG*,
           int flags, CvArr* _centers, double* _compactness )
{
    cv::Mat data = cv::cvarrToMat(_samples), labels = cv::cvarrToMat(_labels), centers;
    if( _centers )
    {
        centers = cv::cvarrToMat(_centers);

        // Flatten multi-channel data so every sample/center is one row of scalars.
        centers = centers.reshape(1);
        data = data.reshape(1);

        // The caller-supplied center matrix must match the requested clustering.
        CV_Assert( !centers.empty() );
        CV_Assert( centers.rows == cluster_count );
        CV_Assert( centers.cols == data.cols );
        CV_Assert( centers.depth() == data.depth() );
    }

    // Labels must be a continuous CV_32S row or column with one entry per sample.
    CV_Assert( labels.isContinuous() && labels.type() == CV_32S &&
               (labels.cols == 1 || labels.rows == 1) &&
               labels.cols + labels.rows - 1 == data.rows );

    // Delegate to the C++ implementation; request centers only when the caller
    // actually passed an output matrix.
    double compactness = cv::kmeans(data, cluster_count, labels, termcrit, attempts,
                                    flags, _centers ? cv::_OutputArray(centers)
                                                    : cv::_OutputArray() );
    if( _compactness )
        *_compactness = compactness;
    return 1;
}
double cv::kmeans( InputArray_data, int K,
InputOutputArray_bestLabels,
TermCriteriacriteria, intattempts,
intflags, OutputArray_centers )
{
const int SPP_TRIALS =3;
Mat data = _data.getMat();
// 判斷data是否為行向量
bool isrow = data.rows == 1 && data.channels() > 1;
int N = !isrow ? data.rows : data.cols;
int dims = (!isrow? data.cols: 1)*data.channels();
int type = data.depth();
attempts= std::max(attempts, 1);
CV_Assert(data.dims<= 2 && type == CV_32F && K> 0 );
CV_Assert(N >= K);
_bestLabels.create(N, 1, CV_32S, -1, true);
Mat _labels, best_labels = _bestLabels.getMat();
// 使用已初始化的labels
if( flags & CV_KMEANS_USE_INITIAL_LABELS)
{
CV_Assert((best_labels.cols== 1 || best_labels.rows== 1) &&
best_labels.cols*best_labels.rows == N&&
best_labels.type() == CV_32S&&
best_labels.isContinuous());
best_labels.copyTo(_labels);
}
else
{
if( !((best_labels.cols== 1 || best_labels.rows== 1) &&
best_labels.cols*best_labels.rows == N&&
best_labels.type() == CV_32S&&
best_labels.isContinuous()))
best_labels.create(N, 1, CV_32S);
_labels.create(best_labels.size(), best_labels.type());
}
int*
labels = _labels.ptr
Mat centers(K, dims, type), old_centers(K, dims, type), temp(1, dims, type);
vector
vector
Vec2f* box = &_box[0];
double best_compactness = DBL_MAX,compactness = 0;
RNG&rng = theRNG();
int a, iter, i, j, k;
// 對終止條件進行修改
if( criteria.type& TermCriteria::EPS)
criteria.epsilon = std::max(criteria.epsilon, 0.);
else
criteria.epsilon = FLT_EPSILON;
criteria.epsilon *= criteria.epsilon;
if( criteria.type& TermCriteria::COUNT)
criteria.maxCount = std::min(std::max(criteria.maxCount, 2), 100);
else
criteria.maxCount = 100;
// 聚類數目為1類的時候
if( K == 1 )
{
attempts= 1;
criteria.maxCount = 2;
}
const
float* sample =
data.ptr
for( j = 0; j < dims; j++ )
box[j] = Vec2f(sample[j], sample[j]);
for( i = 1; i < N; i++ )
{
sample=
data.ptr
for( j = 0; j < dims; j++ )
{
floatv = sample[j];
box[j][0] = std::min(box[j][0], v);
box[j][1] = std::max(box[j][1], v);
}
}
for( a = 0; a < attempts; a++ )
{
double max_center_shift = DBL_MAX;
for( iter = 0;; )
{
swap(centers, old_centers);
/*enum
{
KMEANS_RANDOM_CENTERS=0, // Chooses random centers for k-Meansinitialization
KMEANS_PP_CENTERS=2, // Usesk-Means++ algorithm for initialization
KMEANS_USE_INITIAL_LABELS=1 // Uses the user-provided labels for K-Meansinitialization
};*/
if(iter == 0 && (a > 0 || !(flags& KMEANS_USE_INITIAL_LABELS)) )
{
if(flags & KMEANS_PP_CENTERS)
generateCentersPP(data, centers, K, rng, SPP_TRIALS);
else
{
for(k = 0; k< K; k++)
generateRandomCenter(_box,
centers.ptr
}
}
else
{
if(iter == 0 && a == 0 && (flags& KMEANS_USE_INITIAL_LABELS) )
{
for(i = 0; i< N; i++)
CV_Assert( (unsigned)labels[i] <(unsigned)K);
}
//compute centers
centers= Scalar(0);
for(k = 0; k< K; k++)
counters[k] = 0;
for(i = 0; i< N; i++)
{
sample=
data.ptr
k= labels[i];
float*center =
centers.ptr
j=0;
#ifCV_ENABLE_UNROLLED
for(;j <= dims- 4; j += 4 )
{
float t0 = center[j] + sample[j];
float t1 = center[j+1] + sample[j+1];
center[j] = t0;
center[j+1] = t1;
t0 = center[j+2] + sample[j+2];
t1 = center[j+3] + sample[j+3];
center[j+2] = t0;
center[j+3] = t1;
}
#endif
for(; j < dims;j++ )
center[j] += sample[j];
counters[k]++;
}
if(iter > 0 )
max_center_shift= 0;
for(k = 0; k< K; k++)
{
if(counters[k]!= 0 )
continue;
// if somecluster appeared to be empty then:
// 1. find the biggest cluster
// 2. find the farthest from the center pointin the biggest cluster
// 3. exclude the farthest point from thebiggest cluster and form a new 1-point cluster.
intmax_k = 0;
for(int k1 = 1; k1 < K; k1++ )
{
if( counters[max_k] < counters[k1] )
max_k = k1;
}
doublemax_dist = 0;
intfarthest_i = -1;
float*new_center =
centers.ptr
float*old_center =
centers.ptr
float*_old_center =
temp.ptr
floatscale = 1.f/counters[max_k];
for(j = 0; j< dims; j++)
_old_center[j] = old_center[j]*scale;
for(i = 0; i< N; i++)
{
if(labels[i]!= max_k )
continue;
sample =
data.ptr
double dist = normL2Sqr_(sample,_old_center, dims);
if( max_dist <= dist )
{
max_dist = dist;
farthest_i = i;
}
}
counters[max_k]--;
counters[k]++;
labels[farthest_i] = k;
sample=
data.ptr
for(j = 0; j< dims; j++)
{
old_center[j] -= sample[j];
new_center[j] += sample[j];
}
}
for(k = 0; k< K; k++)
{
float*center =
centers.ptr
CV_Assert(counters[k]!= 0 );
floatscale = 1.f/counters[k];
for(j = 0; j< dims; j++)
center[j] *= scale;
if(iter > 0 )
{
double dist = 0;
const
float* old_center =
old_centers.ptr
for( j = 0; j < dims; j++ )
{
double t = center[j] - old_center[j];
dist += t*t;
}
max_center_shift= std::max(max_center_shift, dist);
}
}
}
if(++iter == MAX(criteria.maxCount,2) || max_center_shift <= criteria.epsilon)
break;
// assignlabels
Matdists(1, N,CV_64F);
double*dist =
dists.ptr
parallel_for_(Range(0, N),
KMeansDistanceComputer(dist,labels, data,centers));
compactness= 0;
for(i = 0; i< N; i++)
{
compactness+= dist[i];
}
}
if( compactness < best_compactness)
{
best_compactness= compactness;
if(_centers.needed())
centers.copyTo(_centers);
_labels.copyTo(best_labels);
}
}
return best_compactness;
}
//灰度圖像聚類分析
// Segment a single-channel (grayscale) image by k-means clustering of pixel
// intensities. pImg: input 1-channel image; pResult: output 1-channel image of
// the same size; sortFlag: 0 = write cluster labels as evenly spaced gray levels
// without sorting. Returns TRUE on success, FALSE otherwise.
BOOL GrayImageSegmentByKMeans2(const IplImage* pImg, IplImage* pResult, int sortFlag)
{
    assert(pImg != NULL && pImg->nChannels == 1);

    // Sample matrix: one 32-bit float per pixel (CV_32FC1 for grayscale).
    CvMat* samples = cvCreateMat((pImg->width) * (pImg->height), 1, CV_32FC1);
    // Label matrix: one 32-bit integer cluster id per pixel (CV_32SC1).
    CvMat* clusters = cvCreateMat((pImg->width) * (pImg->height), 1, CV_32SC1);
    // Cluster-center matrix (nClusters is defined elsewhere in this file).
    CvMat* centers = cvCreateMat(nClusters, 1, CV_32FC1);

    // Copy the image into the sample matrix, column by column.
    {
        int k = 0;
        CvScalar s;
        for (int i = 0; i < pImg->width; i++)
        {
            for (int j = 0; j < pImg->height; j++)
            {
                s.val[0] = (float)cvGet2D(pImg, j, i).val[0];
                cvSet2D(samples, k++, 0, s);
            }
        }
    }

    // Cluster: at most 100 iterations or a center shift below 1.0.
    cvKMeans2(samples, nClusters, clusters,
              cvTermCriteria(CV_TERMCRIT_ITER + CV_TERMCRIT_EPS, 100, 1.0),
              1, 0, 0, centers);

    BOOL ok = FALSE;
    if (sortFlag == 0)  // unsorted output: evenly spaced gray level per cluster
    {
        int k = 0;
        int val = 0;
        float step = 255 / ((float)nClusters - 1);
        CvScalar s;
        for (int i = 0; i < pImg->width; i++)
        {
            for (int j = 0; j < pImg->height; j++)
            {
                val = (int)clusters->data.i[k++];
                s.val[0] = 255 - val * step;  // distinct gray value per cluster
                cvSet2D(pResult, j, i, s);    // write the output pixel
            }
        }
        ok = TRUE;
    }
    // NOTE(review): the sorted-output branch (sortFlag != 0) was lost in this copy
    // of the source; we fail explicitly instead of falling off the end. Restore it
    // from the original before relying on sortFlag != 0.

    // Release every temporary matrix on all paths (the original leaked all three).
    cvReleaseMat(&samples);
    cvReleaseMat(&clusters);
    cvReleaseMat(&centers);
    return ok;
}
// Segment a 3-channel color image: cluster the per-pixel 3-channel values with
// k-means and write one gray level per cluster into the 1-channel result image.
// img: input 3-channel image; pResult: output 1-channel image of the same size;
// sortFlag: currently unused (kept for interface parity with the gray version).
// Returns TRUE on success.
BOOL ColorImageSegmentByKMeans2(const IplImage* img, IplImage* pResult, int sortFlag)
{
    assert(img != NULL && pResult != NULL);
    assert(img->nChannels == 3 && pResult->nChannels == 1);

    int i, j;
    // Sample matrix: one CV_32FC3 entry per pixel (three float channels).
    CvMat* samples = cvCreateMat((img->width) * (img->height), 1, CV_32FC3);
    // Label matrix: one 32-bit integer cluster id per pixel (CV_32SC1).
    CvMat* clusters = cvCreateMat((img->width) * (img->height), 1, CV_32SC1);

    int k = 0;
    for (i = 0; i < img->width; i++)
    {
        for (j = 0; j < img->height; j++)
        {
            CvScalar s;
            // Read the three channel values of the pixel.
            s.val[0] = (float)cvGet2D(img, j, i).val[0];
            s.val[1] = (float)cvGet2D(img, j, i).val[1];
            s.val[2] = (float)cvGet2D(img, j, i).val[2];
            cvSet2D(samples, k++, 0, s);  // append the pixel to the sample matrix
        }
    }

    // Cluster with at most 100 iterations (nClusters is defined elsewhere).
    cvKMeans2(samples, nClusters, clusters,
              cvTermCriteria(CV_TERMCRIT_ITER, 100, 1.0));

    k = 0;
    int val = 0;
    // Float division: the original integer division 255/(nClusters-1) truncated
    // the step; this matches the grayscale version's computation.
    float step = 255 / ((float)nClusters - 1);
    for (i = 0; i < img->width; i++)
    {
        for (j = 0; j < img->height; j++)
        {
            val = (int)clusters->data.i[k++];
            CvScalar s;
            s.val[0] = 255 - val * step;  // distinct gray value per cluster
            cvSet2D(pResult, j, i, s);    // write the output pixel
        }
    }

    cvReleaseMat(&samples);
    cvReleaseMat(&clusters);
    return TRUE;
}
聲明:本網頁內容旨在傳播知識,若有侵權等問題請及時與本網聯系,我們將在第一時間刪除處理。TEL:177 7030 7066 E-MAIL:11247931@qq.com