数据挖掘中,分类(属性)变量如何转换为 double:One-Hot 编码(OHE)

数据挖掘中,基本上所有算法需要的输入数据都是二维的 double 矩阵(每行一个样本,每列一个特征)。

1  如果是二值变量:一个取值编码为 0,另一个取值编码为 1
2  其他分类变量以 index:dimensionValue 的形式编码:每个变量的每一种取值(value)各用一个独立的维度表示(即 One-Hot 编码)
 
 
// Collect every distinct (featureId, value) pair that occurs in train_cat_rdd and assign each
// pair its own running index; the result is pulled back to the driver as a map.
    var oheMap = train_cat_rdd
      .flatMap(pairs => pairs)
      .distinct()
      .zipWithIndex()
      .collectAsMap()
    // Shape seen in the shell:
    //   oheMap: scala.collection.Map[(Int, String),Long] =
    //     Map((7,608511e9) -> 31527, (7,b2d8fbed) -> 42207, (7,1d3e2fdb) -> 52791, ...
    println("Number of features")
    println(oheMap.size)

    // Print a ten-entry sample of the dictionary.
    for (entry <- oheMap.take(10)) println(entry)

    // Width of the final feature vector: one slot per dictionary entry plus 8 extra slots
    // (the vector built further down places the numerical features at indices 0-7).
    val parsesize = oheMap.size + 8
//    45790
//    ((0,8907c166),32600)
//    ((6,7b177be1),28570)
//    ((7,ae5eeb59),23866)
//    ((7,2be70f8c),41143)
//    ((7,105627d8),14562)
//    ((7,060acc61),21043)
//    ((7,a3234c93),7884)
//    ((7,1d3e2fdb),34934)
//    ((7,b2d8fbed),17166)
//    ((7,608511e9),44647)


// oheMap.size printed above is 45790; parsesize adds 8 on top of that for the vector width.

    // Vector positions reserved for the numerical features: the first 8 slots of every row.
    val doubleSizeIndex = Array(0, 1, 2, 3, 4, 5, 6, 7)
    // No longer read by the encoder below (it emits one 1.0 per matched category instead of
    // relying on a fixed-length array); kept in case code outside this section references it.
    val demeansionValue = Array(1.0, 1.0, 1.0, 1.0,1.0, 1.0, 1.0, 1.0, 1.0)

    // Ship the category dictionary to the executors once instead of serializing it into
    // every task closure.
    val oheMapBroadcast = train_rdd.sparkContext.broadcast(oheMap)

    // Build the one-hot-encoded training set.
    //
    // Layout of each feature vector (length parsesize):
    //   [0, doubleSizeIndex.length)          numerical features, negatives clamped to 0
    //   [doubleSizeIndex.length, parsesize)  one slot per oheMap entry, 1.0 when present
    //
    // Categories missing from oheMap contribute nothing (all-zero encoding); previously they
    // were written to index 0, on top of the first numerical feature. Dictionary indices are
    // shifted past the numerical slots for the same reason.
    //
    // The label is the integer found in the second "::"-separated field of key.
    val ohe_train_rdd = train_rdd.map { case (key, cateorical_features, numerical_features) =>
      val categoryIndex = oheMapBroadcast.value
      val categoryOffset = doubleSizeIndex.length

      // The numerical slots are fixed, so a row with a different count cannot be placed.
      require(
        numerical_features.length == categoryOffset,
        s"expected $categoryOffset numerical features but got ${numerical_features.length}")

      // Parse as Double directly so non-integer text such as "1.5" no longer throws;
      // negative readings are replaced by 0 as before.
      val numericValues = numerical_features.map { raw =>
        val parsed = raw.toDouble
        if (parsed < 0) 0.0 else parsed
      }
      val numericEntries = doubleSizeIndex.zip(numericValues).toSeq

      // One (index, 1.0) entry per category known to the dictionary; distinct guards against
      // a repeated (featureId, value) pair producing a duplicate index.
      val categoricalEntries = parseCatFeatures(cateorical_features).toSeq.flatMap { feature =>
        categoryIndex.get(feature).map(slot => (slot.toInt + categoryOffset, 1.0))
      }.distinct

      // The Seq[(Int, Double)] overload sorts the entries by index and rejects duplicates,
      // which the (indices, values) array overload leaves to the caller.
      val vs = Vectors.sparse(parsesize, numericEntries ++ categoricalEntries)

      // Densified as before. NOTE(review): every cached row then stores parsesize doubles;
      // consider passing vs itself if the downstream estimator accepts sparse vectors.
      LabeledPoint(key.split("::")(1).toInt, vs.toDense)
    }
    ohe_train_rdd.cache()
    ohe_train_rdd.take(10).foreach(println)