IDEA中环境配置和使用

pom.xml文件配置

<!-- Shared properties: compiler level, source encoding, and dependency versions -->
<properties>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <encoding>UTF-8</encoding>
        <scala.version>2.11.8</scala.version>
        <spark.version>2.2.0</spark.version>
        <hadoop.version>2.7.1</hadoop.version>
        <scala.compat.version>2.11</scala.compat.version>
    </properties>
<!-- Shared dependencies: Scala standard library, Spark core, Hadoop client -->
    <dependencies>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>
      <dependency>
        <groupId>org.apache.spark</groupId>
        <!-- NOTE(review): the _2.11 binary suffix is hard-coded; consider
             spark-core_${scala.compat.version} so it stays in sync with the
             scala.compat.version property declared above -->
        <artifactId>spark-core_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
    </dependencies>

<!-- Build configuration -->
    <build>
        <!-- Root of the Scala sources -->
        <sourceDirectory>src/main/scala</sourceDirectory>
        <!-- Build plugins -->
        <plugins>
             <!-- Compiles Scala sources (main and test) to .class files -->
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                        <configuration>
                            <args>
                                <arg>-dependencyfile</arg>
                                <arg>${project.build.directory}/.scala_dependencies</arg>
                            </args>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <!-- Packages the application into an uber (fat) jar -->
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.4.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                             <!-- Exclude the signature files below from the jar:
                                  duplicated signatures from merged dependencies would
                                  otherwise make the shaded jar fail verification -->
                            <filters>
                                <filter><artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                            <transformers>
                                <!-- Sets the Main-Class entry of the executable jar's manifest.
                                     NOTE(review): left empty here — fill in the fully qualified
                                     main class (e.g. SparkCore_01.SparkWordCount) or pass
                                     it to spark-submit with --class as shown below -->
                                <transformer  implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass></mainClass>
                                </transformer>
                            </transformers>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

第一个WordCount

package SparkCore_01

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
  * 第一个Spark程序
  */
object SparkWordCount {

  /**
    * First Spark program: a word count.
    *
    * Reads a text file, splits every line on spaces, pairs each word with 1,
    * and sums the counts per word with reduceByKey.
    */
  def main(args: Array[String]): Unit = {
    // SparkConf carries the settings the SparkContext loads on construction.
    // Key methods:
    //   set(key, value)  -> set an arbitrary configuration entry
    //   setAppName(name) -> the name this job runs under
    //   setMaster(mode)  -> where to execute; required for a local run:
    //     "local"    one thread simulates the Spark runtime
    //     "local[n]" n threads simulate Spark (n must not exceed the CPU core count)
    //     "local[*]" use as many threads as the system has idle
    val sparkConf = new SparkConf().setAppName("SparkWordCount").setMaster("local")

    // The SparkContext is the entry point to Spark, configured via the conf above.
    val context = new SparkContext(sparkConf)

    // Read the input file line by line (commonly used for .txt or .log files);
    // the argument is the file path.
    val fileLines: RDD[String] = context.textFile("dir/SparkCore_01/File.txt")

    // Split lines into words, tag each word with a count of 1, then fold the
    // counts per key. reduceByKey (one of the most widely used operators)
    // groups identical keys and combines their values once per group.
    val wordCounts: RDD[(String, Int)] =
      fileLines
        .flatMap(_.split(" "))
        .map(word => (word, 1))
        .reduceByKey(_ + _)

    //println(wordCounts.collect().toList)

    context.stop() // shut down the SparkContext

    // Cluster-submission variant (lines to change):
    //   context.textFile(args(0))          // take the input path from the command line
    //   wordCounts.saveAsTextFile(args(1)) // write the result to the given path
    //                                      // (to cluster or local FS; returns Unit) — drop the println
  }
}

程序打包提交集群

IDEA中环境配置和使用

将jar包上传到对应节点,然后在Spark安装目录下bin目录下执行以下操作
./spark-submit \
> --class SparkCore_01.SparkWordCount \
> --master spark://hadoop01:7077 \
> --executor-memory 512m \
> --total-executor-cores 2 \
> /root/Spark_1905-1.0-SNAPSHOT.jar hdfs://hadoop01:8020/word.txt hdfs://hadoop01:8020/out2
ps: 最后三项依次为: jar包所在路径、从hdfs集群读取的输入文件路径、写入到hdfs集群的输出目录路径(输出目录必须不存在)

去掉打印日志

log4j.properties

# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Set everything to be logged to the console
# Change the level here (ERROR / WARN / INFO / DEBUG) to control how much Spark prints
log4j.rootCategory=ERROR, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n

IDEA中环境配置和使用

相关推荐