Getting started with Spark: submitting a project from IDEA to a remote Spark cluster

1, Dependency configuration

Add the Scala and Spark dependencies. The version suffix after the underscore in the Spark artifact IDs (e.g. spark-core_2.11) must match the first two segments of the Scala version, that is, 2.11. (A quick runtime check of the versions is sketched right after the pom.)

pom.xml

<?xml version="1.0" encoding="UTF-8"?>

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.mk</groupId>
  <artifactId>spark-test</artifactId>
  <version>1.0</version>

  <name>spark-test</name>
  <url>http://spark.mk.com</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
    <scala.version>2.11.1</scala.version>
    <spark.version>2.4.4</spark.version>
    <hadoop.version>2.6.0</hadoop.version>
  </properties>

  <dependencies>
    <!-- Scala dependency -->
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>${scala.version}</version>
    </dependency>

    <!-- Spark dependencies -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.11</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_2.11</artifactId>
      <version>${spark.version}</version>
    </dependency>


    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
  </dependencies>

  <build>
    <pluginManagement>
      <plugins>

        <plugin>
          <artifactId>maven-clean-plugin</artifactId>
          <version>3.1.0</version>
        </plugin>

        <plugin>
          <artifactId>maven-resources-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>3.8.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-surefire-plugin</artifactId>
          <version>2.22.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-jar-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
      </plugins>
    </pluginManagement>
  </build>
</project>
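
A quick way to confirm that the artifact suffix and the Scala library actually agree (not part of the original walk-through; the local[1] master here is only for this check) is to print the versions Maven resolved onto the classpath:

package com.mk;

import org.apache.spark.sql.SparkSession;

public class VersionCheck {
    public static void main(String[] args) {
        // Scala binary version provided by the scala-library dependency; should start with 2.11
        System.out.println("Scala: " + scala.util.Properties.versionNumberString());
        // Spark version resolved from spark-core_2.11 / spark-sql_2.11; should be 2.4.4
        SparkSession spark = SparkSession.builder()
                .master("local[1]").appName("version-check").getOrCreate();
        System.out.println("Spark: " + spark.version());
        spark.stop();
    }
}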

 

2, PI example

A Java rewrite of the Scala Pi example that ships with Spark.

package com.mk;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;

import java.util.ArrayList;
import java.util.List;

public class App {
    public static void main(String[] args) {

        SparkConf sparkConf = new SparkConf();
        if (System.getProperty("os.name").toLowerCase().contains("win")) {
            // To run in local simulation instead, use:
            // sparkConf.setMaster("local[2]");
            sparkConf.setMaster("spark://hadoop01:7077,hadoop02:7077,hadoop03:7077");
            // The local IP and the Spark cluster must be able to reach each other, e.g. on the same LAN
            sparkConf.set("spark.driver.host", "192.168.10.126");
            // Path of the jar produced by the project build (IDEA artifact output)
            sparkConf.setJars(new String[]{".\\out\\artifacts\\spark_test\\spark-test.jar"});
        }
        SparkSession session = SparkSession.builder().appName("Pi").config(sparkConf).getOrCreate();
        int slices = 2;
        int n = (int) Math.min(100_000L * slices, Integer.MAX_VALUE);
        JavaSparkContext sparkContext = new JavaSparkContext(session.sparkContext());

        // The numbers 1..n, split into `slices` partitions
        List<Integer> list = new ArrayList<>(n);
        for (int i = 0; i < n; i++)
            list.add(i + 1);

        // Monte Carlo estimate: count random points that fall inside the unit circle
        int count = sparkContext.parallelize(list, slices)
                .map(v -> {
                    double x = Math.random() * 2 - 1;
                    double y = Math.random() * 2 - 1;
                    return (x * x + y * y < 1) ? 1 : 0;
                })
                .reduce((Integer a, Integer b) -> a + b);
        System.out.println("PI:" + 4.0 * count / n);
        session.stop();
    }
}
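
The jar passed to setJars() is what the worker nodes load the job classes from, so it must be rebuilt (for example via IDEA's Build Artifacts, which produces the path used above) after every code change; otherwise the executors run stale classes or fail with ClassNotFoundException. A small helper can fail fast when the artifact is missing. This is an illustrative sketch, not part of the original example, and the class and method names are made up:

package com.mk;

import java.io.File;
import java.util.Date;

public class JarCheck {
    // Returns the path unchanged if the jar exists; throws before anything is submitted otherwise.
    static String requireJar(String jarPath) {
        File jar = new File(jarPath);
        if (!jar.isFile())
            throw new IllegalStateException("Artifact not found, build it before submitting: " + jarPath);
        // Print the build time so a stale jar is easy to spot in the console
        System.out.println("Submitting jar built at " + new Date(jar.lastModified()) + ": " + jarPath);
        return jarPath;
    }
}

It can then be wired in as sparkConf.setJars(new String[]{ JarCheck.requireJar(".\\out\\artifacts\\spark_test\\spark-test.jar") });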

 

3, Run directly from IDEA

The console prints the computed value of PI.

4, Limitations

Note: the local IP of the development machine and the Spark cluster must be able to reach each other, for example by being on the same LAN.

When they are not on the same network, submission fails: the task keeps being retried and the job never exits.
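
Because the failure mode is an endless retry rather than a clear error, it is worth checking connectivity from the development machine before submitting. A minimal sketch follows (the host names and port come from the configuration above; the check itself is an addition, and it only proves the driver-to-master direction, not that the cluster can reach spark.driver.host back):

package com.mk;

import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Socket;

public class ReachabilityCheck {
    public static void main(String[] args) {
        String[] masters = {"hadoop01", "hadoop02", "hadoop03"};
        for (String host : masters) {
            try (Socket socket = new Socket()) {
                // 3-second timeout on the standalone master port used in the example
                socket.connect(new InetSocketAddress(host, 7077), 3000);
                System.out.println(host + ":7077 reachable");
            } catch (IOException e) {
                System.out.println(host + ":7077 NOT reachable: " + e.getMessage());
            }
        }
    }
}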

Tags: Spark Maven Scala Apache

Posted on Thu, 30 Jan 2020 10:39:11 -0500 by max_power