Flink stream processing API for big data

1. Environment

1.1 getExecutionEnvironment

Creates an execution environment that represents the context of the currently executing program. If the program is invoked standalone, this method returns a local execution environment; if the program is submitted to a cluster from the command-line client, it returns the cluster's execution environment. In other words, getExecutionEnvironment decides which environment to return based on how the program is run, which makes it the most common way to create an execution environment.

// Batch API
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
// Streaming API
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

If the parallelism is not set, the configuration in flink-conf.yaml takes effect; the default is 1.
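If a different parallelism is wanted for a job, it can also be set explicitly on the environment. A minimal sketch (4 is an arbitrary example value):

// Overrides the default from flink-conf.yaml for this job
env.setParallelism(4);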

1.2 createLocalEnvironment

Returns a local execution environment. The default parallelism can be specified when calling; a no-argument overload also exists.

LocalStreamEnvironment localEnvironment = StreamExecutionEnvironment.createLocalEnvironment(1);
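For local debugging it can be handy to also start the Flink web UI. Assuming the flink-runtime-web dependency is on the classpath, a sketch:

// Local environment that additionally serves the Flink web dashboard
StreamExecutionEnvironment localEnvWithUI =
        StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(new org.apache.flink.configuration.Configuration());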

1.3 createRemoteEnvironment

Returns a cluster execution environment and submits the Jar to the remote server. When calling, you need to specify the IP and port of the JobManager, as well as the Jar package(s) to run on the cluster.

StreamExecutionEnvironment remoteEnvironment = StreamExecutionEnvironment.createRemoteEnvironment("remote_host", 8888, 1);
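In the call above, 1 is the parallelism; the paths of the Jar files to ship can be appended as varargs. A sketch with a placeholder path (not a real file):

// "path/to/your-job.jar" is a placeholder for the Jar containing your job classes
StreamExecutionEnvironment remoteEnv = StreamExecutionEnvironment.createRemoteEnvironment(
        "remote_host", 8888, 1, "path/to/your-job.jar");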

2. Source

2.1 Read data from a collection

package application;

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import java.util.Arrays;

/**
 * @author: wtl
 * @License: (C) Copyright 2021, wtl Corporation Limited.
 * @Contact: 1050100468@qq.com
 * @Date: 2021/9/28 9:33 morning
 * @Version: 1.0
 * @Description:
 */
public class FlinkFromCollection {

    public static void main(String[] args) throws Exception{
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        DataStream<Integer> integerDataStream = env.fromCollection(Arrays.asList(1, 4, 3, 7, 8, 9));
        
        integerDataStream.print();
        
        env.execute();
    }
}
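For a handful of literal values, fromElements is an equivalent shorthand:

DataStream<Integer> elementsDataStream = env.fromElements(1, 4, 3, 7, 8, 9);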

2.2 Read data from a file

package application.function;

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * @author: wtl
 * @License: (C) Copyright 2021, wtl Corporation Limited.
 * @Contact: 1050100468@qq.com
 * @Date: 2021/9/28 11:25 morning
 * @Version: 1.0
 * @Description:
 */
public class FlinkFromFile {
    public static void main(String[] args) throws Exception{
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        
        String inputPath = "/Users/tlzs/Desktop/working/IdeaProjects/Flink/src/main/resources/words.txt";
        DataStream<String> inputDataStream = env.readTextFile(inputPath);
        
        inputDataStream.print();
        
        env.execute();
    }
}
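readTextFile reads the file once and then finishes. If the source should keep watching the path for changes, the lower-level readFile can be used instead; a sketch reusing inputPath from above (the 1000 ms scan interval is an arbitrary example):

// Requires: org.apache.flink.api.java.io.TextInputFormat, org.apache.flink.core.fs.Path,
// org.apache.flink.streaming.api.functions.source.FileProcessingMode
DataStream<String> monitoredDataStream = env.readFile(
        new TextInputFormat(new Path(inputPath)),   // plain line-by-line text format
        inputPath,
        FileProcessingMode.PROCESS_CONTINUOUSLY,    // keep re-scanning the path for modifications
        1000L);                                     // scan interval in milliseconds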

2.3 Read data from a Kafka message queue

The Kafka connector dependency has to be added to the pom.xml:

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>Flink</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>1.10.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_2.12</artifactId>
            <version>1.10.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_2.12</artifactId>
            <version>1.10.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka-0.11_2.12</artifactId>
            <version>1.10.1</version>
        </dependency>
    </dependencies>

</project>
To test, a console producer can be used to write data into the sensor topic:

./kafka-console-producer.sh --broker-list aiops02:6667 --topic sensor

The consumer job then reads from that topic:
package application;

import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011;

import java.util.Properties;

/**
 * @author: wtl
 * @License: (C) Copyright 2021, wtl Corporation Limited.
 * @Contact: 1050100468@qq.com
 * @Date: 2021/9/28 10:36 morning
 * @Version: 1.0
 * @Description:
 */
public class FlinkFromKafka {

    public static void main(String[] args) throws Exception{
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        Properties properties = new Properties();
        properties.setProperty("bootstrap.servers", "192.168.48.12:6667");
        properties.setProperty("group.id", "consumer-group");
        properties.setProperty("key.deserializer",
                "org.apache.kafka.common.serialization.StringDeserializer");
        properties.setProperty("value.deserializer",
                "org.apache.kafka.common.serialization.StringDeserializer");
        properties.setProperty("auto.offset.reset", "latest");
        DataStream<String> inputDataStream = env.addSource(new FlinkKafkaConsumer011<String>("sensor",new SimpleStringSchema(),properties));

        inputDataStream.print();
        env.execute();
    }
}
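The consumer's start position can also be set explicitly on the connector, which overrides auto.offset.reset; a sketch:

FlinkKafkaConsumer011<String> consumer =
        new FlinkKafkaConsumer011<>("sensor", new SimpleStringSchema(), properties);
// setStartFromEarliest() and setStartFromGroupOffsets() are also available
consumer.setStartFromLatest();
DataStream<String> inputDataStream = env.addSource(consumer);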

2.4 Custom Source

In addition to the sources above, we can also define a custom source. All that is needed is to pass a SourceFunction to addSource(). The call looks as follows:

DataStreamSource<SensorReading> inputDataStream = env.addSource(new MySensorSource());

Here we want to generate random sensor data. The code of MySensorSource, together with the SensorReading entity it emits, is as follows:

package application.application.entity;

/**
 * @author: wtl
 * @License: (C) Copyright 2021, wtl Corporation Limited.
 * @Contact: 1050100468@qq.com
 * @Date: 2021/9/28 11:39 morning
 * @Version: 1.0
 * @Description:
 */
public class SensorReading {
    /**
     * sensor id
     */
    private String sensorId;
    /**
     * Timestamp of the current time
     */
    private long currentMillis;
    /**
     * sensor Monitored values
     */
    private double value;

    public SensorReading() {
    }

    public SensorReading(String sensorId, long currentMillis, double value) {
        this.sensorId = sensorId;
        this.currentMillis = currentMillis;
        this.value = value;
    }

    public String getSensorId() {
        return sensorId;
    }

    public void setSensorId(String sensorId) {
        this.sensorId = sensorId;
    }

    public long getCurrentMillis() {
        return currentMillis;
    }

    public void setCurrentMillis(long currentMillis) {
        this.currentMillis = currentMillis;
    }

    public double getValue() {
        return value;
    }

    public void setValue(double value) {
        this.value = value;
    }

    @Override
    public String toString() {
        return "SensorReading{" +
                "sensorId='" + sensorId + '\'' +
                ", currentMillis=" + currentMillis +
                ", value=" + value +
                '}';
    }
}
package application;

import application.application.entity.SensorReading;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;

import java.util.HashMap;
import java.util.Map;
import java.util.Random;
import java.util.concurrent.TimeUnit;

/**
 * @author: wtl
 * @License: (C) Copyright 2021, wtl Corporation Limited.
 * @Contact: 1050100468@qq.com
 * @Date: 2021/9/28 11:38 morning
 * @Version: 1.0
 * @Description:
 */
public class FlinkMySource {
    public static void main(String[] args) throws Exception{
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        DataStreamSource<SensorReading> inputDataStream = env.addSource(new MySensorSource());

        inputDataStream.print();

        env.execute();
    }


    public static class MySensorSource implements SourceFunction<SensorReading>{

        private volatile boolean running = true;

        @Override
        public void run(SourceContext<SensorReading> sourceContext) throws Exception {
            Random random = new Random();
            Map<String,Double> sensorMap = new HashMap<>();
            for (int i = 0; i < 10; i++) {
                sensorMap.put("sensor_" + (i+1),random.nextGaussian());
            }
            while (running){
                for (String sensorId : sensorMap.keySet()){
                    double value = sensorMap.get(sensorId) + random.nextGaussian();
                    sensorMap.put(sensorId,value);
                    sourceContext.collect(new SensorReading(sensorId,System.currentTimeMillis(),value));
                }

                TimeUnit.SECONDS.sleep(1);
            }
        }

        @Override
        public void cancel() {
            running = false;
        }
    }
}
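A plain SourceFunction always runs with parallelism 1. If a parallel source is needed, RichParallelSourceFunction can be extended instead; each parallel subtask then runs its own run() loop. A minimal sketch:

// Requires: org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction
public static class MyParallelSensorSource extends RichParallelSourceFunction<SensorReading> {

    private volatile boolean running = true;

    @Override
    public void run(SourceContext<SensorReading> ctx) throws Exception {
        Random random = new Random();
        // Derive the sensor id from the subtask index so each subtask emits a distinct sensor
        String sensorId = "sensor_" + (getRuntimeContext().getIndexOfThisSubtask() + 1);
        while (running) {
            ctx.collect(new SensorReading(sensorId, System.currentTimeMillis(), random.nextGaussian()));
            TimeUnit.SECONDS.sleep(1);
        }
    }

    @Override
    public void cancel() {
        running = false;
    }
}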

3. Transform

Transformation operators convert one DataStream into another.

3.1 map

DataStream<Integer> inputDataStream = env.fromCollection(Arrays.asList(1, 4, 3, 8, 0, 9, 10, 6));
DataStream<Boolean> mapDataStream = inputDataStream.map(new MapFunction<Integer, Boolean>() {
    @Override
    public Boolean map(Integer i) throws Exception {
        return i > 5;
    }
});
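The same map written as a lambda:

DataStream<Boolean> mapLambdaStream = inputDataStream.map(i -> i > 5);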

3.2 flatMap

DataStream<String> inputDataStream = env.fromCollection(Arrays.asList(
        "hello world",
        "how are you",
        "hello kitty"
));
DataStream<Tuple2<String, Integer>> flatMapDataStream = inputDataStream.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
    @Override
    public void flatMap(String line, Collector<Tuple2<String, Integer>> collector) throws Exception {
        String[] words = line.split(" ");
        Arrays.asList(words).forEach(word -> {
            collector.collect(new Tuple2<>(word, 1));
        });
    }
});
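Written as a lambda, flatMap needs an explicit returns(...) type hint, because the Tuple2 type parameters are erased at compile time (Types is org.apache.flink.api.common.typeinfo.Types):

DataStream<Tuple2<String, Integer>> flatMapLambdaStream = inputDataStream
        .flatMap((String line, Collector<Tuple2<String, Integer>> out) -> {
            for (String word : line.split(" ")) {
                out.collect(new Tuple2<>(word, 1));
            }
        })
        .returns(Types.TUPLE(Types.STRING, Types.INT));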

3.3 Filter

DataStream<String> inputDataStream = env.fromCollection(Arrays.asList(
        "hello world",
        "how are you",
        "hello kitty"
));
DataStream<Tuple2<String, Integer>> flatMapDataStream = inputDataStream.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
    @Override
    public void flatMap(String line, Collector<Tuple2<String, Integer>> collector) throws Exception {
        String[] words = line.split(" ");
        Arrays.asList(words).forEach(word -> {
            collector.collect(new Tuple2<>(word, 1));
        });
    }
});

DataStream<Tuple2<String, Integer>> filterDataStream = flatMapDataStream.filter(new FilterFunction<Tuple2<String, Integer>>() {
    @Override
    public boolean filter(Tuple2<String, Integer> stringIntegerTuple2) throws Exception {
        return stringIntegerTuple2.f0.length() > 3;
    }
});
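The predicate is short enough to be a lambda:

DataStream<Tuple2<String, Integer>> filterLambdaStream =
        flatMapDataStream.filter(t -> t.f0.length() > 3);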

3.4 KeyBy

DataStream → KeyedStream: logically splits a stream into disjoint partitions; each partition contains elements with the same key. Internally this is implemented with hash partitioning.

sensor.txt:

sensor_01,1632814080332,3.02
sensor_03,1632814080345,3.12
sensor_05,1632814081236,2.32
sensor_06,1632814081234,3.57
sensor_01,1632814087652,3.38
sensor_01,1632814080389,3.99
package application;

import application.application.entity.SensorReading;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * @author: wtl
 * @License: (C) Copyright 2021, wtl Corporation Limited.
 * @Contact: 1050100468@qq.com
 * @Date: 2021/9/28 4:24 afternoon
 * @Version: 1.0
 * @Description:
 */
public class FlinkTransformAggregation {

    public static void main(String[] args) throws Exception{
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        String inputPath = "/Users/tlzs/Desktop/working/IdeaProjects/Flink/src/main/resources/sensor.txt";
        DataStream<String> inputDataStream = env.readTextFile(inputPath);

        DataStream<SensorReading> sensorReadingDataStream = inputDataStream.map(line -> {
            String[] fields = line.split(",");
            return new SensorReading(fields[0],Long.parseLong(fields[1]),Double.parseDouble(fields[2]));
        });

        KeyedStream<SensorReading, String> keyedStream = sensorReadingDataStream.keyBy(SensorReading::getSensorId);

//        DataStream<SensorReading> max = keyedStream.max("value");
        // max("value") only refreshes the aggregated field; maxBy("value") returns the whole record that holds the maximum
        DataStream<SensorReading> maxByDataStream = keyedStream.maxBy("value");

        maxByDataStream.print();

        env.execute();
    }
}

3.5 Rolling Aggregation

These operators aggregate each partition of a KeyedStream (a sketch follows the list below):

  • sum()
  • min()
  • max()
  • minBy()
  • maxBy()
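As a quick sketch of the max/maxBy (and min/minBy) difference, using the keyedStream from section 3.4:

// Rolling aggregations emit an updated result for every incoming element
DataStream<SensorReading> maxValue = keyedStream.max("value");    // only the "value" field is guaranteed current
DataStream<SensorReading> maxRecord = keyedStream.maxBy("value"); // the whole record holding the max value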

3.6 Reduce

KeyedStream → DataStream: an aggregation operation over a grouped data stream that combines the current element with the last aggregated result to produce a new value. The returned stream contains the result of every aggregation step, not just the final result.

package application;

import application.application.entity.SensorReading;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * @author: wtl
 * @License: (C) Copyright 2021, wtl Corporation Limited.
 * @Contact: 1050100468@qq.com
 * @Date: 2021/9/28 5:22 afternoon
 * @Version: 1.0
 * @Description:
 */
public class FlinkTransformAggregationReduce {

    public static void main(String[] args) throws Exception{
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        String inputPath = "/Users/tlzs/Desktop/working/IdeaProjects/Flink/src/main/resources/sensor.txt";
        DataStream<String> inputDataStream = env.readTextFile(inputPath);

        DataStream<SensorReading> sensorReadingDataStream = inputDataStream.map(line -> {
            String[] fields = line.split(",");
            return new SensorReading(fields[0],Long.parseLong(fields[1]),Double.parseDouble(fields[2]));
        });

        KeyedStream<SensorReading, String> keyedStream = sensorReadingDataStream.keyBy(SensorReading::getSensorId);

        // Keep, per sensor, the maximum value seen so far together with its timestamp
        DataStream<SensorReading> reduceDataStream = keyedStream.reduce(new ReduceFunction<SensorReading>() {
            @Override
            public SensorReading reduce(SensorReading s1, SensorReading s2) throws Exception {
                return new SensorReading(s1.getSensorId(),s1.getValue() > s2.getValue() ? s1.getCurrentMillis(): s2.getCurrentMillis(),Math.max(s1.getValue(),s2.getValue()));
            }
        });

        reduceDataStream.print();

        env.execute();
    }
}
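The same reduce written as a lambda:

DataStream<SensorReading> reduceLambdaStream = keyedStream.reduce((s1, s2) ->
        new SensorReading(
                s1.getSensorId(),
                s1.getValue() > s2.getValue() ? s1.getCurrentMillis() : s2.getCurrentMillis(),
                Math.max(s1.getValue(), s2.getValue())));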

3.7 Split and Select

Split

DataStream → SplitStream: splits a DataStream into two or more DataStreams according to some characteristic.

Select

SplitStream → DataStream: obtains one or more DataStreams from a SplitStream.

Requirement: split the sensor data into two streams by temperature, with 30 degrees as the boundary.

package application;

import application.application.entity.SensorReading;
import org.apache.flink.streaming.api.collector.selector.OutputSelector;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SplitStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import java.util.Collections;

/**
 * @author: wtl
 * @License: (C) Copyright 2021, wtl Corporation Limited.
 * @Contact: 1050100468@qq.com
 * @Date: 2021/9/28 5:46 afternoon
 * @Version: 1.0
 * @Description:
 */
public class FlinkTransformSplitSelect {

    public static void main(String[] args) throws Exception{
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        String inputPath = "/Users/tlzs/Desktop/working/IdeaProjects/Flink/src/main/resources/sensor.txt";
        DataStream<String> inputDataStream = env.readTextFile(inputPath);

        DataStream<SensorReading> sensorReadingDataStream = inputDataStream.map(line -> {
            String[] fields = line.split(",");
            return new SensorReading(fields[0],Long.parseLong(fields[1]),Double.parseDouble(fields[2]));
        });

        //shunt
        SplitStream<SensorReading> splitStream = sensorReadingDataStream.split(new OutputSelector<SensorReading>() {
            @Override
            public Iterable<String> select(SensorReading sensorReading) {
                return sensorReading.getValue() > 30 ? Collections.singletonList("high") : Collections.singletonList("low");
            }
        });

        DataStream<SensorReading> highDataStream = splitStream.select("high");
        DataStream<SensorReading> lowDataStream = splitStream.select("low");
        DataStream<SensorReading> allDataStream = splitStream.select("high", "low");

        highDataStream.print("high");
        lowDataStream.print("low");
        allDataStream.print("all");

        env.execute();
    }
}
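Note that split/select was deprecated in later Flink releases (and eventually removed) in favor of side outputs on a ProcessFunction. A minimal sketch of the same high/low split under that API, reusing sensorReadingDataStream from above (OutputTag is org.apache.flink.util.OutputTag):

// The anonymous subclass ({}) is required so Flink can capture the element type
OutputTag<SensorReading> lowTag = new OutputTag<SensorReading>("low") {};

SingleOutputStreamOperator<SensorReading> highStream = sensorReadingDataStream
        .process(new ProcessFunction<SensorReading, SensorReading>() {
            @Override
            public void processElement(SensorReading reading, Context ctx, Collector<SensorReading> out) {
                if (reading.getValue() > 30) {
                    out.collect(reading);        // main output: high readings
                } else {
                    ctx.output(lowTag, reading); // side output: low readings
                }
            }
        });

DataStream<SensorReading> lowStream = highStream.getSideOutput(lowTag);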
