Big Data Flink Source

1 Predefined Source

1.1 Collection-based Source

⚫ API
These sources are generally used for learning, testing, and mocking up data:
1. env.fromElements(varargs);
2. env.fromCollection(various collections);
3. env.generateSequence(start, end);
4. env.fromSequence(start, end);
⚫ Code demonstration:

package cn.oldlu.source;

import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import java.util.Arrays;

/**
 * Author oldlu
 * Desc
 * Turn an ordinary local Java/Scala collection into a distributed Flink DataStream!
 * Generally used for learning, testing, and mocking up data
 * 1. env.fromElements(varargs);
 * 2. env.fromCollection(various collections);
 * 3. env.generateSequence(start, end);
 * 4. env.fromSequence(start, end);
 */
public class SourceDemo01 {
    public static void main(String[] args) throws Exception {
        //1.env
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);
        //2.source
        // *1. env.fromElements(varargs);
        DataStream<String> ds1 = env.fromElements("hadoop", "spark", "flink");
        // *2. env.fromCollection(various collections);
        DataStream<String> ds2 = env.fromCollection(Arrays.asList("hadoop", "spark", "flink"));
        // *3. env.generateSequence(start, end);
        DataStream<Long> ds3 = env.generateSequence(1, 10);
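        // NOTE: in newer Flink versions generateSequence is deprecated; fromSequence below is the recommended replacement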
        // *4. env.fromSequence(start, end);
        DataStream<Long> ds4 = env.fromSequence(1, 10);
        //3.Transformation
        //4.sink
        ds1.print();
        ds2.print();
        ds3.print();
        ds4.print();
        //5.execute
        env.execute();
    }
}

1.2 File-based Source

⚫ API
Generally used for learning and testing
env.readTextFile(local / HDFS file / folder) // compressed files can also be read
⚫ Code demonstration:

package cn.oldlu.source;

import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * Author oldlu
 * Desc
 * 1. env.readTextFile(local / HDFS file / folder) // compressed files can also be read
 */
public class SourceDemo02 {
    public static void main(String[] args) throws Exception {
        //1.env
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);
        //2.source
        // *1. env.readTextFile(local file / HDFS file) // compressed files can also be read
        DataStream<String> ds1 = env.readTextFile("data/input/words.txt");
        DataStream<String> ds2 = env.readTextFile("data/input/dir");
        DataStream<String> ds3 = env.readTextFile("hdfs://node1:8020//wordcount/input/words.txt");
        DataStream<String> ds4 = env.readTextFile("data/input/wordcount.txt.gz");
        //3.Transformation
        //4.sink
        ds1.print();
        ds2.print();
        ds3.print();
        ds4.print();
        //5.execute
        env.execute();
    }
}

1.3 Socket-based Source

Generally used for learning and testing
⚫ Requirements:
1. Run nc -lk 9999 on node1 to send data to the specified port. nc is short for netcat, a simple networking utility that can send data to a port. If the command is not available, install it with:
yum install -y nc
2. Use Flink to write a stream-processing application that counts words in real time
⚫ Code implementation:

package cn.oldlu.source;

import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

/**
 * Author oldlu
 * Desc
 * SocketSource
 */
public class SourceDemo03 {
    public static void main(String[] args) throws Exception {
        //1.env
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);
        //2.source
        DataStream<String> linesDS = env.socketTextStream("node1", 9999);

        //3. Data processing - transformation
        //3.1 each line of data is divided into words according to the space to form a set
        DataStream<String> wordsDS = linesDS.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public void flatMap(String value, Collector<String> out) throws Exception {
                //value is one line of input data
                String[] words = value.split(" ");
                for (String word : words) {
                    out.collect(word);//Collect and return the cut words one by one
                }
            }
        });
        //3.2 mark each word in the set as 1
        DataStream<Tuple2<String, Integer>> wordAndOnesDS = wordsDS.map(new MapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public Tuple2<String, Integer> map(String value) throws Exception {
                //value is one incoming word
                return Tuple2.of(value, 1);
            }
        });

        //3.3 group the data according to the word (key)
        //KeyedStream<Tuple2<String, Integer>, Tuple> groupedDS = wordAndOnesDS.keyBy(0);
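        //keyBy with a position index is deprecated in newer Flink versions; the KeySelector lambda below is the preferred form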
        KeyedStream<Tuple2<String, Integer>, String> groupedDS = wordAndOnesDS.keyBy(t -> t.f0);
        //3.4 aggregate the data in each group according to the quantity (value), that is, sum
        DataStream<Tuple2<String, Integer>> result = groupedDS.sum(1);

        //4. Output result - sink
        result.print();

        //5. Trigger execute
        env.execute();
    }
}

2 Custom Source

2.1 Randomly generated data

⚫ API
Custom sources are generally used for learning, testing, and simulating data. Flink provides source interfaces that we can implement to define our own data sources. The interfaces differ in capability,
classified as follows:
SourceFunction: non-parallel data source (parallelism can only be 1)
RichSourceFunction: rich (multifunctional) non-parallel data source (parallelism can only be 1)
ParallelSourceFunction: parallel data source (parallelism >= 1)
RichParallelSourceFunction: rich (multifunctional) parallel data source (parallelism >= 1) -- this is the interface used by the Kafka source covered later
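As a quick illustration before the full example below, here is a minimal sketch of the simplest of these interfaces, a non-parallel SourceFunction. The class name, the emitted values, and the one-second interval are assumptions chosen for illustration only:

import org.apache.flink.streaming.api.functions.source.SourceFunction;

//Minimal non-parallel source: emits an increasing Long once per second
//Because it implements SourceFunction (not ParallelSourceFunction), its parallelism can only be 1
public class CounterSource implements SourceFunction<Long> {
    private volatile boolean running = true;
    private long counter = 0L;

    @Override
    public void run(SourceContext<Long> ctx) throws Exception {
        while (running) {
            ctx.collect(counter++);//emit the next value downstream
            Thread.sleep(1000);
        }
    }

    @Override
    public void cancel() {
        running = false;//called when the job is cancelled
    }
}

Such a source would be attached with env.addSource(new CounterSource()), just like the order source in the example below.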
⚫ Requirement
Randomly generate one order (order ID, user ID, order amount, timestamp) every second:

  • Randomly generated order ID (UUID)
  • Randomly generated user ID (0-2)
  • Randomly generated order amount (0-100)
  • The timestamp is the current system time

⚫ Code implementation

package cn.oldlu.source;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction;

import java.util.Random;
import java.util.UUID;

/**
 * Author oldlu
 * Desc
 * Requirement
 * Randomly generate one order (order ID, user ID, order amount, timestamp) every second:
 * - Randomly generated order ID (UUID)
 * - Randomly generated user ID (0-2)
 * - Randomly generated order amount (0-100)
 * - The timestamp is the current system time
 *
 * API
 * Custom sources are generally used for learning, testing, and simulating data.
 * Flink provides source interfaces that we can implement to define our own data sources. The interfaces differ in capability, classified as follows:
 * SourceFunction: non-parallel data source (parallelism can only be 1)
 * RichSourceFunction: rich (multifunctional) non-parallel data source (parallelism can only be 1)
 * ParallelSourceFunction: parallel data source (parallelism >= 1)
 * RichParallelSourceFunction: rich (multifunctional) parallel data source (parallelism >= 1) -- used by the Kafka source covered later
 */
public class SourceDemo04_Customer {
    public static void main(String[] args) throws Exception {
        //1.env
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);
        //2.Source
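        //setParallelism(2) runs two MyOrderSource instances concurrently, which is possible because it extends RichParallelSourceFunction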
        DataStream<Order> orderDS = env
                .addSource(new MyOrderSource())
                .setParallelism(2);

        //3.Transformation

        //4.Sink
        orderDS.print();
        //5.execute
        env.execute();
    }
    @Data
    @NoArgsConstructor
    @AllArgsConstructor
    public static class Order {
        private String id;
        private Integer userId;
        private Integer money;
        private Long createTime;
    }
    public static class MyOrderSource extends RichParallelSourceFunction<Order> {
        private Boolean flag = true;
        @Override
        public void run(SourceContext<Order> ctx) throws Exception {
            Random random = new Random();
            while (flag){
                Thread.sleep(1000);
                String id = UUID.randomUUID().toString();
                int userId = random.nextInt(3);
                int money = random.nextInt(101);
                long createTime = System.currentTimeMillis();
                ctx.collect(new Order(id,userId,money,createTime));
            }
        }
        //Cancel the task: executed when the job is cancelled
        @Override
        public void cancel() {
            flag = false;
        }
    }
}

2.2 MySQL

⚫ Requirements:
In actual development, you often need to match data received in real time against rules stored in MySQL. In that case, a Flink custom data source can be used to read the data from MySQL. Now, complete a simple requirement: load data from MySQL in real time,
so that changes in MySQL are picked up as they happen
⚫ Prepare data

CREATE TABLE `t_student` (
    `id` int(11) NOT NULL AUTO_INCREMENT,
    `name` varchar(255) DEFAULT NULL,
    `age` int(11) DEFAULT NULL,
    PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=7 DEFAULT CHARSET=utf8;

INSERT INTO `t_student` VALUES ('1', 'jack', '18');
INSERT INTO `t_student` VALUES ('2', 'tom', '19');
INSERT INTO `t_student` VALUES ('3', 'rose', '20');
INSERT INTO `t_student` VALUES ('4', 'tom', '19');
INSERT INTO `t_student` VALUES ('5', 'jack', '18');
INSERT INTO `t_student` VALUES ('6', 'rose', '20');

⚫ Code implementation:

package cn.oldlu.source;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.util.concurrent.TimeUnit;

/**
 * Author oldlu
 * Desc
 * Requirements:
 * In actual development, you often need to match data received in real time against rules stored in MySQL. In that case, a Flink custom data source can be used to read the data from MySQL.
 * Now complete a simple requirement:
 * Load data from MySQL in real time,
 * so that changes in MySQL are picked up as they happen
 */
public class SourceDemo05_Customer_MySQL {
    public static void main(String[] args) throws Exception {
        //1.env
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        //2.Source
        DataStream<Student> studentDS = env.addSource(new MySQLSource()).setParallelism(1);

        //3.Transformation
        //4.Sink
        studentDS.print();

        //5.execute
        env.execute();
    }

    @Data
    @NoArgsConstructor
    @AllArgsConstructor
    public static class Student {
        private Integer id;
        private String name;
        private Integer age;
    }

    public static class MySQLSource extends RichParallelSourceFunction<Student> {
        private Connection conn = null;
        private PreparedStatement ps = null;

        @Override
        public void open(Configuration parameters) throws Exception {
            //Load the driver and open the connection
            //Class.forName("com.mysql.jdbc.Driver");
            conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/bigdata", "root", "root");
            String sql = "select id,name,age from t_student";
            ps = conn.prepareStatement(sql);
        }

        private boolean flag = true;

        @Override
        public void run(SourceContext<Student> ctx) throws Exception {
            while (flag) {
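                //re-run the query every 5 seconds so that changes in MySQL are picked up in near real time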
                ResultSet rs = ps.executeQuery();
                while (rs.next()) {
                    int id = rs.getInt("id");
                    String name = rs.getString("name");
                    int age = rs.getInt("age");
                    ctx.collect(new Student(id, name, age));
                }
                TimeUnit.SECONDS.sleep(5);
            }
        }
        @Override
        public void cancel() {
            flag = false;
        }
        @Override
        public void close() throws Exception {
            if (ps != null) ps.close();
            if (conn != null) conn.close();
        }
    }
}
