- Collection based source
- File based source
- Socket based source
- Custom source
Collection based source

import org.apache.flink.streaming.api.scala._
import scala.collection.mutable
import scala.collection.mutable._

object DataSource001 {
  def main(args: Array[String]): Unit = {
    val senv = StreamExecutionEnvironment.getExecutionEnvironment
    //0. Create a DataStream from individual elements (fromElements)
    val ds0: DataStream[String] = senv.fromElements("spark", "flink")
    ds0.print()
    //1. Create a DataStream from tuples (fromElements)
    val ds1: DataStream[(Int, String)] = senv.fromElements((1, "spark"), (2, "flink"))
    ds1.print()
    //2. Create a DataStream from an Array
    val ds2: DataStream[String] = senv.fromCollection(Array("spark", "flink"))
    ds2.print()
    //3. Create a DataStream from an ArrayBuffer
    val ds3: DataStream[String] = senv.fromCollection(ArrayBuffer("spark", "flink"))
    ds3.print()
    //4. Create a DataStream from a List
    val ds4: DataStream[String] = senv.fromCollection(List("spark", "flink"))
    ds4.print()
    //5. Create a DataStream from a ListBuffer
    val ds5: DataStream[String] = senv.fromCollection(ListBuffer("spark", "flink"))
    ds5.print()
    //6. Create a DataStream from a Vector
    val ds6: DataStream[String] = senv.fromCollection(Vector("spark", "flink"))
    ds6.print()
    //7. Create a DataStream from a Queue
    val ds7: DataStream[String] = senv.fromCollection(Queue("spark", "flink"))
    ds7.print()
    //8. Create a DataStream from a Stack
    val ds8: DataStream[String] = senv.fromCollection(Stack("spark", "flink"))
    ds8.print()
    //9. Create a DataStream from a Stream (a Stream is a lazy List, which avoids building unnecessary intermediate collections)
    val ds9: DataStream[String] = senv.fromCollection(Stream("spark", "flink"))
    ds9.print()
    //10. Create a DataStream from a Seq
    val ds10: DataStream[String] = senv.fromCollection(Seq("spark", "flink"))
    ds10.print()
    //11. Create a DataStream from a Set (not supported)
    //val ds11: DataStream[String] = senv.fromCollection(Set("spark", "flink"))
    //ds11.print()
    //12. Create a DataStream from an Iterable (not supported)
    //val ds12: DataStream[String] = senv.fromCollection(Iterable("spark", "flink"))
    //ds12.print()
    //13. Create a DataStream from an ArraySeq
    val ds13: DataStream[String] = senv.fromCollection(mutable.ArraySeq("spark", "flink"))
    ds13.print()
    //14. Create a DataStream from an ArrayStack
    val ds14: DataStream[String] = senv.fromCollection(mutable.ArrayStack("spark", "flink"))
    ds14.print()
    //15. Create a DataStream from a Map (not supported)
    //val ds15: DataStream[(Int, String)] = senv.fromCollection(Map(1 -> "spark", 2 -> "flink"))
    //ds15.print()
    //16. Create a DataStream from a Range
    val ds16: DataStream[Int] = senv.fromCollection(Range(1, 9))
    ds16.print()
    //17. Create a DataStream from a generated sequence (generateSequence)
    val ds17: DataStream[Long] = senv.generateSequence(1, 9)
    ds17.print()
    senv.execute(this.getClass.getName)
  }
}
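As the commented-out cases above show, fromCollection does not accept Set, Map, or Iterable, because they are not Seq types. A minimal workaround sketch (reusing the same senv as above; not part of the original example) is to convert a Map into a Seq of key/value tuples first:

//Sketch: stream the entries of a Map by converting it to a Seq of (key, value) tuples
val ds15: DataStream[(Int, String)] = senv.fromCollection(Map(1 -> "spark", 2 -> "flink").toSeq)
ds15.print()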
File based source

//TODO 2. File based source
//0. Create an execution environment
val env = StreamExecutionEnvironment.getExecutionEnvironment
//TODO 1. Read a local file
val text1 = env.readTextFile("data2.csv")
text1.print()
//TODO 2. Read a file from HDFS
val text2 = env.readTextFile("hdfs://hadoop01:9000/input/flink/README.txt")
text2.print()
env.execute()
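readTextFile reads the file once and the stream then finishes. If the job should keep watching a path and pick up data that appears later, a sketch along the following lines (assuming the Flink 1.x DataStream API and a hypothetical input directory) uses readFile with FileProcessingMode.PROCESS_CONTINUOUSLY:

import org.apache.flink.api.java.io.TextInputFormat
import org.apache.flink.core.fs.Path
import org.apache.flink.streaming.api.functions.source.FileProcessingMode
import org.apache.flink.streaming.api.scala._

object FileMonitorSource {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    //Hypothetical input directory; replace with your own path
    val path = "hdfs://hadoop01:9000/input/flink/"
    val format = new TextInputFormat(new Path(path))
    //Re-scan the path every 5 seconds and emit newly added data
    val text = env.readFile(format, path, FileProcessingMode.PROCESS_CONTINUOUSLY, 5000L)
    text.print()
    env.execute("file-monitor-source")
  }
}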
Socket based source

val source = env.socketTextStream("IP", PORT)
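A minimal end-to-end sketch of the socket source (host hadoop01 and port 9999 are placeholders; start a test server first, e.g. with nc -lk 9999) that reads lines and counts words:

import org.apache.flink.streaming.api.scala._

object SocketSourceWordCount {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    //Placeholder host and port of the socket server
    val source = env.socketTextStream("hadoop01", 9999)
    val counts = source
      .flatMap(_.toLowerCase.split("\\s+"))
      .filter(_.nonEmpty)
      .map((_, 1))
      .keyBy(_._1)
      .sum(1)
    counts.print()
    env.execute("socket-source-wordcount")
  }
}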
Custom source (take Kafka as an example)

Basic Kafka commands:
- List all topics on the current server
bin/kafka-topics.sh --list --zookeeper hadoop01:2181
- Create topic
bin/kafka-topics.sh --create --zookeeper hadoop01:2181 --replication-factor 1 --partitions 1 --topic test
- Delete topic
sh bin/kafka-topics.sh --delete --zookeeper zk01:2181 --topic test
You need to set delete.topic.enable=true in server.properties (see the snippet after this list); otherwise the command only marks the topic for deletion, and a broker restart is needed for it to take effect.
- Send messages through the shell
sh bin/kafka-console-producer.sh --broker-list hadoop01:9092 --topic test
- Consume messages through the shell
bin/kafka-console-consumer.sh --zookeeper hadoop01:2181 --from-beginning --topic test1
- View consumer group offsets
bin/kafka-run-class.sh kafka.tools.ConsumerOffsetChecker --zookeeper zk01:2181 --group testGroup
- View details of a Topic
bin/kafka-topics.sh --topic test --describe --zookeeper zk01:2181
- Modify the number of partitions
kafka-topics.sh --zookeeper zk01 --alter --partitions 15 --topic utopic
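For the topic-deletion note above, the relevant broker setting is a single line in server.properties (the file lives under the config directory of your Kafka installation):

# allow `kafka-topics.sh --delete` to actually remove topics
delete.topic.enable=true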
Use Flink to consume Kafka messages (this is not a standardized setup; you need to maintain the offsets manually yourself):
import java.util.Properties

import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer09
import org.apache.flink.streaming.util.serialization.SimpleStringSchema

/**
  * Created by angel;
  */
object DataSource_kafka {
  def main(args: Array[String]): Unit = {
    //1. Specify the Kafka connection information
    val zkCluster = "hadoop01,hadoop02,hadoop03:2181"
    val kafkaCluster = "hadoop01:9092,hadoop02:9092,hadoop03:9092"
    val kafkaTopicName = "test"
    //2. Create a stream processing environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    //3. Create the Kafka consumer
    val properties = new Properties()
    properties.setProperty("bootstrap.servers", kafkaCluster)
    properties.setProperty("zookeeper.connect", zkCluster)
    properties.setProperty("group.id", kafkaTopicName)
    val kafka09 = new FlinkKafkaConsumer09[String](kafkaTopicName, new SimpleStringSchema(), properties)
    //4. Add the Kafka consumer as a data source: addSource(kafka09)
    val text = env.addSource(kafka09).setParallelism(4)

    /**
      * Sample input record (fields separated by #CS#):
      * test#CS#request http://b2c.csair.com/B2C40/query/jaxb/direct/query.ao?t=S&c1=HLN&c2=CTU&d1=2018-07-12&at=2&ct=2&inf=1#CS#POST#CS#application/x-www-form-urlencoded#CS#t=S&json={'adultnum':'1','arrcity':'NAY','childnum':'0','depcity':'KHH','flightdate':'2018-07-12','infantnum':'2'}#CS#http://b2c.csair.com/B2C40/modules/bookingnew/main/flightSelectDirect.html?t=R&c1=LZJ&c2=MZG&d1=2018-07-12&at=1&ct=2&inf=2#CS#123.235.193.25#CS#Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1#CS#2018-01-19T10:45:13:578+08:00#CS#106.86.65.18#CS#cookie
      */
    val values: DataStream[ProcessedData] = text.map { line =>
      val values = line.split("#CS#")
      val valuesLength = values.length
      val regionalRequest = if (valuesLength > 1) values(1) else ""
      val requestMethod = if (valuesLength > 2) values(2) else ""
      val contentType = if (valuesLength > 3) values(3) else ""
      //Body of the POST request
      val requestBody = if (valuesLength > 4) values(4) else ""
      //http_referrer
      val httpReferrer = if (valuesLength > 5) values(5) else ""
      //Client IP
      val remoteAddr = if (valuesLength > 6) values(6) else ""
      //Client user agent
      val httpUserAgent = if (valuesLength > 7) values(7) else ""
      //Server time in ISO8601 format
      val timeIso8601 = if (valuesLength > 8) values(8) else ""
      //Server address
      val serverAddr = if (valuesLength > 9) values(9) else ""
      //Cookie string from the original record
      val cookiesStr = if (valuesLength > 10) values(10) else ""
      ProcessedData(regionalRequest, requestMethod, contentType, requestBody,
        httpReferrer, remoteAddr, httpUserAgent, timeIso8601, serverAddr, cookiesStr)
    }
    values.print()
    val remoteAddr: DataStream[String] = values.map(line => line.remoteAddr)
    remoteAddr.print()
    //5. Trigger execution
    env.execute("flink-kafka-wordcount")
  }
}

//Structured record for a parsed log line
case class ProcessedData(regionalRequest: String,
                         requestMethod: String,
                         contentType: String,
                         requestBody: String,
                         httpReferrer: String,
                         remoteAddr: String,
                         httpUserAgent: String,
                         timeIso8601: String,
                         serverAddr: String,
                         cookiesStr: String)
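If you would rather have Flink manage the Kafka offsets instead of tracking them by hand, one option (a sketch, assuming the same env and kafka09 as above, configured before the consumer is added as a source, and a Flink version whose FlinkKafkaConsumerBase exposes the setStartFrom* methods, i.e. 1.3+) is to enable checkpointing, so that the consumer commits its offsets back to Kafka whenever a checkpoint completes:

//Sketch: let offsets be managed through Flink checkpoints instead of manually
env.enableCheckpointing(5000) //checkpoint every 5 seconds; offsets are committed on completed checkpoints
kafka09.setStartFromGroupOffsets() //resume from the consumer group's committed offsets (the default behaviour)
//Alternatives: kafka09.setStartFromEarliest() or kafka09.setStartFromLatest()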