Analyzing e-commerce user behavior with Flink
1. Real-time statistical analysis
1.1 Hot items statistics
-
Requirement: every 5 minutes, display the top N hot items on the site over the last hour.
-
Displayed data format:
Time window information:
NO 1: item ID + view count 1
NO 2: item ID + view count 2
NO 3: item ID + view count 3
-
Implementation idea:
- Since the final result must carry the window information plus the item ID (the keyBy key), a full-window function is needed to obtain the window end time together with the key.
- The view count is needed as well, so an incremental aggregate function is used: keyBy on the item ID, then aggregate each arriving record incrementally to maintain the count per item.
- Steps 1 and 2 only yield the view count of a single item per window. To rank all items viewed within the same hour, keyBy the aggregated results on the window end time in a KeyedProcessFunction, buffer each result in ListState, and register an event-time timer for the window end; when the timer fires, sort the contents of the ListState and emit the top N. (The POJOs the code relies on are sketched below.)
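The code below references two POJOs, ItemBean and ItemViewCount, that are not shown in the original post. The following is a minimal sketch with field names inferred from how the code uses them (an assumption, not the author's actual classes). Note that keyBy("itemId") and keyBy("windowEnd") require Flink POJO types: a public class with a public no-arg constructor and public fields (or getter/setter pairs).

// Hedged sketch, one class per file; field names are inferred from usage.
public class ItemBean {
    public Long userId;
    public Long itemId;
    public Integer categoryId;
    public String behavior;   // "pv", "buy", "cart", "fav"
    public Long timestamp;    // event time in epoch seconds

    public ItemBean() {}      // no-arg constructor required by Flink POJO rules

    public ItemBean(Long userId, Long itemId, Integer categoryId, String behavior, Long timestamp) {
        this.userId = userId; this.itemId = itemId; this.categoryId = categoryId;
        this.behavior = behavior; this.timestamp = timestamp;
    }

    public Long getUserId() { return userId; }
    public Long getItemId() { return itemId; }
    public String getBehavior() { return behavior; }
    public Long getTimestamp() { return timestamp; }
}

public class ItemViewCount {
    public Long itemId;
    public Long windowEnd;    // window end time in epoch millis
    public Long count;

    public ItemViewCount() {}

    public ItemViewCount(Long itemId, Long windowEnd, Long count) {
        this.itemId = itemId; this.windowEnd = windowEnd; this.count = count;
    }

    public Long getItemId() { return itemId; }
    public Long getWindowEnd() { return windowEnd; }
    public Long getCount() { return count; }
}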
code
/**
 * Requirement: count the hot items of the last hour, updating the result every 5 minutes.
 * How:
 * 1. The ranking is emitted when each window fires, i.e. every 5 minutes at the window end,
 *    from the per-item results collected for that window.
 *    Output: window end time + item ID + popularity.
 * 2. The popularity count comes from an incremental aggregate function; the window end time
 *    and the item ID come from a full-window function.
 *
 * Sample output:
 * Window end time: 2017-11-26 12:20:00.0
 * NO 1: item ID = 2338453 popularity = 27
 * NO 2: item ID = 812879 popularity = 18
 * NO 3: item ID = 4443059 popularity = 18
 * NO 4: item ID = 3810981 popularity = 14
 * NO 5: item ID = 2364679 popularity = 14
 */
public class HotItemsPractise {

    public static void main(String[] args) throws Exception {
        // 1. Set up the environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
        env.setParallelism(1);

        // 2. Prepare the data source: parse CSV lines and keep only "pv" (page view) events
        DataStreamSource<String> inputStream = env.readTextFile("/Users/liangfangwei/IdeaProjects/flinkUserAnalays/data_file/UserBehavior.csv");
        DataStream<ItemBean> filterStream = inputStream.map(line -> {
            String[] split = line.split(",");
            return new ItemBean(Long.parseLong(split[0]), Long.parseLong(split[1]),
                    Integer.parseInt(split[2]), split[3], Long.parseLong(split[4]));
        }).filter(item -> "pv".equals(item.getBehavior()));

        // 3. Per-item aggregation in 1-hour windows sliding every 5 minutes
        DataStream<ItemViewCount> windowsResult = filterStream
                .assignTimestampsAndWatermarks(new AscendingTimestampExtractor<ItemBean>() {
                    @Override
                    public long extractAscendingTimestamp(ItemBean element) {
                        return element.getTimestamp() * 1000L;
                    }
                })
                .keyBy("itemId")
                .timeWindow(Time.hours(1), Time.minutes(5))
                .aggregate(new MyAggreateCount(), new MyAllWindowsView());

        // 4. Group the per-item results by window end time and rank the top N
        SingleOutputStreamOperator<String> windowEnd = windowsResult
                .keyBy("windowEnd")
                .process(new ItemHotTopN(5));

        windowEnd.print();
        env.execute("HotItemsPractise");
    }

    /** Incremental aggregation: counts the views of one item within the window. */
    public static class MyAggreateCount implements AggregateFunction<ItemBean, Long, Long> {
        @Override
        public Long createAccumulator() { return 0L; }

        @Override
        public Long add(ItemBean value, Long accumulator) { return accumulator + 1L; }

        @Override
        public Long getResult(Long accumulator) { return accumulator; }

        @Override
        public Long merge(Long a, Long b) { return a + b; } // merge must combine both partial accumulators
    }

    /** Full-window function: attaches the window end time and the key (item ID) to the pre-aggregated count. */
    public static class MyAllWindowsView implements WindowFunction<Long, ItemViewCount, Tuple, TimeWindow> {
        @Override
        public void apply(Tuple tuple, TimeWindow window, Iterable<Long> input, Collector<ItemViewCount> out) throws Exception {
            long windowEnd = window.getEnd();
            long count = input.iterator().next();
            long itemId = tuple.getField(0);
            out.collect(new ItemViewCount(itemId, windowEnd, count));
        }
    }

    /** Buffers the items of one window in ListState; when the timer fires at window end, sorts and emits the top N. */
    public static class ItemHotTopN extends KeyedProcessFunction<Tuple, ItemViewCount, String> {
        ListState<ItemViewCount> itemViewCountListState;
        private int topN;

        public ItemHotTopN(int topN) { this.topN = topN; }

        @Override
        public void open(Configuration parameters) throws Exception {
            itemViewCountListState = getRuntimeContext().getListState(
                    new ListStateDescriptor<ItemViewCount>("itemViewCount", ItemViewCount.class));
        }

        @Override
        public void processElement(ItemViewCount itemViewCount, Context ctx, Collector<String> out) throws Exception {
            itemViewCountListState.add(itemViewCount);
            // Fires once the watermark passes the window end
            ctx.timerService().registerEventTimeTimer(itemViewCount.getWindowEnd() + 1L);
        }

        @Override
        public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
            // Copy ListState into an ArrayList, then sort by count descending
            ArrayList<ItemViewCount> arraylist = Lists.newArrayList(itemViewCountListState.get().iterator());
            arraylist.sort((o1, o2) -> o2.getCount().intValue() - o1.getCount().intValue());

            StringBuilder resultStringBuilder = new StringBuilder();
            resultStringBuilder.append("===================================\n");
            resultStringBuilder.append("Window end time: ").append(new Timestamp(timestamp)).append("\n");
            for (int i = 0; i < Math.min(topN, arraylist.size()); i++) {
                resultStringBuilder.append("NO ").append(i + 1)
                        .append(": item ID = ").append(arraylist.get(i).getItemId())
                        .append(" popularity = ").append(arraylist.get(i).getCount())
                        .append("\n");
            }
            resultStringBuilder.append("===================================\n");
            out.collect(resultStringBuilder.toString());
            Thread.sleep(1000L); // throttle console output
        }
    }
}
1.2 Hot pages statistics
- Requirement: every 5 minutes, output the hot pages viewed within the last hour.
- Sample output:
Window end time: 2015-05-18 13:08:50.0
NO 1: page URL = /blog/tags/puppet?flav=rss20 popularity = 11
NO 2: page URL = /projects/xdotool/xdotool.xhtml popularity = 5
NO 3: page URL = /projects/xdotool/ popularity = 4
NO 4: page URL = /?flav=rss20 popularity = 4
NO 5: page URL = /robots.txt popularity = 4
-
Implementation idea: unlike the previous case, the event timestamps in this source are not monotonically increasing, i.e. the data arrives out of order.
- How to make sure out-of-order data is not lost:
- 1. Generate the watermarks with an out-of-orderness bound that matches the source.
- 2. Give the window an allowed-lateness period. The window fires a first result as soon as the watermark passes its end, and each late record arriving within the lateness period triggers an updated result. For example, with a 1-second watermark bound and 1 minute of allowed lateness, a window ending at 10:00:00 fires first when the watermark passes 10:00:00 and keeps refining its output until 10:01:00.
- 3. Anything arriving later than that goes straight to a side output stream.
- How to make sure a late update overwrites the earlier result:
- 1. First the incremental aggregation, then the full-window function, then group by the window end time.
- 2. KeyBy the window end time and collect all per-page results belonging to that window, then sort and emit them.
- 3. When a late record arrives, the previously emitted result must be updated, so the per-window results are kept in MapState with the page URL as key and the count as value: putting the same URL again overwrites the old count. (The POJOs used by the code are sketched below.)
-
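As above, the ApacheLogEvent and PageViewCount POJOs are not part of the original post; this is a minimal sketch inferred from how the code constructs and reads them (field names are assumptions):

public class ApacheLogEvent {
    public String ip;
    public String userId;
    public Long timestamp;   // epoch millis, parsed from the log date
    public String method;    // "GET", "POST", ...
    public String url;

    public ApacheLogEvent() {}

    public ApacheLogEvent(String ip, String userId, Long timestamp, String method, String url) {
        this.ip = ip; this.userId = userId; this.timestamp = timestamp;
        this.method = method; this.url = url;
    }

    public Long getTimestamp() { return timestamp; }
    public String getMethod() { return method; }
    public String getUrl() { return url; }
}

public class PageViewCount {
    public String url;       // reused as a plain label ("uv") in section 1.3
    public Long windowEnd;   // epoch millis
    public Long count;

    public PageViewCount() {}

    public PageViewCount(String url, Long windowEnd, Long count) {
        this.url = url; this.windowEnd = windowEnd; this.count = count;
    }

    public String getUrl() { return url; }
    public Long getWindowEnd() { return windowEnd; }
    public Long getCount() { return count; }
}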
code
/**
 * Buddha bless, never BUG.
 *
 * Requirement: output the top 5 pages of the last hour every 5 minutes; the window collects
 * the statistics, the results are keyed by window end time, and the slide step is the output
 * interval. (The demo below uses a 10-minute window sliding every 5 seconds.)
 */
public class HotPages {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment executionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment();
        executionEnvironment.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
        executionEnvironment.setParallelism(1);

        DataStreamSource<String> stringDataStreamSource = executionEnvironment.readTextFile("/Users/liangfangwei/IdeaProjects/flinkUserAnalays/data_file/apache.log");
        SimpleDateFormat simpleFormatter = new SimpleDateFormat("dd/MM/yyyy:HH:mm:ss");
        OutputTag<ApacheLogEvent> lateTag = new OutputTag<ApacheLogEvent>("late_date") {};

        SingleOutputStreamOperator<PageViewCount> streamPageViewCount = stringDataStreamSource.map(line -> {
            String[] s = line.split(" ");
            // Convert the log date to an epoch-millisecond timestamp
            Long timestamp = simpleFormatter.parse(s[3]).getTime();
            return new ApacheLogEvent(s[0], s[1], timestamp, s[5], s[6]);
        }).filter(data -> "GET".equals(data.getMethod()))
          .filter(data -> {
              // Filter out static resources (.css/.js/.png/.ico/.jpg)
              String regex = "((?!\\.(css|js|png|ico|jpg)$).)*$";
              return Pattern.matches(regex, data.getUrl());
          })
          .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<ApacheLogEvent>(Time.seconds(1)) {
              @Override
              public long extractTimestamp(ApacheLogEvent apacheLogEvent) {
                  return apacheLogEvent.getTimestamp();
              }
          })
          .keyBy("url")
          .timeWindow(Time.minutes(10), Time.seconds(5))
          .allowedLateness(Time.minutes(1))   // keep windows open 1 extra minute for late data
          .sideOutputLateData(lateTag)        // anything later still goes to the side output
          .aggregate(new HotPageIncreaseAgg(), new HotPageAllAgg());

        SingleOutputStreamOperator<String> windowEnd = streamPageViewCount
                .keyBy("windowEnd")
                .process(new MyProcessFunction(5));

        // Console output
        windowEnd.print("data");
        // The late-data side output must be read from the window operator that produced it
        streamPageViewCount.getSideOutput(lateTag).print("late_date");
        executionEnvironment.execute();
    }

    public static class HotPageIncreaseAgg implements AggregateFunction<ApacheLogEvent, Long, Long> {
        @Override
        public Long createAccumulator() { return 0L; }

        @Override
        public Long add(ApacheLogEvent value, Long accumulator) { return accumulator + 1; }

        @Override
        public Long getResult(Long accumulator) { return accumulator; }

        @Override
        public Long merge(Long a, Long b) { return a + b; }
    }

    public static class HotPageAllAgg implements WindowFunction<Long, PageViewCount, Tuple, TimeWindow> {
        @Override
        public void apply(Tuple tuple, TimeWindow window, Iterable<Long> input, Collector<PageViewCount> out) throws Exception {
            String url = tuple.getField(0);
            Long count = input.iterator().next();
            long windowEnd = window.getEnd();
            out.collect(new PageViewCount(url, windowEnd, count));
        }
    }

    public static class MyProcessFunction extends KeyedProcessFunction<Tuple, PageViewCount, String> {
        private Integer topSize;
        MapState<String, Long> hotPageCount;

        public MyProcessFunction(Integer topSize) { this.topSize = topSize; }

        @Override
        public void open(Configuration parameters) throws Exception {
            hotPageCount = getRuntimeContext().getMapState(
                    new MapStateDescriptor<String, Long>("hot_page_count", String.class, Long.class));
        }

        /**
         * Late data must overwrite earlier results, so the counts live in MapState:
         * putting the same URL again replaces the previous count.
         * One minute after the window end (the allowed lateness) the state is cleared.
         */
        @Override
        public void processElement(PageViewCount pageViewCount, Context ctx, Collector<String> out) throws Exception {
            hotPageCount.put(pageViewCount.getUrl(), pageViewCount.getCount());
            ctx.timerService().registerEventTimeTimer(pageViewCount.getWindowEnd() + 1);
            // Cleanup timer at window end + allowed lateness, so the clear branch in onTimer can fire
            ctx.timerService().registerEventTimeTimer(pageViewCount.getWindowEnd() + 60 * 1000L);
        }

        /**
         * Emits the ranking held in the MapState.
         * A timer fires once the watermark reaches its registered time.
         */
        @Override
        public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
            Long currentKey = ctx.getCurrentKey().getField(0);
            // If this is the cleanup timer (window end + 1 minute), just clear the state
            if (timestamp == currentKey + 60 * 1000L) {
                hotPageCount.clear();
                return;
            }

            ArrayList<Map.Entry<String, Long>> pageViewCounts = Lists.newArrayList(hotPageCount.entries());
            pageViewCounts.sort((o1, o2) -> o2.getValue().compareTo(o1.getValue())); // descending by count

            StringBuilder stringBuilder = new StringBuilder();
            stringBuilder.append("===================================\n");
            stringBuilder.append("Window end time: ").append(new Timestamp(timestamp - 1)).append("\n");
            for (int i = 0; i < Math.min(topSize, pageViewCounts.size()); i++) {
                Map.Entry<String, Long> stringLongEntry = pageViewCounts.get(i);
                stringBuilder.append("NO ").append(i + 1).append(":")
                        .append(" page URL = ").append(stringLongEntry.getKey())
                        .append(" popularity = ").append(stringLongEntry.getValue())
                        .append("\n");
            }
            stringBuilder.append("===================================\n\n");

            Thread.sleep(1000L); // throttle console output
            out.collect(stringBuilder.toString());
        }
    }
}
1.3 Website UV statistics
-
Requirement: output the site UV (unique visitors) for each hour in real time.
-
Output format: window end time + number of unique visitors in that window.
-
Implementation idea:
-
1. Use a 1-hour tumbling window in which every record triggers the computation, so a custom trigger is needed.
-
2. The trigger fires the downstream logic for every record. Deduplication is done against Redis: when a record arrives, parse the user ID and compute its offset in a bitmap with a custom hash function. If the bit at that offset is already 1, the user has been counted and the record is discarded; if it is 0, set it to 1, read the count stored under the window end time, increment it, write it back, and emit the updated count.
-
Storage layout:
Count: a Redis hash with key "uv_page_count" and entries <window end time, visitor count>
userId: a bitmap whose Redis key is the window end time
Hash function: fold over the characters of the userId, each step computing previous result * seed + ASCII value of the current character (a worked example follows)
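To make the hash and offset computation concrete, here is a small standalone demo of the same character fold used by the BloomFilter class in the code below; it runs without Flink or Redis, and the userId value is made up for illustration:

public class HashDemo {
    public static void main(String[] args) {
        long capacity = 1L << 29;   // 2^29 bits; must be a power of two
        int seed = 61;
        String userId = "429";      // hypothetical userId

        long result = 0L;
        for (int i = 0; i < userId.length(); i++) {
            // polynomial rolling hash: previous result * seed + ASCII value of the current char
            result = result * seed + userId.charAt(i);
        }
        // since capacity is a power of two, masking with capacity - 1 equals result % capacity
        long offset = result & (capacity - 1);

        // '4' = 52, '2' = 50, '9' = 57:
        // ((0 * 61 + 52) * 61 + 50) * 61 + 57 = 196599, already below 2^29, so the mask leaves it unchanged
        System.out.println(offset); // prints 196599

        // Against Redis, the corresponding Jedis calls in the code below are:
        //   jedis.getbit(windowEndKey, offset)        -> true if this user was already counted
        //   jedis.setbit(windowEndKey, offset, true)  -> mark the user as seen
    }
}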
code
/**
 * @author LiangFangWei
 * @date 2021-12-21 15:55
 *
 * Buddha bless, never BUG.
 *
 * Requirement: output the UV (number of distinct users) of each hour in real time.
 * Idea: use a 1-hour tumbling window in which every record triggers the computation.
 * Computation logic:
 * 1. Look the current record up in a Redis bitmap:
 *    - the bitmap key is the window end time,
 *    - the offset is the hash of the userId.
 * 2. If the bit is not set, set it to 1, increment the count stored for this window end,
 *    write it back, and emit it; if the bit is already set, discard the record.
 */
public class HotUVWithBloomFilter {

    public static void main(String[] args) throws Exception {
        // 1. Environment setup
        StreamExecutionEnvironment executionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment();
        executionEnvironment.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
        executionEnvironment.setParallelism(1);

        // 2. Prepare the data
        DataStreamSource<String> inputStream = executionEnvironment.readTextFile("/Users/liangfangwei/IdeaProjects/flinkUserAnalays/data_file/UserBehavior.csv");
        SingleOutputStreamOperator<ItemBean> filterData = inputStream.map(line -> {
            String[] split = line.split(",");
            return new ItemBean(Long.parseLong(split[0]), Long.parseLong(split[1]),
                    Integer.parseInt(split[2]), split[3], Long.parseLong(split[4]));
        }).assignTimestampsAndWatermarks(new AscendingTimestampExtractor<ItemBean>() {
            @Override
            public long extractAscendingTimestamp(ItemBean element) {
                return element.getTimestamp() * 1000L;
            }
        }).filter(itemBean -> "pv".equals(itemBean.getBehavior()));

        // 3. A 1-hour tumbling window with a custom trigger: every record fires the
        //    computation immediately instead of waiting for the window to close
        SingleOutputStreamOperator<PageViewCount> streamOperator = filterData
                .timeWindowAll(Time.hours(1))
                .trigger(new UVTriigger())
                // 4. The window function checks the Redis bitmap for the current userID
                //    and updates the count if the user has not been seen yet
                .process(new UVProcessFunction());

        streamOperator.print();
        executionEnvironment.execute();
    }

    /** Fires and purges on every element; time-based triggering is disabled. */
    public static class UVTriigger extends Trigger<ItemBean, TimeWindow> {
        @Override
        public TriggerResult onElement(ItemBean element, long timestamp, TimeWindow window, TriggerContext ctx) throws Exception {
            return TriggerResult.FIRE_AND_PURGE;
        }

        @Override
        public TriggerResult onProcessingTime(long time, TimeWindow window, TriggerContext ctx) throws Exception {
            return TriggerResult.CONTINUE;
        }

        @Override
        public TriggerResult onEventTime(long time, TimeWindow window, TriggerContext ctx) throws Exception {
            return TriggerResult.CONTINUE;
        }

        @Override
        public void clear(TimeWindow window, TriggerContext ctx) throws Exception {
        }
    }

    public static class UVProcessFunction extends ProcessAllWindowFunction<ItemBean, PageViewCount, TimeWindow> {
        private Jedis jedis;
        private String pageCountKey = "uv_page_count";
        private BloomFilter bloomFilter;

        @Override
        public void open(Configuration parameters) throws Exception {
            jedis = new Jedis("localhost", 6379);
            bloomFilter = new BloomFilter(1 << 29); // 2^29 bits = 64 MB per window bitmap
        }

        /** Checks one record against Redis (the trigger purges the window, so there is exactly one element). */
        @Override
        public void process(Context context, Iterable<ItemBean> elements, Collector<PageViewCount> out) throws Exception {
            Long windowEnd1 = context.window().getEnd();
            String windowEnd = windowEnd1.toString();

            ItemBean itemBean = elements.iterator().next();
            Long userId = itemBean.getUserId();
            long offset = bloomFilter.hash(userId.toString(), 61);

            Boolean isExist = jedis.getbit(windowEnd, offset);
            if (!isExist) {
                // Mark the user as seen, then increment the count stored in a Redis hash:
                // key "uv_page_count", field = window end time, value = count
                jedis.setbit(windowEnd, offset, true);
                Long uvCount = 0L; // initial count value
                String uvCountString = jedis.hget(pageCountKey, windowEnd);
                if (StringUtils.isNoneBlank(uvCountString)) {
                    uvCount = Long.valueOf(uvCountString);
                }
                jedis.hset(pageCountKey, windowEnd, String.valueOf(uvCount + 1));
                out.collect(new PageViewCount("uv", windowEnd1, uvCount + 1));
            }
        }
    }

    /** A minimal bitmap filter with a single hash function (a simplified Bloom filter: hash collisions can cause undercounting). */
    public static class BloomFilter {
        // capacity is a power of two, so (result & (capacity - 1)) equals result % capacity
        private long capacity;

        public BloomFilter(long capacity) { this.capacity = capacity; }

        public long hash(String userId, int seed) {
            long result = 0L;
            for (int i = 0; i < userId.length(); i++) {
                result = result * seed + userId.charAt(i);
            }
            return result & (capacity - 1);
        }
    }
}
2. Business process and risk control
2.1 Page ad blacklist filtering
-
Requirement: output the click count of each ad in each province, over a 1-hour window that slides every 5 minutes. If a user clicks the same ad more than 3 times during one day, emit that user to a side output stream (blacklist); any further clicks by that user on that ad within the same day are treated as invalid and excluded from the statistics.
-
Output format:
-
blacklist-user> BlackAdUerInfo(uerId=937166, adId=1715, count=click over 3times.)
blacklist-user> BlackAdUerInfo(uerId=161501, adId=36156, count=click over 3times.)
--->> AdOutputInfo(province=beijing, windowEnd=2017-11-26 09:25:00.0, count=2)
--->> AdOutputInfo(province=guangdong, windowEnd=2017-11-26 09:25:00.0, count=5)
--->> AdOutputInfo(province=beijing, windowEnd=2017-11-26 09:25:00.0, count=2)
--->> AdOutputInfo(province=beijing, windowEnd=2017-11-26 09:30:00.0, count=2)
--->> AdOutputInfo(province=guangdong, windowEnd=2017-11-26 09:30:00.0, count=5)
--->> AdOutputInfo(province=shanghai, windowEnd=2017-11-26 09:30:00.0, count=2)
-
Statistical logic:
- Filtering abnormal data: keyBy userId + adId, then process each record in a KeyedProcessFunction. Check whether this user has reached the configured click threshold for this ad; if not, increment the count and forward the record. If the threshold is reached, emit the user's ID to the blacklist (side output stream) once, and register a timer for the next midnight; when it fires, the click-count state is cleared so the limit applies per day. (A sketch of the midnight arithmetic follows this list.)
- The downstream per-province click counting works the same way as the windowed aggregations in the previous sections.
-
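One detail worth making concrete is the cleanup time: the click-count state must survive until the next local midnight. A minimal standalone sketch of the arithmetic, assuming the UTC+8 zone implied by the original code (the same formula is used in the code below):

import java.time.Instant;

public class MidnightDemo {
    public static void main(String[] args) {
        long day = 24 * 60 * 60 * 1000L;    // one day in millis
        long offset = 8 * 60 * 60 * 1000L;  // UTC+8

        long now = Instant.now().toEpochMilli();
        // shift into UTC+8, round up to the next day boundary, shift back to UTC millis
        long clearTime = ((now + offset) / day + 1) * day - offset;

        // always prints a 16:00:00Z instant, i.e. 00:00:00 of the next day in UTC+8
        System.out.println(Instant.ofEpochMilli(clearTime));
    }
}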
code
/**
 * Buddha bless, never BUG.
 *
 * @author LiangFangWei
 * @date 2021-12-23 18:58
 * @description Count the clicks of each ad in each province; users who click the same ad
 * more than the allowed number of times in one day are diverted to a side output stream.
 *
 * Idea:
 * 1. Final output form: (province, window end time, count).
 * 2. An incremental aggregate produces the count; the full-window function adds the window end time and the key.
 * 3. Abnormal data is filtered first: if a user clicks the same ad more than the threshold on one day,
 *    the user goes to the side output.
 *    3.1 Keep per (user, ad) click-count state; once over the threshold, add the user to the blacklist
 *        and drop further records directly.
 *    3.2 Otherwise increment the count and forward the record.
 */
public class AdStatisticsByProvince {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment executionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment();
        executionEnvironment.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
        executionEnvironment.setParallelism(1);

        DataStream<String> inputStream = executionEnvironment.readTextFile("/Users/liangfangwei/IdeaProjects/flinkUserAnalays/data_file/AdClickLog.csv");
        DataStream<AdvertInfo> processStream1 = inputStream.map(line -> {
            String[] split = line.split(",");
            return new AdvertInfo(split[0], split[1], split[2], Long.parseLong(split[4]));
        }).assignTimestampsAndWatermarks(new AscendingTimestampExtractor<AdvertInfo>() {
            @Override
            public long extractAscendingTimestamp(AdvertInfo element) {
                return element.getTimeStramp() * 1000L;
            }
        });

        // Divert abnormal click data to the blacklist side output
        SingleOutputStreamOperator<AdvertInfo> fliterBlackStream = processStream1
                .keyBy("userId", "adId")
                .process(new BlackUserProcess(3));

        // Count clicks per province in 1-hour windows sliding every 5 minutes
        DataStream<AdOutputInfo> resultStream = fliterBlackStream
                .keyBy("province")
                .timeWindow(Time.hours(1), Time.minutes(5))
                .aggregate(new IncreaseAggreateEle(), new AllAggreateCount());

        fliterBlackStream.getSideOutput(new OutputTag<BlackAdUerInfo>("blacklist") {}).print("blacklist-user");
        resultStream.print("--->");
        executionEnvironment.execute();
    }

    /** Filters abnormal data; the click-count state is kept for one day only. */
    public static class BlackUserProcess extends KeyedProcessFunction<Tuple, AdvertInfo, AdvertInfo> {
        ValueState<Long> adClickCount;
        ValueState<Boolean> isBlackUser;
        private int bound;

        public BlackUserProcess(int bound) { this.bound = bound; }

        @Override
        public void open(Configuration parameters) throws Exception {
            adClickCount = getRuntimeContext().getState(new ValueStateDescriptor<Long>("ad_click_count", Long.class, 0L));
            isBlackUser = getRuntimeContext().getState(new ValueStateDescriptor<Boolean>("is_black_user", Boolean.class, false));
        }

        @Override
        public void processElement(AdvertInfo value, Context ctx, Collector<AdvertInfo> out) throws Exception {
            // 1. Read the current click count for this (user, ad) key
            Long userIdClickCount = adClickCount.value();

            // The clear time is derived from processing time, so register a
            // processing-time timer for the next midnight (UTC+8) that clears the state
            long dayMillis = 24 * 60 * 60 * 1000L;
            long zoneOffset = 8 * 60 * 60 * 1000L;
            long timestamp = ctx.timerService().currentProcessingTime();
            long clearTime = ((timestamp + zoneOffset) / dayMillis + 1) * dayMillis - zoneOffset;
            ctx.timerService().registerProcessingTimeTimer(clearTime);

            // 2. Threshold reached
            if (userIdClickCount >= bound) {
                // 2.1 Not yet blacklisted: emit the user to the side output once
                if (!isBlackUser.value()) {
                    isBlackUser.update(true);
                    ctx.output(new OutputTag<BlackAdUerInfo>("blacklist") {},
                            new BlackAdUerInfo(value.getUserId(), value.getAdId(),
                                    "click over " + userIdClickCount + "times."));
                }
                // 2.2 Already blacklisted: drop the record
                return;
            }

            // 3. Below the threshold: update the state and forward the record
            adClickCount.update(userIdClickCount + 1);
            out.collect(value);
        }

        @Override
        public void onTimer(long timestamp, OnTimerContext ctx, Collector<AdvertInfo> out) throws Exception {
            adClickCount.clear();
            isBlackUser.clear();
        }
    }

    public static class IncreaseAggreateEle implements AggregateFunction<AdvertInfo, Long, Long> {
        @Override
        public Long createAccumulator() { return 0L; }

        @Override
        public Long add(AdvertInfo value, Long accumulator) { return accumulator + 1; }

        @Override
        public Long getResult(Long accumulator) { return accumulator; }

        @Override
        public Long merge(Long a, Long b) { return a + b; }
    }

    public static class AllAggreateCount implements WindowFunction<Long, AdOutputInfo, Tuple, TimeWindow> {
        @Override
        public void apply(Tuple tuple, TimeWindow window, Iterable<Long> input, Collector<AdOutputInfo> out) throws Exception {
            Timestamp formateDate = new Timestamp(window.getEnd());
            out.collect(new AdOutputInfo(tuple.getField(0).toString(), formateDate.toString(), input.iterator().next()));
        }
    }
}
2.2 Malicious login monitoring
-
Requirement: detect users with several consecutive login failures within a short interval (the code below matches 3 consecutive failures within 5 seconds).
-
Implementation idea: with CEP, define a pattern for consecutive login failures within the time bound, apply the pattern to the keyed stream, and select the matched complex events from the resulting PatternStream. (The POJOs used by the code are sketched below.)
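The LoginInfo and LoginFailInfo POJOs are not shown in the original post; a minimal sketch inferred from usage (field names are assumptions):

public class LoginInfo {
    public String userId;
    public String status;    // "success" / "fail"
    public Long timeStamp;   // epoch seconds

    public LoginInfo() {}

    public LoginInfo(String userId, String status, Long timeStamp) {
        this.userId = userId; this.status = status; this.timeStamp = timeStamp;
    }

    public String getUserId() { return userId; }
    public String getStatus() { return status; }
    public Long getTimeStamp() { return timeStamp; }
}

public class LoginFailInfo {
    public String userId;
    public String firstFailTime;
    public String lastFailTime;
    public String message;

    public LoginFailInfo() {}

    public LoginFailInfo(String userId, String firstFailTime, String lastFailTime, String message) {
        this.userId = userId; this.firstFailTime = firstFailTime;
        this.lastFailTime = lastFailTime; this.message = message;
    }

    @Override
    public String toString() {
        return "LoginFailInfo{userId=" + userId + ", firstFailTime=" + firstFailTime
                + ", lastFailTime=" + lastFailTime + ", message=" + message + "}";
    }
}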
/**
 * Buddha bless, never BUG.
 *
 * @author LiangFangWei
 * @date 2021-12-26 17:11
 * @description Consecutive-login-failure detection: emits a record when a user fails
 * to log in several times in a row within the time bound (3 failures within 5 seconds below).
 */
public class LoginCheck {

    public static void main(String[] args) throws Exception {
        // 1. Set up the environment
        StreamExecutionEnvironment executionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment();
        executionEnvironment.setParallelism(1);
        executionEnvironment.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        DataStreamSource<String> stringDataStreamSource = executionEnvironment.readTextFile("/Users/liangfangwei/IdeaProjects/flinkUserAnalays/data_file/LoginLog.csv");

        // 2. Wrap the lines into objects; key per user so each user's events are matched independently
        KeyedStream<LoginInfo, Tuple> keyedStream = stringDataStreamSource.map(line -> {
            String[] split = line.split(",");
            return new LoginInfo(split[0], split[2], Long.parseLong(split[3]));
        }).assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<LoginInfo>(Time.seconds(3)) {
            @Override
            public long extractTimestamp(LoginInfo element) {
                return element.getTimeStamp() * 1000L;
            }
        }).keyBy("userId");

        // 3. Define the pattern
        // 3.1 Create the rule: 3 consecutive login failures within 5 seconds;
        //     consecutive() enforces strict contiguity (no other event in between)
        Pattern<LoginInfo, LoginInfo> failPattern = Pattern.<LoginInfo>begin("loginFailEvent")
                .where(new SimpleCondition<LoginInfo>() {
                    @Override
                    public boolean filter(LoginInfo value) throws Exception {
                        return "fail".equals(value.getStatus());
                    }
                }).times(3).consecutive().within(Time.seconds(5));

        // 3.2 Apply the pattern to the stream
        PatternStream<LoginInfo> pattern = CEP.pattern(keyedStream, failPattern);

        // 3.3 Select the matched data
        SingleOutputStreamOperator<LoginFailInfo> selectStream = pattern.select(new PatternSelectFunction<LoginInfo, LoginFailInfo>() {
            /** The Map holds the matched events, keyed by the pattern-state name. */
            @Override
            public LoginFailInfo select(Map<String, List<LoginInfo>> pattern) throws Exception {
                List<LoginInfo> loginFailEvent = pattern.get("loginFailEvent");
                LoginInfo firstFail = loginFailEvent.get(0);
                LoginInfo lastFail = loginFailEvent.get(loginFailEvent.size() - 1);
                String userId = firstFail.getUserId();
                Timestamp firstFailTimeStamp = new Timestamp(firstFail.getTimeStamp() * 1000L);
                Timestamp lastFailTimeStamp = new Timestamp(lastFail.getTimeStamp() * 1000L);
                return new LoginFailInfo(userId, firstFailTimeStamp.toString(), lastFailTimeStamp.toString(),
                        loginFailEvent.size() + " consecutive login failures");
            }
        });

        selectStream.print();
        executionEnvironment.execute();
    }
}
2.3 Order payment timeout monitoring
-
Requirement: detect, in real time, orders that are created but not paid within 15 minutes.
-
Implementation logic:
- Define the CEP pattern: an order "create" event followed by a "pay" event within 15 minutes.
- Apply the pattern to the stream.
- Select the results from the PatternStream, distinguishing completed matches from timed-out partial matches.
- Completed matches are handed to the select function in a Map keyed by the pattern-state name.
- Partial matches that time out are handed to the timeout function in the same Map form and routed to a side output stream. (POJO sketches for OrderInfo and OrderTimeoutInfo follow this list.)
-
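As in the earlier sections, the OrderInfo and OrderTimeoutInfo POJOs are sketched here with fields inferred from how the code uses them (the OrderLog.csv columns are assumed to be order ID, event type, transaction ID, timestamp):

public class OrderInfo {
    public String orderId;
    public String status;    // "create" / "pay"
    public String payId;     // transaction ID; the join key in section 2.4
    public Long timeStamp;   // epoch seconds

    public OrderInfo() {}

    public OrderInfo(String orderId, String status, String payId, Long timeStamp) {
        this.orderId = orderId; this.status = status;
        this.payId = payId; this.timeStamp = timeStamp;
    }

    public String getOrderId() { return orderId; }
    public String getStatus() { return status; }
    public Long getTimeStamp() { return timeStamp; }
}

public class OrderTimeoutInfo {
    public String orderId;
    public String eventType;  // "pay" or "timeout <timestamp>"

    public OrderTimeoutInfo() {}

    public OrderTimeoutInfo(String orderId, String eventType) {
        this.orderId = orderId; this.eventType = eventType;
    }

    @Override
    public String toString() {
        return "OrderTimeoutInfo{orderId=" + orderId + ", eventType=" + eventType + "}";
    }
}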
code
/**
 * Buddha bless, never BUG.
 *
 * @description Detect orders that are not paid within 15 minutes.
 */
public class OrderCheck {

    private static final Logger logger = LoggerFactory.getLogger(OrderCheck.class);

    public static void main(String[] args) throws Exception {
        // 1. Set up the environment
        StreamExecutionEnvironment executionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment();
        executionEnvironment.setParallelism(1);
        executionEnvironment.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        DataStreamSource<String> stringDataStreamSource = executionEnvironment.readTextFile("/Users/liangfangwei/IdeaProjects/flinkUserAnalays/data_file/OrderLog.csv");
        SingleOutputStreamOperator<OrderInfo> objectSingleOutputStreamOperator = stringDataStreamSource.map(line -> {
            String[] split = line.split(",");
            return new OrderInfo(split[0], split[1], split[2], Long.parseLong(split[3]));
        }).assignTimestampsAndWatermarks(new AscendingTimestampExtractor<OrderInfo>() {
            @Override
            public long extractAscendingTimestamp(OrderInfo element) {
                return element.getTimeStamp() * 1000L;
            }
        });

        // 2. Define the pattern: a "create" event followed by a "pay" event within 15 minutes
        Pattern<OrderInfo, OrderInfo> orderPayPattern = Pattern.<OrderInfo>begin("create")
                .where(new SimpleCondition<OrderInfo>() {
                    @Override
                    public boolean filter(OrderInfo value) throws Exception {
                        return "create".equals(value.getStatus());
                    }
                }).followedBy("pay").where(new SimpleCondition<OrderInfo>() {
                    @Override
                    public boolean filter(OrderInfo value) throws Exception {
                        return "pay".equals(value.getStatus());
                    }
                }).within(Time.minutes(15));

        // 3. Apply the pattern, keyed by order ID
        PatternStream<OrderInfo> orderStream = CEP.pattern(objectSingleOutputStreamOperator.keyBy("orderId"), orderPayPattern);

        OutputTag<OrderTimeoutInfo> outputTag = new OutputTag<OrderTimeoutInfo>("timeoutStream") {};

        // 4. Select matched and timed-out events
        SingleOutputStreamOperator<OrderTimeoutInfo> resultStream = orderStream.select(outputTag, new OrderTimeoutSelect(), new OrderPaySelect());

        resultStream.print("payed normally");
        resultStream.getSideOutput(outputTag).print("timeout");
        executionEnvironment.execute("order timeout detect job");
    }

    /**
     * When is a timeout decided: when the partial match does not complete within the defined time range.
     * Where is it output: timeout events go to the side output stream.
     */
    public static class OrderTimeoutSelect implements PatternTimeoutFunction<OrderInfo, OrderTimeoutInfo> {
        @Override
        public OrderTimeoutInfo timeout(Map<String, List<OrderInfo>> pattern, long timeoutTimestamp) throws Exception {
            logger.warn("order pay timeout: {}", pattern);
            OrderInfo orderInfo = pattern.get("create").get(0);
            return new OrderTimeoutInfo(orderInfo.getOrderId(), "timeout " + timeoutTimestamp);
        }
    }

    public static class OrderPaySelect implements PatternSelectFunction<OrderInfo, OrderTimeoutInfo> {
        @Override
        public OrderTimeoutInfo select(Map<String, List<OrderInfo>> pattern) throws Exception {
            OrderInfo orderInfo = pattern.get("pay").get(0);
            return new OrderTimeoutInfo(orderInfo.getOrderId(), "pay");
        }
    }
}
2.4 Real-time payment reconciliation
-
Dual-stream join: connect the pay-event stream with the bank-receipt stream, key both by the transaction ID, and match them in a CoProcessFunction. Each side buffers its event in ValueState and registers a timer; if the counterpart arrives in time the pair is emitted, otherwise the lone event goes to an "unmatched" side output. (A sketch of the Receipt POJO follows.)
-
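The Receipt POJO, again as a hedged sketch inferred from usage (the receipt log columns are assumed to be transaction ID, channel, timestamp):

public class Receipt {
    public String payId;     // transaction ID; the join key
    public String channel;   // e.g. "alipay", "wechat"
    public Long timeStamp;   // epoch seconds

    public Receipt() {}

    public Receipt(String payId, String channel, Long timeStamp) {
        this.payId = payId; this.channel = channel; this.timeStamp = timeStamp;
    }

    public String getPayId() { return payId; }
    public String getChannel() { return channel; }
    public Long getTimeStamp() { return timeStamp; }
}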
code
/**
 * Buddha bless, never BUG.
 *
 * @author LiangFangWei
 * @description Reconcile pay events against bank receipts to check whether each payment arrived.
 */
public class OrderPay {

    private final static OutputTag<OrderInfo> unmatchedPays = new OutputTag<OrderInfo>("unmatchedPays") {};
    private final static OutputTag<Receipt> unmatchedReceipts = new OutputTag<Receipt>("unmatchedReceipts") {};

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        // 1. Payment data
        DataStreamSource<String> inputSteam1 = env.readTextFile("/Users/liangfangwei/IdeaProjects/flinkUserAnalays/data_file/OrderLog.csv");
        SingleOutputStreamOperator<OrderInfo> orderStream = inputSteam1.map(line -> {
            String[] split = line.split(",");
            return new OrderInfo(split[0], split[1], split[2], Long.parseLong(split[3]));
        }).assignTimestampsAndWatermarks(new AscendingTimestampExtractor<OrderInfo>() {
            @Override
            public long extractAscendingTimestamp(OrderInfo element) {
                return element.getTimeStamp() * 1000L;
            }
        });

        // 2. Receipt data
        // assumption: the receipt stream should read the receipt log rather than OrderLog.csv again
        DataStreamSource<String> inputStream2 = env.readTextFile("/Users/liangfangwei/IdeaProjects/flinkUserAnalays/data_file/ReceiptLog.csv");
        SingleOutputStreamOperator<Receipt> payStream = inputStream2.map(line -> {
            String[] split = line.split(",");
            return new Receipt(split[0], split[1], Long.parseLong(split[2]));
        }).assignTimestampsAndWatermarks(new AscendingTimestampExtractor<Receipt>() {
            @Override
            public long extractAscendingTimestamp(Receipt element) {
                return element.getTimeStamp() * 1000L;
            }
        });

        // 3. Dual-stream join: connect both streams on the transaction ID
        SingleOutputStreamOperator<Tuple2<OrderInfo, Receipt>> resultStream = orderStream.keyBy("payId")
                .connect(payStream.keyBy("payId"))
                .process(new DoubleStreamJoinProcess());

        // 4. Matched pairs on the main stream, unmatched events on the side outputs
        resultStream.print("matched");
        resultStream.getSideOutput(unmatchedPays).print("unmatchedPays");
        resultStream.getSideOutput(unmatchedReceipts).print("unmatchedReceipts");
        env.execute();
    }

    public static class DoubleStreamJoinProcess extends CoProcessFunction<OrderInfo, Receipt, Tuple2<OrderInfo, Receipt>> {
        ValueState<OrderInfo> payState;
        ValueState<Receipt> receiptState;

        @Override
        public void open(Configuration parameters) throws Exception {
            payState = getRuntimeContext().getState(new ValueStateDescriptor<OrderInfo>("pay", OrderInfo.class));
            receiptState = getRuntimeContext().getState(new ValueStateDescriptor<Receipt>("receipt", Receipt.class));
        }

        @Override
        public void processElement1(OrderInfo orderInfo, Context ctx, Collector<Tuple2<OrderInfo, Receipt>> out) throws Exception {
            // A pay event arrived: check whether the matching receipt is already buffered
            Receipt receipt = receiptState.value();
            if (receipt != null) {
                out.collect(new Tuple2<>(orderInfo, receipt));
                receiptState.clear();
            } else {
                // Wait up to 5 seconds (event time) for the receipt
                payState.update(orderInfo);
                ctx.timerService().registerEventTimeTimer(orderInfo.getTimeStamp() * 1000L + 5000L);
            }
        }

        @Override
        public void processElement2(Receipt receipt, Context ctx, Collector<Tuple2<OrderInfo, Receipt>> out) throws Exception {
            // A receipt arrived: check whether the matching pay event is already buffered
            OrderInfo orderInfo = payState.value();
            if (orderInfo != null) {
                out.collect(new Tuple2<>(orderInfo, receipt));
                payState.clear();
            } else {
                receiptState.update(receipt);
                ctx.timerService().registerEventTimeTimer(receipt.getTimeStamp() * 1000L + 5000L);
            }
        }

        @Override
        public void onTimer(long timestamp, OnTimerContext ctx, Collector<Tuple2<OrderInfo, Receipt>> out) throws Exception {
            // Whatever is still buffered when the timer fires had no counterpart in time
            if (payState.value() != null) {
                ctx.output(unmatchedPays, payState.value());
            }
            if (receiptState.value() != null) {
                ctx.output(unmatchedReceipts, receiptState.value());
            }
            payState.clear();
            receiptState.clear();
        }
    }
}