#!/usr/bin/python
# -*- coding: utf-8 -*-
#import pyhdfs
# top-level project directory
#from spark.demo import demo2
from pyspark.sql import SparkSession
class PropertiesUtil:
    def __init__(self):
        print("a")

    def formatPrint(self, x):
        """Split a comma-separated record into a list of fields."""
        strRes = []
        if x != "":
            strRes = str(x).split(",")
        return strRes
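
# Example (minimal sketch): PropertiesUtil().formatPrint("a,b,c")
# returns ['a', 'b', 'c']; an empty string yields [].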
if __name__ == "__main__":
    # __init__ is invoked automatically on instantiation
    d = PropertiesUtil()
    #print("b")
    #hdfsClient = pyhdfs.HdfsClient
    #flag = pyhdfs.HdfsClient.exists()
    #conf = SparkConf().setMaster("local[*]").setAppName("Test")
    #sc = SparkContext(conf)
    spark = SparkSession.builder \
        .master("local") \
        .appName("Test") \
        .enableHiveSupport() \
        .config("spark.executor.memory", "1g") \
        .getOrCreate()
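    # NOTE: enableHiveSupport() assumes a Hive configuration (e.g. hive-site.xml)
    # is available on the classpath; drop it if this job never reads Hive tables.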
sc = spark.sparkContext
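    # Read one day's merged status records from HDFS; each line is assumed
    # to be a comma-separated record such as "key,value".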
rdd = sc.textFile("hdfs://hadoop:9000/dev/nginx/logs/nongfu.mw/status/mergeonlinefile/2018-07-17")
#rdd.foreach(lambda x: print(x))
a = ""
b = ""
res = str()
#rdd.foreach(lambda x: d.formatPrint())
#rdd.top(1)
iterator = rdd.toLocalIterator()
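    # toLocalIterator() streams the RDD to the driver one partition at a time,
    # avoiding the memory cost of collect() on large inputs.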
    # pull back the small-file records that hold the saved state
    for i in iterator:
        parts = str(i).split(",")
        a = parts[0]
        b = parts[1]
        print('a: ', a, ' ,b: ', b)
    # heavier per-record processing, pushed out to the executors
    rdd.foreach(d.formatPrint)
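    # foreach() runs on the executors, so anything printed inside formatPrint
    # appears in the executor logs, not on the driver console.
    # Standard cleanup: release the SparkSession.
    spark.stop()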