Processing Hierarchical Data (CTE-style) with PySpark SQL
from pyspark.sql.functions import broadcast

df = sqlContext.createDataFrame(
    [(1, 2), (2, 3), (3, 4), (4, 5), (6, 7), (7, 8), (9, 10)],
    "OldID integer, NewID integer",
)

dfcheck = df.drop('NewID')
dfdistinctID = df.select('NewID').distinct()

# We find the IDs that have not been replaced: a NewID that never
# appears again as an OldID is the endpoint of its chain.
dfidfinal = dfdistinctID.join(
    dfcheck,
    [dfcheck.OldID == dfdistinctID.NewID],
    how="left_anti",
)

# Find the rows corresponding to those final IDs
dfc..
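The listing above is cut off, but the usual next step in this pattern is to walk each chain from its starting OldID to its endpoint. Spark SQL has no recursive CTE, so a common workaround is to resolve the chain with repeated self-joins. The sketch below is my own illustration of that idea, not the article's truncated code: it reuses the same df, and the FinalID column name and the 20-iteration cap are assumptions.

from pyspark.sql import functions as F

# Resolve every OldID to the endpoint of its chain via iterative
# self-joins. Start from the direct OldID -> NewID mapping.
resolved = df.withColumnRenamed("NewID", "FinalID")

for _ in range(20):  # assumed hard cap; also guards against cycles in the data
    joined = (
        resolved.alias("r")
        .join(df.alias("m"), F.col("r.FinalID") == F.col("m.OldID"), "left")
    )
    # Stop once no FinalID matches any OldID, i.e. every chain is resolved.
    # (A count() per iteration is simple but costly on large data.)
    if joined.where(F.col("m.NewID").isNotNull()).count() == 0:
        break
    # Advance one hop: follow FinalID -> NewID where a match exists,
    # otherwise keep the current FinalID.
    resolved = joined.select(
        F.col("r.OldID"),
        F.coalesce(F.col("m.NewID"), F.col("r.FinalID")).alias("FinalID"),
    )

resolved.orderBy("OldID").show()
# With the sample data: 1 through 4 all resolve to 5, 6 and 7 to 8, 9 to 10.

Each pass advances every chain by one hop, so the number of iterations needed equals the depth of the longest chain; the early-exit check keeps the loop from running the full cap when the data is shallow.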