Hallo, ich habe die folgenden DataFrames, und wenn ich sie per Join verknüpfe, erhalte ich den Fehler „AssertionError: on should be Column or list of Column“. Wie kann ich das bitte beheben? Über Google konnte ich keine Lösung dazu finden.
# Load the gzipped page-click JSON files matched by the glob pattern.
Pages = sc.read.json("/Users/me/desktop/clickstream/Clicks/Page*.json.gz")

# Normalise each page URL, look it up in ConfiguredUrls and keep one distinct
# row per (SessionNumber, Session, CsaNumber, EndQuote, URL).
Pages_Dataset = (
    Pages.select("SessionNumber", "PageLocation", "PageInstanceID")
    .withColumnRenamed("PageLocation", "URL")
    .withColumnRenamed("PageInstanceID", "CsaNumber")
    # URL2: URL cut off before the first '=', then '?', then '#'
    # (left unchanged when the character does not occur).
    .withColumn("URL2", expr("CASE WHEN INSTR(URL, '=') > 0 THEN SUBSTR(URL,0,INSTR(URL, '=') -1) ELSE URL END"))
    .withColumn("URL2", expr("CASE WHEN INSTR(URL2, '?') > 0 THEN SUBSTR(URL2,0,INSTR(URL2, '?') -1) ELSE URL2 END"))
    .withColumn("URL2", expr("CASE WHEN INSTR(URL2, '#') > 0 THEN SUBSTR(URL2,0,INSTR(URL2, '#') -1) ELSE URL2 END"))
    # URL3: the two characters following 'prdcls=' in the original URL,
    # or an empty string when that parameter is absent.
    .withColumn("URL3", expr("CASE WHEN INSTR(URL, 'prdcls=') > 0 THEN SUBSTR(URL,INSTR(URL, 'prdcls=')+7,2) ELSE '' END"))
    # Replace URL with the trimmed URL plus the product-class suffix.
    .withColumn("URL", concat("URL2", "URL3"))
    .select("SessionNumber", "URL", "CsaNumber")
    .alias("a")
    # Case-insensitive left join: pages without a configured URL are kept.
    .join(ConfiguredUrls.alias("b"), lower("a.URL") == lower("b.URL"), "left")
    .select("a.SessionNumber", "b.Product", "a.CsaNumber", "b.EndQuote", "a.URL")
    .withColumnRenamed("Product", "Session")
    # URLs under the deals path are forced to 'Mobile Phones' for both
    # Session and EndQuote, regardless of the lookup result.
    .withColumn("Session", expr("CASE WHEN lower(URL) like 'https://mobilephones.com/deals/%' THEN 'Mobile Phones' ELSE Session END"))
    .withColumn("EndQuote", expr("CASE WHEN lower(URL) like 'https://mobilephones.com/deals/%' THEN 'Mobile Phones' ELSE EndQuote END"))
    .distinct()
)
# Map each goal event to a session label via ConfiguredGoals and keep one
# distinct row per (SessionNumber, Session, CsaNumber, EndQuote).
# GoalDate is renamed and carried into the join input but is not part of the
# final projection.
Goals_Dataset = (
    Goals.select("SessionNumber", "GoalName", "PageInstanceID", "EventTimestamp")
    .withColumnRenamed("EventTimestamp", "GoalDate")
    .withColumnRenamed("PageInstanceID", "CsaNumber")
    .select("SessionNumber", "GoalName", "CsaNumber", "GoalDate")
    .alias("a")
    # Case-insensitive left join: goals without a configured name are kept.
    .join(ConfiguredGoals.alias("b"), lower("a.GoalName") == lower("b.GoalNameValue"), "left")
    # Session is the first non-null of StartQuote, EndQuote and Switch.
    .select("a.SessionNumber", coalesce("b.StartQuote", "b.EndQuote", "b.Switch").alias("Session"), "a.CsaNumber", "b.EndQuote")
    .distinct()
)
# Full outer join of the page-derived and goal-derived rows on SessionNumber;
# each output column takes the page-side value ("a") when present, otherwise
# the goal-side value ("b"), and duplicate rows are dropped.
#
# NOTE(review): this is the statement that raises
# "AssertionError: on should be Column or list of Column".
# The join condition "a.SessionNumber" == "b.SessionNumber" compares two plain
# Python strings, so it evaluates to the bool False before join() is called.
# join() needs a Column expression, e.g.
#   col("a.SessionNumber") == col("b.SessionNumber")
# The code is intentionally left as posted because it is the subject of the
# question.
Session_Dataset = Pages_Dataset.select("SessionNumber", "Session", "CsaNumber", "EndQuote").alias("a")\
.join(Goals_Dataset.alias("b"), "a.SessionNumber" == "b.SessionNumber", "fullouter")\
.select(coalesce("a.SessionNumber", "b.SessionNumber").alias("SessionNumber"), coalesce("a.Session", "b.Session").alias("Session"), coalesce("a.CsaNumber", "b.CsaNumber").alias("CsaNumber"), coalesce("a.EndQuote", "b.EndQuote").alias("EndQuote"))\
.distinct()
#Error:
Session_Dataset = Pages_Dataset.select("SessionNumber", "Session", "CsaNumber", "EndQuote").alias("a")\
File "/usr/local/Cellar/apache-spark/3.2.1/libexec/python/lib/pyspark.zip/pyspark/sql/dataframe.py", line 1343, in join
AssertionError: on should be Column or list of Column
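Lösung des Problems
Die Join-Bedingung
"a.SessionNumber" == "b.SessionNumber"
vergleicht zwei gewöhnliche Python-Strings und ergibt deshalb den booleschen Wert False statt eines Column-Ausdrucks – genau das löst den AssertionError aus. Die Bedingung sollte stattdessen
col("a.SessionNumber") == col("b.SessionNumber")
lauten, oder man übergibt einfach den Spaltennamen "SessionNumber".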
Keine Kommentare:
Kommentar veröffentlichen