log_of_songs = [ "Despacito", "Nice for what", "No tears left to cry", "Despacito", "Havana", "In my feelings", "Nice for what", "Despacito", "All the stars" ] play_count = 0 def count_plays(song_title): global play_count for song in log_of_songs: if song == song_title: play_count = play_count + 1 return play_count
from pyspark import SparkContext

log_of_songs = [
    "Despacito",
    "Nice for what",
    "No tears left to cry",
    "Despacito",
    "Havana",
    "In my feelings",
    "Nice for what",
    "Despacito",
    "All the stars"
]
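# The cell above imports SparkContext but stops short of using it. A minimal
# sketch of the distributed version of the play count, assuming a local
# SparkContext (the app name is illustrative, not from the original notebook):
sc = SparkContext(appName="CountPlays")

# distribute the song log across the cluster (here, local threads)
distributed_song_log = sc.parallelize(log_of_songs)

# count how often each song was played without mutating any global state
play_counts = distributed_song_log.map(lambda song: (song, 1)) \
                                  .reduceByKey(lambda a, b: a + b)
print(play_counts.collect())   # e.g. [('Despacito', 3), ('Havana', 1), ...]

sc.stop()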
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import desc
from pyspark.sql.functions import asc
from pyspark.sql.functions import sum as Fsum
import datetime
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from pyspark.ml.feature import RegexTokenizer, VectorAssembler, Normalizer, StandardScaler
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
import re

# create a SparkSession: note this step was left out of the screencast
spark = SparkSession.builder \
    .master("local") \
    .appName("Word Count") \
    .getOrCreate()
# read the dataset
stack_overflow_data = 'Train_onetag_small.json'
df = spark.read.json(stack_overflow_data)
df.head()

# Tokenization splits a string into separate words. Spark has a
# [Tokenizer](https://spark.apache.org/docs/latest/ml-features.html#tokenizer)
# class as well as RegexTokenizer; the latter gives more control over how the
# text is split.

# split the body text into separate words
regexTokenizer = RegexTokenizer(inputCol="Body", outputCol="words", pattern="\\W")
df = regexTokenizer.transform(df)
df.head()

# count the number of words in each body tag
body_length = udf(lambda x: len(x), IntegerType())
df = df.withColumn("BodyLength", body_length(df.words))

# count the number of paragraphs and links in each body tag
number_of_paragraphs = udf(lambda x: len(re.findall("</p>", x)), IntegerType())
number_of_links = udf(lambda x: len(re.findall("</a>", x)), IntegerType())
df = df.withColumn("NumParagraphs", number_of_paragraphs(df.Body))
df = df.withColumn("NumLinks", number_of_links(df.Body))
df.head(2)

# combine the body length, number of paragraphs, and number of links into a single vector
assembler = VectorAssembler(inputCols=["BodyLength", "NumParagraphs", "NumLinks"], outputCol="NumFeatures")
df = assembler.transform(df)
df.head()

# normalize the vector
scaler = Normalizer(inputCol="NumFeatures", outputCol="ScaledNumFeatures")
df = scaler.transform(df)
df.head(2)

# scale the vector
scaler2 = StandardScaler(inputCol="NumFeatures", outputCol="ScaledNumFeatures2", withStd=True)
scalerModel = scaler2.fit(df)
df = scalerModel.transform(df)
df.head(2)
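# A small inspection step (not in the original notebook) to compare the raw
# assembled features with the normalized and standardized versions side by side:
df.select("NumFeatures", "ScaledNumFeatures", "ScaledNumFeatures2").head(2)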
from pyspark.sql import SparkSession
from pyspark.ml.feature import RegexTokenizer, CountVectorizer, \
    IDF, StringIndexer
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
import re

# create a SparkSession: note this step was left out of the screencast
spark = SparkSession.builder \
    .master("local") \
    .appName("Word Count") \
    .getOrCreate()
# read the dataset (same file as in the feature extraction example above)
stack_overflow_data = 'Train_onetag_small.json'
df = spark.read.json(stack_overflow_data)

# Tokenization splits a string into separate words. Spark has a
# [Tokenizer](https://spark.apache.org/docs/latest/ml-features.html#tokenizer)
# class as well as RegexTokenizer; the latter gives more control over how the
# text is split.

# split the body text into separate words
regexTokenizer = RegexTokenizer(inputCol="Body", outputCol="words", pattern="\\W")
df = regexTokenizer.transform(df)
df.head()
# CountVectorizer
# find the term frequencies of the words
cv = CountVectorizer(inputCol="words", outputCol="TF", vocabSize=1000)
cvmodel = cv.fit(df)
df = cvmodel.transform(df)
df.take(1)

# show the vocabulary in order of frequency
cvmodel.vocabulary

# show the last 10 terms in the vocabulary
cvmodel.vocabulary[-10:]
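# The imports above also pull in IDF and StringIndexer, which this cell does not
# use. A minimal sketch of the usual next steps, assuming the "TF" column produced
# by CountVectorizer and a single-tag label column named "oneTag" (the label column
# name is an assumption about this dataset, not confirmed by the original code):

# inverse document frequency: down-weight terms that appear in many posts
idf = IDF(inputCol="TF", outputCol="TFIDF")
idfModel = idf.fit(df)
df = idfModel.transform(df)
df.head(1)

# StringIndexer: encode the string label column as numeric indices for ML models
indexer = StringIndexer(inputCol="oneTag", outputCol="label")
df = indexer.fit(df).transform(df)
df.head(1)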