Академический Документы
Профессиональный Документы
Культура Документы
____ _ _____ _ _ _
| _ \(_) | __ \ | | | (_)
| |_) |_ __ _| | | | __ _| |_ __ _ _ __ ___ __| |_ __ _
| _ <| |/ _` | | | |/ _` | __/ _` | '_ \ / _ \/ _` | |/ _` |
| |_) | | (_| | |__| | (_| | || (_| | |_) | __/ (_| | | (_| |
|____/|_|\__, |_____/ \__,_|\__\__,_| .__/ \___|\__,_|_|\__,_|
__/ | | |
|___/ |_|
localhost:8889/nbconvert/html/00BPB/Bigdatapedia_crimedataset_pysparkML.ipynb?download=false 1/23
10/16/2019 Bigdatapedia_crimedataset_pysparkML
In [30]: from pyspark.sql.types import (StructType, StructField, DateType, BooleanType, DoubleType, IntegerType,
StringType, TimestampType)
# Explicit schema for the crimes data: 22 columns, every one declared nullable.
# "Date" is declared as a plain string here.
# NOTE(review): presumably it is parsed into a timestamp column in a later
# cell (the output shows a "Date_time" column) -- confirm.
crimes_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        # Identifiers and free-text attributes.
        ("ID", StringType()),
        ("CaseNumber", StringType()),
        ("Date", StringType()),
        ("Block", StringType()),
        ("IUCR", StringType()),
        ("PrimaryType", StringType()),
        ("Description", StringType()),
        ("LocationDescription", StringType()),
        # Boolean flags.
        ("Arrest", BooleanType()),
        ("Domestic", BooleanType()),
        # Area codes, kept as strings.
        ("Beat", StringType()),
        ("District", StringType()),
        ("Ward", StringType()),
        ("CommunityArea", StringType()),
        ("FBICode", StringType()),
        # Numeric coordinates, year, update date and location.
        ("XCoordinate", DoubleType()),
        ("YCoordinate", DoubleType()),
        ("Year", IntegerType()),
        ("UpdatedOn", DateType()),
        ("Latitude", DoubleType()),
        ("Longitude", DoubleType()),
        ("Location", StringType()),
    )
])
localhost:8889/nbconvert/html/00BPB/Bigdatapedia_crimedataset_pysparkML.ipynb?download=false 2/23
10/16/2019 Bigdatapedia_crimedataset_pysparkML
+-------------------+
| Date_time|
+-------------------+
|2018-08-01 23:21:00|
|2001-01-01 11:00:00|
|2017-11-23 15:14:00|
|2017-11-28 21:43:00|
|2017-11-12 19:15:00|
+-------------------+
only showing top 5 rows
In [34]: # df.printSchema()
+-------+-------------------+-------------------+------------------+------------------+------------------+
|summary| Latitude| Longitude| Year| X_Coordinate| Y_Coordinate|
+-------+-------------------+-------------------+------------------+------------------+------------------+
| count| 994634| 994634| 1048575| 994634| 994634|
| mean| 41.843625077813975| -87.67166759336945|2004.0785122666475| 1164546.959224197|1886308.3875787475|
| stddev|0.09283909948294168|0.06428058629862175| 4.631923885824727|17835.072111265592|33725.190122034575|
| min| 36.6194464| -91.68656568| 2001| 0| 0|
| max| 42.02291033| -87.52452938| 2018| 1205119| 1951622|
+-------+-------------------+-------------------+------------------+------------------+------------------+
localhost:8889/nbconvert/html/00BPB/Bigdatapedia_crimedataset_pysparkML.ipynb?download=false 3/23
10/16/2019 Bigdatapedia_crimedataset_pysparkML
+-------+----------+----------+-------+------------+------------+
|summary| Latitude| Longitude| year|X_Coordinate|Y_Coordinate|
+-------+----------+----------+-------+------------+------------+
| count|994,634.00|994,634.00|1048575| 994,634.00| 994,634.00|
| mean| 41.84| -87.67| 2004|1,164,547.00|1,886,308.38|
| stddev| 0.09| 0.06| 4| 17,835.07| 33,725.19|
| min| 36.62| -91.69| 2001| 0.00| 0.00|
| max| 42.02| -87.52| 2018|1,205,119.00|1,951,622.00|
+-------+----------+----------+-------+------------+------------+
Out[37]: 34
Usecases
Out[53]: 8219
localhost:8889/nbconvert/html/00BPB/Bigdatapedia_crimedataset_pysparkML.ipynb?download=false 4/23
10/16/2019 Bigdatapedia_crimedataset_pysparkML
Out[39]: 14578
+-----------------+-----------------+
|max(X_Coordinate)|min(X_Coordinate)|
+-----------------+-----------------+
| 1205119| 0|
+-----------------+-----------------+
+------------------+
| Mean_Latitude|
+------------------+
|41.843625077813975|
+------------------+
localhost:8889/nbconvert/html/00BPB/Bigdatapedia_crimedataset_pysparkML.ipynb?download=false 5/23
10/16/2019 Bigdatapedia_crimedataset_pysparkML
In [42]: df.groupBy("Year").count().orderBy("Year").show()
# df.groupBy("Year").count().collect()
+----+------+
|Year| count|
+----+------+
|2001|481933|
|2002|148966|
|2003| 1211|
|2004| 1797|
|2005|164029|
|2006|103473|
|2007| 4025|
|2008| 5569|
|2009| 46306|
|2010| 510|
|2011| 512|
|2012| 595|
|2013| 545|
|2014| 1247|
|2015| 13042|
|2016| 2605|
|2017| 24029|
|2018| 48181|
+----+------+
localhost:8889/nbconvert/html/00BPB/Bigdatapedia_crimedataset_pysparkML.ipynb?download=false 6/23
10/16/2019 Bigdatapedia_crimedataset_pysparkML
# Plot the number of crimes per calendar month as a line chart.

# Tag every record with the month extracted from its Date_time value.
monthdf = df.withColumn("Month", month("Date_time"))

# Count records per month in Spark, then bring the grouped rows to the driver.
monthCounts = monthdf.select("Month").groupBy("Month").count().collect()

# Split the collected rows into parallel lists: month value and its count.
months = [row[0] for row in monthCounts]
count = [row[1] for row in monthCounts]

# Build the pandas frame and order it by month so the line runs in month order
# (the grouped rows come back in no particular order).
crimes_per_month = pd.DataFrame(
    {"month": months, "crime_count": count}
).sort_values(by="month")

crimes_per_month.plot(
    x="month",
    y="crime_count",
    kind="line",
    figsize=(20, 10),
    color="red",
    linewidth=8,
    legend=False,
)

# Title, axis labels and tick sizes.
plt.title("Number of Crimes Per Month", fontsize=28)
plt.xlabel("Month", fontsize=18)
plt.ylabel("Number of Crimes", fontsize=18)
plt.xticks(size=18)
plt.yticks(size=18)
plt.show()
localhost:8889/nbconvert/html/00BPB/Bigdatapedia_crimedataset_pysparkML.ipynb?download=false 7/23
10/16/2019 Bigdatapedia_crimedataset_pysparkML
localhost:8889/nbconvert/html/00BPB/Bigdatapedia_crimedataset_pysparkML.ipynb?download=false 8/23
10/16/2019 Bigdatapedia_crimedataset_pysparkML
# Show record counts for every (Year, Month) pair in chronological order.
monthdf = df.withColumn("Month", month("Date_time"))
(
    monthdf
    .groupBy("Year", "Month")   # one group per year/month combination
    .count()
    .orderBy("Year", "Month")   # sort by year first, then month
    .show()                     # prints only the first 20 rows by default
)
+----+-----+-----+
|Year|Month|count|
+----+-----+-----+
|2001| 1|41523|
|2001| 2|34969|
|2001| 3|40334|
|2001| 4|39639|
|2001| 5|40709|
|2001| 6|40842|
|2001| 7|43494|
|2001| 8|42569|
|2001| 9|40255|
|2001| 10|41769|
|2001| 11|38744|
|2001| 12|37086|
|2002| 1|23235|
|2002| 2|19681|
|2002| 3|22772|
|2002| 4|22632|
|2002| 5|24733|
|2002| 6| 5278|
|2002| 7| 5184|
|2002| 8| 5370|
+----+-----+-----+
only showing top 20 rows
localhost:8889/nbconvert/html/00BPB/Bigdatapedia_crimedataset_pysparkML.ipynb?download=false 9/23
10/16/2019 Bigdatapedia_crimedataset_pysparkML
+--------------------+------+
|Location_Description| count|
+--------------------+------+
| STREET|303380|
| RESIDENCE|191271|
| APARTMENT| 82113|
| SIDEWALK| 78890|
| OTHER| 42807|
|PARKING LOT/GARAG...| 31730|
|SCHOOL, PUBLIC, B...| 23336|
| ALLEY| 23326|
| RESIDENCE-GARAGE| 21308|
|RESIDENCE PORCH/H...| 18521|
| SMALL RETAIL STORE| 17537|
| GROCERY FOOD STORE| 16556|
| RESTAURANT| 15755|
|VEHICLE NON-COMME...| 15481|
|CHA PARKING LOT/G...| 14915|
| DEPARTMENT STORE| 13216|
| GAS STATION| 10874|
|COMMERCIAL / BUSI...| 9875|
| CHA APARTMENT| 9581|
| PARK PROPERTY| 7381|
+--------------------+------+
only showing top 20 rows
localhost:8889/nbconvert/html/00BPB/Bigdatapedia_crimedataset_pysparkML.ipynb?download=false 10/23
10/16/2019 Bigdatapedia_crimedataset_pysparkML
localhost:8889/nbconvert/html/00BPB/Bigdatapedia_crimedataset_pysparkML.ipynb?download=false 11/23
10/16/2019 Bigdatapedia_crimedataset_pysparkML
localhost:8889/nbconvert/html/00BPB/Bigdatapedia_crimedataset_pysparkML.ipynb?download=false 12/23
10/16/2019 Bigdatapedia_crimedataset_pysparkML
# Show how many records fall on each weekday; the weekday number is grouped
# alongside the label so both appear in the printed table.
(
    df
    .groupBy("DayOfWeek", "DayOfWeek_number")
    .count()
    .show()
)
+---------+----------------+------+
|DayOfWeek|DayOfWeek_number| count|
+---------+----------------+------+
| Fri| 5|151654|
| Wed| 3|151630|
| Sat| 6|150399|
| Tue| 2|151330|
| Mon| 1|151550|
| Sun| 7|142815|
| Thu| 4|149197|
+---------+----------------+------+
localhost:8889/nbconvert/html/00BPB/Bigdatapedia_crimedataset_pysparkML.ipynb?download=false 13/23
10/16/2019 Bigdatapedia_crimedataset_pysparkML
localhost:8889/nbconvert/html/00BPB/Bigdatapedia_crimedataset_pysparkML.ipynb?download=false 14/23
10/16/2019 Bigdatapedia_crimedataset_pysparkML
localhost:8889/nbconvert/html/00BPB/Bigdatapedia_crimedataset_pysparkML.ipynb?download=false 15/23
10/16/2019 Bigdatapedia_crimedataset_pysparkML
+---------+-----+
|HourOfDay|count|
+---------+-----+
| 12| 5955|
| 22| 7761|
| 1| 5779|
| 13| 5198|
| 16| 6046|
| 6| 2271|
| 3| 3915|
| 20| 7250|
| 5| 2365|
| 19| 6874|
| 15| 5813|
| 17| 6228|
| 9| 4887|
| 4| 2967|
| 8| 4172|
| 23| 7361|
| 7| 2984|
| 10| 5210|
| 21| 7689|
| 11| 5118|
+---------+-----+
only showing top 20 rows
localhost:8889/nbconvert/html/00BPB/Bigdatapedia_crimedataset_pysparkML.ipynb?download=false 16/23
10/16/2019 Bigdatapedia_crimedataset_pysparkML
localhost:8889/nbconvert/html/00BPB/Bigdatapedia_crimedataset_pysparkML.ipynb?download=false 17/23
10/16/2019 Bigdatapedia_crimedataset_pysparkML
localhost:8889/nbconvert/html/00BPB/Bigdatapedia_crimedataset_pysparkML.ipynb?download=false 18/23
10/16/2019 Bigdatapedia_crimedataset_pysparkML
+---------+---------+-----+
|DayOfWeek|HourOfDay|count|
+---------+---------+-----+
| Sat| 14| 886|
| Tue| 15| 822|
| Fri| 23| 1087|
| Fri| 12| 798|
| Tue| 21| 1095|
| Thu| 14| 715|
| Sun| 13| 809|
| Mon| 22| 1114|
| Tue| 16| 854|
| Sun| 4| 611|
| Fri| 6| 286|
| Thu| 8| 611|
| Wed| 16| 810|
| Tue| 6| 332|
| Mon| 4| 321|
| Thu| 10| 686|
| Mon| 2| 644|
| Thu| 7| 381|
| Thu| 20| 964|
| Tue| 18| 963|
+---------+---------+-----+
only showing top 20 rows
localhost:8889/nbconvert/html/00BPB/Bigdatapedia_crimedataset_pysparkML.ipynb?download=false 19/23
10/16/2019 Bigdatapedia_crimedataset_pysparkML
localhost:8889/nbconvert/html/00BPB/Bigdatapedia_crimedataset_pysparkML.ipynb?download=false 20/23
10/16/2019 Bigdatapedia_crimedataset_pysparkML
localhost:8889/nbconvert/html/00BPB/Bigdatapedia_crimedataset_pysparkML.ipynb?download=false 21/23
10/16/2019 Bigdatapedia_crimedataset_pysparkML
localhost:8889/nbconvert/html/00BPB/Bigdatapedia_crimedataset_pysparkML.ipynb?download=false 22/23
10/16/2019 Bigdatapedia_crimedataset_pysparkML
In [ ]:
localhost:8889/nbconvert/html/00BPB/Bigdatapedia_crimedataset_pysparkML.ipynb?download=false 23/23