Working with the crime dataset via PySpark & ML


In [28]: from bigdatapedia import BigDatapedia
BigDatapedia.logo_terminal("BigDatapedia")

____ _ _____ _ _ _
| _ \(_) | __ \ | | | (_)
| |_) |_ __ _| | | | __ _| |_ __ _ _ __ ___ __| |_ __ _
| _ <| |/ _` | | | |/ _` | __/ _` | '_ \ / _ \/ _` | |/ _` |
| |_) | | (_| | |__| | (_| | || (_| | |_) | __/ (_| | | (_| |
|____/|_|\__, |_____/ \__,_|\__\__,_| .__/ \___|\__,_|_|\__,_|
__/ | | |
|___/ |_|

In [29]: import pyspark
         from pyspark.sql import SparkSession

         spark = SparkSession.builder.appName("Pyspark_SampleProject").getOrCreate()

In [30]: from pyspark.sql.types import (StructType, StructField, DateType, BooleanType,
                                        DoubleType, IntegerType, StringType, TimestampType)

         crimes_schema = StructType([StructField("ID", StringType(), True),
                                     StructField("CaseNumber", StringType(), True),
                                     StructField("Date", StringType(), True),
                                     StructField("Block", StringType(), True),
                                     StructField("IUCR", StringType(), True),
                                     StructField("PrimaryType", StringType(), True),
                                     StructField("Description", StringType(), True),
                                     StructField("LocationDescription", StringType(), True),
                                     StructField("Arrest", BooleanType(), True),
                                     StructField("Domestic", BooleanType(), True),
                                     StructField("Beat", StringType(), True),
                                     StructField("District", StringType(), True),
                                     StructField("Ward", StringType(), True),
                                     StructField("CommunityArea", StringType(), True),
                                     StructField("FBICode", StringType(), True),
                                     StructField("XCoordinate", DoubleType(), True),
                                     StructField("YCoordinate", DoubleType(), True),
                                     StructField("Year", IntegerType(), True),
                                     StructField("UpdatedOn", DateType(), True),
                                     StructField("Latitude", DoubleType(), True),
                                     StructField("Longitude", DoubleType(), True),
                                     StructField("Location", StringType(), True)])

In [31]: crimes = spark.read.csv("file:///D:/00STUDIES/00ANALYTICS/00Hadoop/1UBUNTU_SHARED_Folder/USE_CASES/Crime_data/Crimes_2001_sample.csv",
                                 header = True, inferSchema = True)
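
Note that crimes_schema is defined above but never handed to the reader: with inferSchema=True, Spark samples the file to derive the column types and takes the column names (Primary_Type, X_Coordinate, and so on) from the CSV header. A minimal sketch of applying the hand-built schema instead, assuming the file's column order lines up with the StructFields:

    crimes = spark.read.csv("file:///D:/00STUDIES/00ANALYTICS/00Hadoop/1UBUNTU_SHARED_Folder/USE_CASES/Crime_data/Crimes_2001_sample.csv",
                            header = True, schema = crimes_schema)  # schema names would override the header row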

In [32]: print(" The crimes dataframe has {} records".format(crimes.count()))

The crimes dataframe has 1048575 records

In [33]: from datetime import datetime
         from pyspark.sql.functions import col, udf

         # Parse the Date string into a proper timestamp column
         myfunc = udf(lambda x: datetime.strptime(x, '%m/%d/%Y %H:%M:%S'), TimestampType())
         df = crimes.withColumn('Date_time', myfunc(col('Date'))).drop("Date")
         df.select(df["Date_time"]).show(5)

+-------------------+
| Date_time|
+-------------------+
|2018-08-01 23:21:00|
|2001-01-01 11:00:00|
|2017-11-23 15:14:00|
|2017-11-28 21:43:00|
|2017-11-12 19:15:00|
+-------------------+
only showing top 5 rows
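
A Python UDF like myfunc serializes every row between the JVM and a Python worker. Spark's built-in to_timestamp does the same parse natively; a minimal sketch, assuming Spark 2.2+ and the same month/day/year layout as the strptime pattern above:

    from pyspark.sql.functions import to_timestamp, col

    # Same result as the UDF, but evaluated inside the JVM
    df = crimes.withColumn("Date_time", to_timestamp(col("Date"), "MM/dd/yyyy HH:mm:ss")).drop("Date")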

In [34]: # df.printSchema()

In [35]: # Calculate statistics of numeric and string columns
         crimes.select(["Latitude","Longitude","Year","X_Coordinate","Y_Coordinate"]).describe().show()

+-------+-------------------+-------------------+------------------+------------------+------------------+
|summary| Latitude| Longitude| Year| X_Coordinate| Y_Coordinate|
+-------+-------------------+-------------------+------------------+------------------+------------------+
| count| 994634| 994634| 1048575| 994634| 994634|
| mean| 41.843625077813975| -87.67166759336945|2004.0785122666475| 1164546.959224197|1886308.3875787475|
| stddev|0.09283909948294168|0.06428058629862175| 4.631923885824727|17835.072111265592|33725.190122034575|
| min| 36.6194464| -91.68656568| 2001| 0| 0|
| max| 42.02291033| -87.52452938| 2018| 1205119| 1951622|
+-------+-------------------+-------------------+------------------+------------------+------------------+

In [36]: from pyspark.sql.functions import format_number

         result = crimes.select(["Latitude","Longitude","Year","X_Coordinate","Y_Coordinate"]).describe()
result.select(result['summary'],
format_number(result['Latitude'].cast('float'),2).alias('Latitude'),
format_number(result['Longitude'].cast('float'),2).alias('Longitude'),
result['Year'].cast('int').alias('year'),
format_number(result['X_Coordinate'].cast('float'),2).alias('X_Coordinate'),
format_number(result['Y_Coordinate'].cast('float'),2).alias('Y_Coordinate')
).show()

+-------+----------+----------+-------+------------+------------+
|summary| Latitude| Longitude| year|X_Coordinate|Y_Coordinate|
+-------+----------+----------+-------+------------+------------+
| count|994,634.00|994,634.00|1048575| 994,634.00| 994,634.00|
| mean| 41.84| -87.67| 2004|1,164,547.00|1,886,308.38|
| stddev| 0.09| 0.06| 4| 17,835.07| 33,725.19|
| min| 36.62| -91.69| 2001| 0.00| 0.00|
| max| 42.02| -87.52| 2018|1,205,119.00|1,951,622.00|
+-------+----------+----------+-------+------------+------------+

In [37]: # How many primary crime types are there?
         crimes.select("Primary_Type").distinct().count()

Out[37]: 34
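
To see the 34 categories themselves rather than just count them, a quick sketch:

    crimes.select("Primary_Type").distinct().orderBy("Primary_Type").show(34, truncate = False)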

Use cases

1. How many homicides are there in the dataset?

In [53]: crimes.where(crimes["Primary_Type"] == "HOMICIDE").count()

Out[53]: 8219

2. How many domestic assaults are there?

In [39]: crimes.filter((crimes["Primary_Type"] == "ASSAULT") & (crimes["Domestic"] == "True")).count()

Out[39]: 14578

In [40]: # Pull a single aggregated value back to the driver with agg + collect
         lat_max = crimes.agg({"Latitude" : "max"}).collect()[0][0]
         print("The maximum latitude value is {}".format(lat_max))

The maximum latitude value is 42.02291033

In [41]: from pyspark.sql.functions import max, min, mean

         df.select(max("X_Coordinate"), min("X_Coordinate")).show()
         df.select(mean("Latitude").alias("Mean_Latitude")).show()

+-----------------+-----------------+
|max(X_Coordinate)|min(X_Coordinate)|
+-----------------+-----------------+
| 1205119| 0|
+-----------------+-----------------+

+------------------+
| Mean_Latitude|
+------------------+
|41.843625077813975|
+------------------+

3. Find the number of crimes per year

In [42]: df.groupBy("Year").count().orderBy("Year").show()
# df.groupBy("Year").count().collect()

+----+------+
|Year| count|
+----+------+
|2001|481933|
|2002|148966|
|2003| 1211|
|2004| 1797|
|2005|164029|
|2006|103473|
|2007| 4025|
|2008| 5569|
|2009| 46306|
|2010| 510|
|2011| 512|
|2012| 595|
|2013| 545|
|2014| 1247|
|2015| 13042|
|2016| 2605|
|2017| 24029|
|2018| 48181|
+----+------+
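
For charting, a grouped result this small can be pulled straight into pandas with toPandas() instead of assembling lists from collect(), as the later cells do. A minimal sketch:

    import pandas as pd
    import matplotlib.pyplot as plt

    # One row per year, columns "Year" and "count"
    year_counts = df.groupBy("Year").count().orderBy("Year").toPandas()
    year_counts.plot(kind = "bar", x = "Year", y = "count", legend = False)
    plt.show()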

4. Plot number of crimes by month

In [43]: import pandas as pd
         import matplotlib.pyplot as plt
         %matplotlib inline
         from pyspark.sql.functions import month

monthdf = df.withColumn("Month",month("Date_time"))
monthCounts = monthdf.select("Month").groupBy("Month").count()
monthCounts = monthCounts.collect()
months = [item[0] for item in monthCounts]
count = [item[1] for item in monthCounts]
crimes_per_month = {"month":months, "crime_count": count}
crimes_per_month = pd.DataFrame(crimes_per_month)
crimes_per_month = crimes_per_month.sort_values(by = "month")
crimes_per_month.plot(figsize = (20,10), kind = "line", x = "month", y = "crime_count",
color = "red", linewidth = 8, legend = False)
plt.xlabel("Month", fontsize = 18)
plt.ylabel("Number of Crimes", fontsize = 18)
plt.title("Number of Crimes Per Month", fontsize = 28)
plt.xticks(size = 18)
plt.yticks(size = 18)
plt.show()

[Figure: Number of Crimes Per Month (line plot)]

In [44]: from pyspark.sql.functions import month

monthdf = df.withColumn("Month",month("Date_time"))
monthdf.select("Year", "Month").groupBy("Year", "Month").count().orderBy("Year", "Month").show()

+----+-----+-----+
|Year|Month|count|
+----+-----+-----+
|2001| 1|41523|
|2001| 2|34969|
|2001| 3|40334|
|2001| 4|39639|
|2001| 5|40709|
|2001| 6|40842|
|2001| 7|43494|
|2001| 8|42569|
|2001| 9|40255|
|2001| 10|41769|
|2001| 11|38744|
|2001| 12|37086|
|2002| 1|23235|
|2002| 2|19681|
|2002| 3|22772|
|2002| 4|22632|
|2002| 5|24733|
|2002| 6| 5278|
|2002| 7| 5184|
|2002| 8| 5370|
+----+-----+-----+
only showing top 20 rows
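
The same counts are easier to scan as one row per year with a column per month; a minimal sketch using pivot:

    # Year down the side, months 1-12 across the top
    monthdf.groupBy("Year").pivot("Month").count().orderBy("Year").show()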

5. Where do most crimes take place?

In [45]: from pyspark.sql.functions import col

         crimes.groupBy("Location_Description").count().orderBy(col("count").desc()).show()

+--------------------+------+
|Location_Description| count|
+--------------------+------+
| STREET|303380|
| RESIDENCE|191271|
| APARTMENT| 82113|
| SIDEWALK| 78890|
| OTHER| 42807|
|PARKING LOT/GARAG...| 31730|
|SCHOOL, PUBLIC, B...| 23336|
| ALLEY| 23326|
| RESIDENCE-GARAGE| 21308|
|RESIDENCE PORCH/H...| 18521|
| SMALL RETAIL STORE| 17537|
| GROCERY FOOD STORE| 16556|
| RESTAURANT| 15755|
|VEHICLE NON-COMME...| 15481|
|CHA PARKING LOT/G...| 14915|
| DEPARTMENT STORE| 13216|
| GAS STATION| 10874|
|COMMERCIAL / BUSI...| 9875|
| CHA APARTMENT| 9581|
| PARK PROPERTY| 7381|
+--------------------+------+
only showing top 20 rows

In [46]: crime_location = crimes.groupBy("Location_Description").count().collect()

         location = [item[0] for item in crime_location]
         count = [item[1] for item in crime_location]
         crime_location = {"location" : location, "count": count}
         crime_location = pd.DataFrame(crime_location)
         crime_location = crime_location.sort_values(by = "count", ascending = False)
         crime_location = crime_location.iloc[:20]
         myplot = crime_location.plot(figsize = (20,20), kind = "barh", color = "#b35900", width = 0.8,
                                      x = "location", y = "count", legend = False)
myplot.invert_yaxis()
plt.xlabel("Number of crimes", fontsize = 28)
plt.ylabel("Crime Location", fontsize = 28)
plt.title("Number of Crimes By Location", fontsize = 36)
plt.xticks(size = 24)
plt.yticks(size = 24)
plt.show()

[Figure: Number of Crimes By Location (horizontal bar chart, top 20 locations)]

6. Which days have the highest number of crimes?

In [47]: from pyspark.sql.functions import date_format

         df = df.withColumn("DayOfWeek", date_format("Date_time","E")).\
                 withColumn("DayOfWeek_number", date_format("Date_time","u")).\
                 withColumn("HourOfDay", date_format("Date_time","H"))

df.groupBy(["DayOfWeek", "DayOfWeek_number"]).count().show()

+---------+----------------+------+
|DayOfWeek|DayOfWeek_number| count|
+---------+----------------+------+
| Fri| 5|151654|
| Wed| 3|151630|
| Sat| 6|150399|
| Tue| 2|151330|
| Mon| 1|151550|
| Sun| 7|142815|
| Thu| 4|149197|
+---------+----------------+------+
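
groupBy returns rows in arbitrary order; sorting on the numeric day column restores Monday-to-Sunday order:

    df.groupBy(["DayOfWeek", "DayOfWeek_number"]).count().orderBy("DayOfWeek_number").show()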

In [48]: weekDaysCount = df.groupBy(["DayOfWeek", "DayOfWeek_number"]).count().collect()

         days = [item[0] for item in weekDaysCount]
         count = [item[2] for item in weekDaysCount]
         day_number = [item[1] for item in weekDaysCount]
         crime_byDay = {"days" : days, "count": count, "day_number": day_number}
         crime_byDay = pd.DataFrame(crime_byDay)
         crime_byDay = crime_byDay.sort_values(by = "day_number", ascending = True)

         crime_byDay.plot(figsize = (20,10), kind = "line", x = "days", y = "count",
                          color = "red", linewidth = 8, legend = False)
         plt.ylabel("Number of Crimes", fontsize = 18)
         plt.xlabel("")
         plt.title("Number of Crimes by Day", fontsize = 28)
         plt.xticks(size = 18)
         plt.yticks(size = 18)
         plt.show()

[Figure: Number of Crimes by Day (line plot)]

7. Number of domestic crimes by hour

In [49]: temp = df.filter(df["Domestic"] == "true")
         temp = temp.select(df['HourOfDay'].cast('int').alias('HourOfDay'))
         temp.groupBy(["HourOfDay"]).count().show()

+---------+-----+
|HourOfDay|count|
+---------+-----+
| 12| 5955|
| 22| 7761|
| 1| 5779|
| 13| 5198|
| 16| 6046|
| 6| 2271|
| 3| 3915|
| 20| 7250|
| 5| 2365|
| 19| 6874|
| 15| 5813|
| 17| 6228|
| 9| 4887|
| 4| 2967|
| 8| 4172|
| 23| 7361|
| 7| 2984|
| 10| 5210|
| 21| 7689|
| 11| 5118|
+---------+-----+
only showing top 20 rows
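
show() prints only 20 of the 24 hours, and in arbitrary order; raising the row limit and sorting gives the full day:

    temp.groupBy("HourOfDay").count().orderBy("HourOfDay").show(24)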

In [50]: temp = df.filter(df["Domestic"] == "true")
         temp = temp.select(df['HourOfDay'].cast('int').alias('HourOfDay'))
         hourlyCount = temp.groupBy(["HourOfDay"]).count().collect()

         hours = [item[0] for item in hourlyCount]
         count = [item[1] for item in hourlyCount]
         crime_byHour = {"count": count, "hours": hours}
         crime_byHour = pd.DataFrame(crime_byHour)
         crime_byHour = crime_byHour.sort_values(by = "hours", ascending = True)

         crime_byHour.plot(figsize = (20,10), kind = "line", x = "hours", y = "count",
                           color = "green", linewidth = 5, legend = False)
         plt.ylabel("Number of Domestic Crimes", fontsize = 18)
         plt.xlabel("Hour", fontsize = 18)
         plt.title("Number of domestic crimes by hour", fontsize = 28)
         plt.xticks(size = 18)
         plt.yticks(size = 18)
         plt.show()

[Figure: Number of domestic crimes by hour (line plot)]

8. Number of domestic crimes by day and hour

In [51]: temp = df.filter(df["Domestic"] == "true")
         temp = temp.select("DayOfWeek", df['HourOfDay'].cast('int').alias('HourOfDay'))
         temp.groupBy(["DayOfWeek","HourOfDay"]).count().show()

+---------+---------+-----+
|DayOfWeek|HourOfDay|count|
+---------+---------+-----+
| Sat| 14| 886|
| Tue| 15| 822|
| Fri| 23| 1087|
| Fri| 12| 798|
| Tue| 21| 1095|
| Thu| 14| 715|
| Sun| 13| 809|
| Mon| 22| 1114|
| Tue| 16| 854|
| Sun| 4| 611|
| Fri| 6| 286|
| Thu| 8| 611|
| Wed| 16| 810|
| Tue| 6| 332|
| Mon| 4| 321|
| Thu| 10| 686|
| Mon| 2| 644|
| Thu| 7| 381|
| Thu| 20| 964|
| Tue| 18| 963|
+---------+---------+-----+
only showing top 20 rows

In [52]: import seaborn as sns

         temp = df.filter(df["Domestic"] == "true")
         temp = temp.select("DayOfWeek", df['HourOfDay'].cast('int').alias('HourOfDay'))
         hourlyCount = temp.groupBy(["DayOfWeek","HourOfDay"]).count().collect()

         days = [item[0] for item in hourlyCount]
         hours = [item[1] for item in hourlyCount]
         count = [item[2] for item in hourlyCount]
         crime_byHour = {"count": count, "hours": hours, "days": days}
         crime_byHour = pd.DataFrame(crime_byHour)
         crime_byHour = crime_byHour.sort_values(by = "hours", ascending = True)

         g = sns.FacetGrid(crime_byHour, hue="days", size = 12)
         g.map(plt.plot, "hours", "count", linewidth = 3)
         g.add_legend()
         plt.ylabel("Number of Domestic Crimes", fontsize = 18)
         plt.xlabel("Hour", fontsize = 18)
         plt.title("Number of domestic crimes by day and hour", fontsize = 28)
         plt.xticks(size = 18)
         plt.yticks(size = 18)
         plt.show()

C:\Users\HP\Anaconda3\lib\site-packages\seaborn\axisgrid.py:230: UserWarning: The `size` paramter has been renamed to `height`; please update your code.
  warnings.warn(msg, UserWarning)

[Figure: Number of domestic crimes by day and hour (FacetGrid, one line per day of week)]
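
The warning is cosmetic: seaborn 0.9 renamed FacetGrid's size parameter to height. On a current seaborn the grid above would be built as, for example:

    g = sns.FacetGrid(crime_byHour, hue = "days", height = 12)  # height replaces the deprecated size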

In [ ]:
