Working with crimedataset via pyspark & ML

In [28]: from bigdatapedia import BigDatapedia

In [29]: import pyspark

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Pyspark_SampleProject").getOrCreate()

In [30]: from pyspark.sql.types import (StructType, StructField, DateType, BooleanType, DoubleType, IntegerType, Stri
ngType, TimestampType)
crimes_schema = StructType([StructField("ID", StringType(), True),
StructField("CaseNumber", StringType(), True),
StructField("Date", StringType(), True ),
StructField("Block", StringType(), True),
StructField("IUCR", StringType(), True),
StructField("PrimaryType", StringType(), True ),
StructField("Description", StringType(), True ),
StructField("LocationDescription", StringType(), True ),
StructField("Arrest", BooleanType(), True),
StructField("Domestic", BooleanType(), True),
StructField("Beat", StringType(), True),
StructField("District", StringType(), True),
StructField("Ward", StringType(), True),
StructField("CommunityArea", StringType(), True),
StructField("FBICode", StringType(), True ),
StructField("XCoordinate", DoubleType(), True),
StructField("YCoordinate", DoubleType(), True ),
StructField("Year", IntegerType(), True),
StructField("UpdatedOn", DateType(), True ),
StructField("Latitude", DoubleType(), True),
StructField("Longitude", DoubleType(), True),
StructField("Location", StringType(), True )

In [31]: crimes = spark.read.csv("file:///D:/00STUDIES/00ANALYTICS/00Hadoop/1UBUNTU_SHARED_Folder/USE_CASES/Crime_dat

header = True,inferSchema=True)

In [32]: print(" The crimes dataframe has {} records".format(crimes.count()))

The crimes dataframe has 1048575 records

In [33]: from datetime import datetime

from pyspark.sql.functions import col,udf
myfunc = udf(lambda x: datetime.strptime(x, '%m/%d/%Y %H:%M:%S'), TimestampType())
df = crimes.withColumn('Date_time', myfunc(col('Date'))).drop("Date")

| Date_time|
|2018-08-01 23:21:00|
|2001-01-01 11:00:00|
|2017-11-23 15:14:00|
|2017-11-28 21:43:00|
|2017-11-12 19:15:00|
only showing top 5 rows

In [34]: # df.printSchema()

In [35]: # Calculate statistics of numeric and string columns


|summary| Latitude| Longitude| Year| X_Coordinate| Y_Coordinate|
| count| 994634| 994634| 1048575| 994634| 994634|
| mean| 41.843625077813975| -87.67166759336945|2004.0785122666475| 1164546.959224197|1886308.3875787475|
| stddev|0.09283909948294168|0.06428058629862175| 4.631923885824727|17835.072111265592|33725.190122034575|
| min| 36.6194464| -91.68656568| 2001| 0| 0|
| max| 42.02291033| -87.52452938| 2018| 1205119| 1951622|

In [36]: from pyspark.sql.functions import format_number

result = crimes.select(["Latitude","Longitude","Year","X_Coordinate","Y_Coordinate"]).describe()

|summary| Latitude| Longitude| year|X_Coordinate|Y_Coordinate|
| count|994,634.00|994,634.00|1048575| 994,634.00| 994,634.00|
| mean| 41.84| -87.67| 2004|1,164,547.00|1,886,308.38|
| stddev| 0.09| 0.06| 4| 17,835.07| 33,725.19|
| min| 36.62| -91.69| 2001| 0.00| 0.00|
| max| 42.02| -87.52| 2018|1,205,119.00|1,951,622.00|

In [37]: # How many primary crime types are there?


Out[37]: 34


1. How many homicides are there in the dataset?

In [53]: crimes.where(crimes["Primary_Type"] == "HOMICIDE").count()

Out[53]: 8219

2. how many domestic assualts there are?

In [39]: crimes.filter((crimes["Primary_Type"] == "ASSAULT") & (crimes["Domestic"] == "True")).count()

Out[39]: 14578

In [40]: # Create a new column with withColumn

lat_max = crimes.agg({"Latitude" : "max"}).collect()[0][0]
print("The maximum latitude values is {}".format(lat_max))

The maximum latitude values is 42.02291033

In [41]: from pyspark.sql.functions import max,min


from pyspark.sql.functions import mean


| 1205119| 0|

| Mean_Latitude|

3. Find the number of crimes per year

In [42]: df.groupBy("Year").count().orderBy("Year").show()
# df.groupBy("Year").count().collect()

|Year| count|
|2003| 1211|
|2004| 1797|
|2007| 4025|
|2008| 5569|
|2009| 46306|
|2010| 510|
|2011| 512|
|2012| 595|
|2013| 545|
|2014| 1247|
|2015| 13042|
|2016| 2605|
|2017| 24029|
|2018| 48181|

4. Plot number of crimes by month

In [43]: import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
from pyspark.sql.functions import month

monthdf = df.withColumn("Month",month("Date_time"))
monthCounts = monthdf.select("Month").groupBy("Month").count()
monthCounts = monthCounts.collect()
months = [item[0] for item in monthCounts]
count = [item[1] for item in monthCounts]
crimes_per_month = {"month":months, "crime_count": count}
crimes_per_month = pd.DataFrame(crimes_per_month)
crimes_per_month = crimes_per_month.sort_values(by = "month")
crimes_per_month.plot(figsize = (20,10), kind = "line", x = "month", y = "crime_count",
color = "red", linewidth = 8, legend = False)
plt.xlabel("Month", fontsize = 18)
plt.ylabel("Number of Crimes", fontsize = 18)
plt.title("Number of Crimes Per Month", fontsize = 28)
plt.xticks(size = 18)
plt.yticks(size = 18)

In [44]: from pyspark.sql.functions import month

monthdf = df.withColumn("Month",month("Date_time"))
monthdf.select("Year", "Month").groupBy("Year", "Month").count().orderBy("Year", "Month").show()

|2001| 1|41523|
|2001| 2|34969|
|2001| 3|40334|
|2001| 4|39639|
|2001| 5|40709|
|2001| 6|40842|
|2001| 7|43494|
|2001| 8|42569|
|2001| 9|40255|
|2001| 10|41769|
|2001| 11|38744|
|2001| 12|37086|
|2002| 1|23235|
|2002| 2|19681|
|2002| 3|22772|
|2002| 4|22632|
|2002| 5|24733|
|2002| 6| 5278|
|2002| 7| 5184|
|2002| 8| 5370|
only showing top 20 rows

5. Where do most crimes take pace?

In [45]: from pyspark.sql.functions import col


|Location_Description| count|
| STREET|303380|
| RESIDENCE|191271|
| APARTMENT| 82113|
| SIDEWALK| 78890|
| OTHER| 42807|
|SCHOOL, PUBLIC, B...| 23336|
| ALLEY| 23326|
| RESTAURANT| 15755|
|CHA PARKING LOT/G...| 14915|
| GAS STATION| 10874|
|COMMERCIAL / BUSI...| 9875|
only showing top 20 rows

In [46]: crime_location = crimes.groupBy("Location_Description").count().collect()

location = [item[0] for item in crime_location]
count = [item[1] for item in crime_location]
crime_location = {"location" : location, "count": count}
crime_location = pd.DataFrame(crime_location)
crime_location = crime_location.sort_values(by = "count", ascending = False)
crime_location = crime_location.iloc[:20]
myplot = crime_location .plot(figsize = (20,20), kind = "barh", color = "#b35900", width = 0.8,
x = "location", y = "count", legend = False)
plt.xlabel("Number of crimes", fontsize = 28)
plt.ylabel("Crime Location", fontsize = 28)
plt.title("Number of Crimes By Location", fontsize = 36)
plt.xticks(size = 24)
plt.yticks(size = 24)

6.Which days have the highest number of crimes?

In [47]: from pyspark.sql.functions import date_format

df = df.withColumn("DayOfWeek", date_format("Date_time","E")).\
withColumn("DayOfWeek_number", date_format("Date_time","u")).\
withColumn("HourOfDay", date_format("Date_time","H"))

df.groupBy(["DayOfWeek", "DayOfWeek_number"]).count().show()

|DayOfWeek|DayOfWeek_number| count|
| Fri| 5|151654|
| Wed| 3|151630|
| Sat| 6|150399|
| Tue| 2|151330|
| Mon| 1|151550|
| Sun| 7|142815|
| Thu| 4|149197|

In [48]: weekDaysCount = df.groupBy(["DayOfWeek", "DayOfWeek_number"]).count().collect()

days = [item[0] for item in weekDaysCount]
count = [item[2] for item in weekDaysCount]
day_number = [item[1] for item in weekDaysCount]
crime_byDay = {"days" : days, "count": count, "day_number": day_number}
crime_byDay = pd.DataFrame(crime_byDay)
crime_byDay = crime_byDay.sort_values(by = "day_number", ascending = True)

crime_byDay.plot(figsize = (20,10), kind = "line", x = "days", y = "count",

color = "red", linewidth = 8, legend = False)

plt.ylabel("Number of Crimes", fontsize = 18)

plt.title("Number of Crimes by Day", fontsize = 28)
plt.xticks(size = 18)
plt.yticks(size = 18)

7. Number of domestic crimes by hour

In [49]: temp = df.filter(df["Domestic"] == "true")

temp = temp.select(df['HourOfDay'].cast('int').alias('HourOfDay'))

| 12| 5955|
| 22| 7761|
| 1| 5779|
| 13| 5198|
| 16| 6046|
| 6| 2271|
| 3| 3915|
| 20| 7250|
| 5| 2365|
| 19| 6874|
| 15| 5813|
| 17| 6228|
| 9| 4887|
| 4| 2967|
| 8| 4172|
| 23| 7361|
| 7| 2984|
| 10| 5210|
| 21| 7689|
| 11| 5118|
only showing top 20 rows

In [50]: temp = df.filter(df["Domestic"] == "true")

temp = temp.select(df['HourOfDay'].cast('int').alias('HourOfDay'))
hourlyCount = temp.groupBy(["HourOfDay"]).count().collect()

hours = [item[0] for item in hourlyCount]

count = [item[1] for item in hourlyCount]

crime_byHour = {"count": count, "hours": hours}

crime_byHour = pd.DataFrame(crime_byHour)
crime_byHour = crime_byHour.sort_values(by = "hours", ascending = True)

crime_byHour.plot(figsize = (20,10), kind = "line", x = "hours", y = "count",

color = "green", linewidth = 5, legend = False)

plt.ylabel("Number of Domestic Crimes", fontsize = 18)

plt.xlabel("Hour", fontsize = 18)
plt.title("Number of domestic crimes by hour", fontsize = 28)
plt.xticks(size = 18)
plt.yticks(size = 18)

8. number of domestic crimes by day and hour?

In [51]: temp = df.filter(df["Domestic"] == "true")

temp = temp.select("DayOfWeek", df['HourOfDay'].cast('int').alias('HourOfDay'))

| Sat| 14| 886|
| Tue| 15| 822|
| Fri| 23| 1087|
| Fri| 12| 798|
| Tue| 21| 1095|
| Thu| 14| 715|
| Sun| 13| 809|
| Mon| 22| 1114|
| Tue| 16| 854|
| Sun| 4| 611|
| Fri| 6| 286|
| Thu| 8| 611|
| Wed| 16| 810|
| Tue| 6| 332|
| Mon| 4| 321|
| Thu| 10| 686|
| Mon| 2| 644|
| Thu| 7| 381|
| Thu| 20| 964|
| Tue| 18| 963|
only showing top 20 rows

In [52]: import seaborn as sns

temp = df.filter(df["Domestic"] == "true")
temp = temp.select("DayOfWeek", df['HourOfDay'].cast('int').alias('HourOfDay'))
hourlyCount = temp.groupBy(["DayOfWeek","HourOfDay"]).count().collect()

days = [item[0] for item in hourlyCount]

hours = [item[1] for item in hourlyCount]
count = [item[2] for item in hourlyCount]
crime_byHour = {"count": count, "hours": hours, "days": days}
crime_byHour = pd.DataFrame(crime_byHour)
crime_byHour = crime_byHour.sort_values(by = "hours", ascending = True)
import seaborn as sns
g = sns.FacetGrid(crime_byHour, hue="days", size = 12)
g.map(plt.plot, "hours", "count", linewidth = 3)
plt.ylabel("Number of Domestic Crimes", fontsize = 18)
plt.xlabel("Hour", fontsize = 18)
plt.title("Number of domestic crimes by day and hour", fontsize = 28)
plt.xticks(size = 18)
plt.yticks(size = 18)

C:\Users\HP\Anaconda3\lib\site-packages\seaborn\axisgrid.py:230: UserWarning: The `size` paramter has been re

named to `height`; please update your code.
warnings.warn(msg, UserWarning)

