Working with the crime dataset via PySpark & ML


In [28]: from bigdatapedia import BigDatapedia
BigDatapedia.logo_terminal("BigDatapedia")

____ _ _____ _ _ _
| _ \(_) | __ \ | | | (_)
| |_) |_ __ _| | | | __ _| |_ __ _ _ __ ___ __| |_ __ _
| _ <| |/ _` | | | |/ _` | __/ _` | '_ \ / _ \/ _` | |/ _` |
| |_) | | (_| | |__| | (_| | || (_| | |_) | __/ (_| | | (_| |
|____/|_|\__, |_____/ \__,_|\__\__,_| .__/ \___|\__,_|_|\__,_|
__/ | | |
|___/ |_|

In [29]: import pyspark
         from pyspark.sql import SparkSession

         spark = SparkSession.builder.appName("Pyspark_SampleProject").getOrCreate()

In [30]: from pyspark.sql.types import (StructType, StructField, DateType, BooleanType,
                                        DoubleType, IntegerType, StringType, TimestampType)

         crimes_schema = StructType([StructField("ID", StringType(), True),
                                     StructField("CaseNumber", StringType(), True),
                                     StructField("Date", StringType(), True),
                                     StructField("Block", StringType(), True),
                                     StructField("IUCR", StringType(), True),
                                     StructField("PrimaryType", StringType(), True),
                                     StructField("Description", StringType(), True),
                                     StructField("LocationDescription", StringType(), True),
                                     StructField("Arrest", BooleanType(), True),
                                     StructField("Domestic", BooleanType(), True),
                                     StructField("Beat", StringType(), True),
                                     StructField("District", StringType(), True),
                                     StructField("Ward", StringType(), True),
                                     StructField("CommunityArea", StringType(), True),
                                     StructField("FBICode", StringType(), True),
                                     StructField("XCoordinate", DoubleType(), True),
                                     StructField("YCoordinate", DoubleType(), True),
                                     StructField("Year", IntegerType(), True),
                                     StructField("UpdatedOn", DateType(), True),
                                     StructField("Latitude", DoubleType(), True),
                                     StructField("Longitude", DoubleType(), True),
                                     StructField("Location", StringType(), True)])

In [31]: crimes = spark.read.csv("file:///D:/00STUDIES/00ANALYTICS/00Hadoop/1UBUNTU_SHARED_Folder/USE_CASES/Crime_data/Crimes_2001_sample.csv",
                                 header = True, inferSchema = True)
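
Note that crimes_schema is defined above but never handed to the reader: with inferSchema=True, Spark samples the file to derive the column types and takes the column names (Primary_Type, X_Coordinate, and so on) from the CSV header. A minimal sketch of applying the hand-built schema instead, assuming the file's column order lines up with the StructFields:

    crimes = spark.read.csv("file:///D:/00STUDIES/00ANALYTICS/00Hadoop/1UBUNTU_SHARED_Folder/USE_CASES/Crime_data/Crimes_2001_sample.csv",
                            header = True, schema = crimes_schema)  # schema names would override the header row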

In [32]: print(" The crimes dataframe has {} records".format(crimes.count()))

The crimes dataframe has 1048575 records

In [33]: from datetime import datetime
         from pyspark.sql.functions import col, udf

         # Parse the Date string into a proper timestamp column
         myfunc = udf(lambda x: datetime.strptime(x, '%m/%d/%Y %H:%M:%S'), TimestampType())
         df = crimes.withColumn('Date_time', myfunc(col('Date'))).drop("Date")
         df.select(df["Date_time"]).show(5)

+-------------------+
| Date_time|
+-------------------+
|2018-08-01 23:21:00|
|2001-01-01 11:00:00|
|2017-11-23 15:14:00|
|2017-11-28 21:43:00|
|2017-11-12 19:15:00|
+-------------------+
only showing top 5 rows
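
A Python UDF like myfunc serializes every row between the JVM and a Python worker. Spark's built-in to_timestamp does the same parse natively; a minimal sketch, assuming Spark 2.2+ and the same month/day/year layout as the strptime pattern above:

    from pyspark.sql.functions import to_timestamp, col

    # Same result as the UDF, but evaluated inside the JVM
    df = crimes.withColumn("Date_time", to_timestamp(col("Date"), "MM/dd/yyyy HH:mm:ss")).drop("Date")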

In [34]: # df.printSchema()

In [35]: # Calculate statistics of numeric and string columns
         crimes.select(["Latitude","Longitude","Year","X_Coordinate","Y_Coordinate"]).describe().show()

+-------+-------------------+-------------------+------------------+------------------+------------------+
|summary| Latitude| Longitude| Year| X_Coordinate| Y_Coordinate|
+-------+-------------------+-------------------+------------------+------------------+------------------+
| count| 994634| 994634| 1048575| 994634| 994634|
| mean| 41.843625077813975| -87.67166759336945|2004.0785122666475| 1164546.959224197|1886308.3875787475|
| stddev|0.09283909948294168|0.06428058629862175| 4.631923885824727|17835.072111265592|33725.190122034575|
| min| 36.6194464| -91.68656568| 2001| 0| 0|
| max| 42.02291033| -87.52452938| 2018| 1205119| 1951622|
+-------+-------------------+-------------------+------------------+------------------+------------------+

In [36]: from pyspark.sql.functions import format_number

         result = crimes.select(["Latitude","Longitude","Year","X_Coordinate","Y_Coordinate"]).describe()
result.select(result['summary'],
format_number(result['Latitude'].cast('float'),2).alias('Latitude'),
format_number(result['Longitude'].cast('float'),2).alias('Longitude'),
result['Year'].cast('int').alias('year'),
format_number(result['X_Coordinate'].cast('float'),2).alias('X_Coordinate'),
format_number(result['Y_Coordinate'].cast('float'),2).alias('Y_Coordinate')
).show()

+-------+----------+----------+-------+------------+------------+
|summary| Latitude| Longitude| year|X_Coordinate|Y_Coordinate|
+-------+----------+----------+-------+------------+------------+
| count|994,634.00|994,634.00|1048575| 994,634.00| 994,634.00|
| mean| 41.84| -87.67| 2004|1,164,547.00|1,886,308.38|
| stddev| 0.09| 0.06| 4| 17,835.07| 33,725.19|
| min| 36.62| -91.69| 2001| 0.00| 0.00|
| max| 42.02| -87.52| 2018|1,205,119.00|1,951,622.00|
+-------+----------+----------+-------+------------+------------+

In [37]: # How many primary crime types are there?
         crimes.select("Primary_Type").distinct().count()

Out[37]: 34
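
To see the 34 categories themselves rather than just count them, a quick sketch:

    crimes.select("Primary_Type").distinct().orderBy("Primary_Type").show(34, truncate = False)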

Use cases

1. How many homicides are there in the dataset?

In [53]: crimes.where(crimes["Primary_Type"] == "HOMICIDE").count()

Out[53]: 8219

2. How many domestic assaults are there?

In [39]: crimes.filter((crimes["Primary_Type"] == "ASSAULT") & (crimes["Domestic"] == "True")).count()

Out[39]: 14578

In [40]: # Pull a single aggregated value back to the driver with agg + collect
         lat_max = crimes.agg({"Latitude" : "max"}).collect()[0][0]
         print("The maximum latitude value is {}".format(lat_max))

The maximum latitude value is 42.02291033

In [41]: from pyspark.sql.functions import max, min, mean

         df.select(max("X_Coordinate"), min("X_Coordinate")).show()
         df.select(mean("Latitude").alias("Mean_Latitude")).show()

+-----------------+-----------------+
|max(X_Coordinate)|min(X_Coordinate)|
+-----------------+-----------------+
| 1205119| 0|
+-----------------+-----------------+

+------------------+
| Mean_Latitude|
+------------------+
|41.843625077813975|
+------------------+

3. Find the number of crimes per year

In [42]: df.groupBy("Year").count().orderBy("Year").show()
# df.groupBy("Year").count().collect()

+----+------+
|Year| count|
+----+------+
|2001|481933|
|2002|148966|
|2003| 1211|
|2004| 1797|
|2005|164029|
|2006|103473|
|2007| 4025|
|2008| 5569|
|2009| 46306|
|2010| 510|
|2011| 512|
|2012| 595|
|2013| 545|
|2014| 1247|
|2015| 13042|
|2016| 2605|
|2017| 24029|
|2018| 48181|
+----+------+
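
For charting, a grouped result this small can be pulled straight into pandas with toPandas() instead of assembling lists from collect(), as the later cells do. A minimal sketch:

    import pandas as pd
    import matplotlib.pyplot as plt

    # One row per year, columns "Year" and "count"
    year_counts = df.groupBy("Year").count().orderBy("Year").toPandas()
    year_counts.plot(kind = "bar", x = "Year", y = "count", legend = False)
    plt.show()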

4. Plot number of crimes by month

In [43]: import pandas as pd
         import matplotlib.pyplot as plt
         %matplotlib inline
         from pyspark.sql.functions import month

monthdf = df.withColumn("Month",month("Date_time"))
monthCounts = monthdf.select("Month").groupBy("Month").count()
monthCounts = monthCounts.collect()
months = [item[0] for item in monthCounts]
count = [item[1] for item in monthCounts]
crimes_per_month = {"month":months, "crime_count": count}
crimes_per_month = pd.DataFrame(crimes_per_month)
crimes_per_month = crimes_per_month.sort_values(by = "month")
crimes_per_month.plot(figsize = (20,10), kind = "line", x = "month", y = "crime_count",
color = "red", linewidth = 8, legend = False)
plt.xlabel("Month", fontsize = 18)
plt.ylabel("Number of Crimes", fontsize = 18)
plt.title("Number of Crimes Per Month", fontsize = 28)
plt.xticks(size = 18)
plt.yticks(size = 18)
plt.show()

[Figure: Number of Crimes Per Month (line plot)]

In [44]: from pyspark.sql.functions import month

monthdf = df.withColumn("Month",month("Date_time"))
monthdf.select("Year", "Month").groupBy("Year", "Month").count().orderBy("Year", "Month").show()

+----+-----+-----+
|Year|Month|count|
+----+-----+-----+
|2001| 1|41523|
|2001| 2|34969|
|2001| 3|40334|
|2001| 4|39639|
|2001| 5|40709|
|2001| 6|40842|
|2001| 7|43494|
|2001| 8|42569|
|2001| 9|40255|
|2001| 10|41769|
|2001| 11|38744|
|2001| 12|37086|
|2002| 1|23235|
|2002| 2|19681|
|2002| 3|22772|
|2002| 4|22632|
|2002| 5|24733|
|2002| 6| 5278|
|2002| 7| 5184|
|2002| 8| 5370|
+----+-----+-----+
only showing top 20 rows
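
The same counts are easier to scan as one row per year with a column per month; a minimal sketch using pivot:

    # Year down the side, months 1-12 across the top
    monthdf.groupBy("Year").pivot("Month").count().orderBy("Year").show()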

5. Where do most crimes take place?

In [45]: from pyspark.sql.functions import col

         crimes.groupBy("Location_Description").count().orderBy(col("count").desc()).show()

+--------------------+------+
|Location_Description| count|
+--------------------+------+
| STREET|303380|
| RESIDENCE|191271|
| APARTMENT| 82113|
| SIDEWALK| 78890|
| OTHER| 42807|
|PARKING LOT/GARAG...| 31730|
|SCHOOL, PUBLIC, B...| 23336|
| ALLEY| 23326|
| RESIDENCE-GARAGE| 21308|
|RESIDENCE PORCH/H...| 18521|
| SMALL RETAIL STORE| 17537|
| GROCERY FOOD STORE| 16556|
| RESTAURANT| 15755|
|VEHICLE NON-COMME...| 15481|
|CHA PARKING LOT/G...| 14915|
| DEPARTMENT STORE| 13216|
| GAS STATION| 10874|
|COMMERCIAL / BUSI...| 9875|
| CHA APARTMENT| 9581|
| PARK PROPERTY| 7381|
+--------------------+------+
only showing top 20 rows

In [46]: crime_location = crimes.groupBy("Location_Description").count().collect()

         location = [item[0] for item in crime_location]
         count = [item[1] for item in crime_location]
         crime_location = {"location" : location, "count": count}
         crime_location = pd.DataFrame(crime_location)
         crime_location = crime_location.sort_values(by = "count", ascending = False)
         crime_location = crime_location.iloc[:20]
         myplot = crime_location.plot(figsize = (20,20), kind = "barh", color = "#b35900", width = 0.8,
                                      x = "location", y = "count", legend = False)
myplot.invert_yaxis()
plt.xlabel("Number of crimes", fontsize = 28)
plt.ylabel("Crime Location", fontsize = 28)
plt.title("Number of Crimes By Location", fontsize = 36)
plt.xticks(size = 24)
plt.yticks(size = 24)
plt.show()

[Figure: Number of Crimes By Location (horizontal bar chart, top 20 locations)]

6. Which days have the highest number of crimes?

In [47]: from pyspark.sql.functions import date_format

         df = df.withColumn("DayOfWeek", date_format("Date_time","E")).\
                 withColumn("DayOfWeek_number", date_format("Date_time","u")).\
                 withColumn("HourOfDay", date_format("Date_time","H"))

df.groupBy(["DayOfWeek", "DayOfWeek_number"]).count().show()

+---------+----------------+------+
|DayOfWeek|DayOfWeek_number| count|
+---------+----------------+------+
| Fri| 5|151654|
| Wed| 3|151630|
| Sat| 6|150399|
| Tue| 2|151330|
| Mon| 1|151550|
| Sun| 7|142815|
| Thu| 4|149197|
+---------+----------------+------+
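
groupBy returns rows in arbitrary order; sorting on the numeric day column restores Monday-to-Sunday order:

    df.groupBy(["DayOfWeek", "DayOfWeek_number"]).count().orderBy("DayOfWeek_number").show()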

In [48]: weekDaysCount = df.groupBy(["DayOfWeek", "DayOfWeek_number"]).count().collect()

         days = [item[0] for item in weekDaysCount]
         count = [item[2] for item in weekDaysCount]
         day_number = [item[1] for item in weekDaysCount]
         crime_byDay = {"days" : days, "count": count, "day_number": day_number}
         crime_byDay = pd.DataFrame(crime_byDay)
         crime_byDay = crime_byDay.sort_values(by = "day_number", ascending = True)

         crime_byDay.plot(figsize = (20,10), kind = "line", x = "days", y = "count",
                          color = "red", linewidth = 8, legend = False)
         plt.ylabel("Number of Crimes", fontsize = 18)
         plt.xlabel("")
         plt.title("Number of Crimes by Day", fontsize = 28)
         plt.xticks(size = 18)
         plt.yticks(size = 18)
         plt.show()

[Figure: Number of Crimes by Day (line plot)]

7. Number of domestic crimes by hour

In [49]: temp = df.filter(df["Domestic"] == "true")
         temp = temp.select(df['HourOfDay'].cast('int').alias('HourOfDay'))
         temp.groupBy(["HourOfDay"]).count().show()

+---------+-----+
|HourOfDay|count|
+---------+-----+
| 12| 5955|
| 22| 7761|
| 1| 5779|
| 13| 5198|
| 16| 6046|
| 6| 2271|
| 3| 3915|
| 20| 7250|
| 5| 2365|
| 19| 6874|
| 15| 5813|
| 17| 6228|
| 9| 4887|
| 4| 2967|
| 8| 4172|
| 23| 7361|
| 7| 2984|
| 10| 5210|
| 21| 7689|
| 11| 5118|
+---------+-----+
only showing top 20 rows
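
show() prints only 20 of the 24 hours, and in arbitrary order; raising the row limit and sorting gives the full day:

    temp.groupBy("HourOfDay").count().orderBy("HourOfDay").show(24)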

In [50]: temp = df.filter(df["Domestic"] == "true")
         temp = temp.select(df['HourOfDay'].cast('int').alias('HourOfDay'))
         hourlyCount = temp.groupBy(["HourOfDay"]).count().collect()

         hours = [item[0] for item in hourlyCount]
         count = [item[1] for item in hourlyCount]
         crime_byHour = {"count": count, "hours": hours}
         crime_byHour = pd.DataFrame(crime_byHour)
         crime_byHour = crime_byHour.sort_values(by = "hours", ascending = True)

         crime_byHour.plot(figsize = (20,10), kind = "line", x = "hours", y = "count",
                           color = "green", linewidth = 5, legend = False)
         plt.ylabel("Number of Domestic Crimes", fontsize = 18)
         plt.xlabel("Hour", fontsize = 18)
         plt.title("Number of domestic crimes by hour", fontsize = 28)
         plt.xticks(size = 18)
         plt.yticks(size = 18)
         plt.show()

[Figure: Number of domestic crimes by hour (line plot)]

8. Number of domestic crimes by day and hour

In [51]: temp = df.filter(df["Domestic"] == "true")
         temp = temp.select("DayOfWeek", df['HourOfDay'].cast('int').alias('HourOfDay'))
         temp.groupBy(["DayOfWeek","HourOfDay"]).count().show()

+---------+---------+-----+
|DayOfWeek|HourOfDay|count|
+---------+---------+-----+
| Sat| 14| 886|
| Tue| 15| 822|
| Fri| 23| 1087|
| Fri| 12| 798|
| Tue| 21| 1095|
| Thu| 14| 715|
| Sun| 13| 809|
| Mon| 22| 1114|
| Tue| 16| 854|
| Sun| 4| 611|
| Fri| 6| 286|
| Thu| 8| 611|
| Wed| 16| 810|
| Tue| 6| 332|
| Mon| 4| 321|
| Thu| 10| 686|
| Mon| 2| 644|
| Thu| 7| 381|
| Thu| 20| 964|
| Tue| 18| 963|
+---------+---------+-----+
only showing top 20 rows

In [52]: import seaborn as sns

         temp = df.filter(df["Domestic"] == "true")
         temp = temp.select("DayOfWeek", df['HourOfDay'].cast('int').alias('HourOfDay'))
         hourlyCount = temp.groupBy(["DayOfWeek","HourOfDay"]).count().collect()

         days = [item[0] for item in hourlyCount]
         hours = [item[1] for item in hourlyCount]
         count = [item[2] for item in hourlyCount]
         crime_byHour = {"count": count, "hours": hours, "days": days}
         crime_byHour = pd.DataFrame(crime_byHour)
         crime_byHour = crime_byHour.sort_values(by = "hours", ascending = True)

         g = sns.FacetGrid(crime_byHour, hue="days", size = 12)
         g.map(plt.plot, "hours", "count", linewidth = 3)
         g.add_legend()
         plt.ylabel("Number of Domestic Crimes", fontsize = 18)
         plt.xlabel("Hour", fontsize = 18)
         plt.title("Number of domestic crimes by day and hour", fontsize = 28)
         plt.xticks(size = 18)
         plt.yticks(size = 18)
         plt.show()

C:\Users\HP\Anaconda3\lib\site-packages\seaborn\axisgrid.py:230: UserWarning: The `size` paramter has been renamed to `height`; please update your code.
  warnings.warn(msg, UserWarning)

[Figure: Number of domestic crimes by day and hour (FacetGrid, one line per day of week)]
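
The warning is cosmetic: seaborn 0.9 renamed FacetGrid's size parameter to height. On a current seaborn the grid above would be built as, for example:

    g = sns.FacetGrid(crime_byHour, hue = "days", height = 12)  # height replaces the deprecated size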

In [ ]:
