[SQL]: cloning Instagram Database
SQL
09/30/2019
Common entities on Instagram: USERS, PHOTOS, COMMENTS, HASHTAGS, LIKES, FOLLOWERS/FOLLOWEES, ...
Tables
CREATE TABLE users ( id INT NOT NULL PRIMARY KEY AUTO_INCREMENT, username VARCHAR(255) NOT NULL UNIQUE, created_at TIMESTAMP DEFAULT NOW());
CREATE TABLE photos ( id INT NOT NULL PRIMARY KEY AUTO_INCREMENT, image_url VARCHAR(255) NOT NULL, user_id INT NOT NULL, created_at TIMESTAMP DEFAULT NOW(), FOREIGN KEY (user_id) REFERENCES users(id));CREATE TABLE comments ( id INT NOT NULL PRIMARY KEY AUTO_INCREMENT, comment_text VARCHAR(255), photo_id INT NOT NULL, user_id INT NOT NULL, created_at TIMESTAMP DEFAULT NOW(), FOREIGN KEY (user_id) REFERENCES users(id), FOREIGN KEY (photo_id) REFERENCES photos(id));CREATE TABLE likes ( user_id INT NOT NULL, photo_id INT NOT NULL, created_at TIMESTAMP DEFAULT NOW(), FOREIGN KEY(user_id) REFERENCES users(id), FOREIGN KEY(photo_id) REFERENCES photos(id), PRIMARY KEY(user_id, photo_id));CREATE TABLE follows ( follower_id INT NOT NULL, followee_id INT NOT NULL, created_at TIMESTAMP DEFAULT NOW(), FOREIGN KEY(follower_id) REFERENCES users(id), FOREIGN KEY(followee_id) REFERENCES users(id), PRIMARY KEY(follower_id, followee_id));
--CREATE TABLE tags ( id INT NOT NULL PRIMARY KEY AUTO_INCREMENT, tag_name VARCHAR(255) UNIQUE, created_at TIMESTAMP DEFAULT NOW());CREATE TABLE photo_tags ( photo_id INT NOT NULL, tag_id INT NOT NULL, FOREIGN KEY(photo_id) REFERENCES photos(id), FOREIGN KEY(tag_id) REFERENCES tags(id), PRIMARY KEY(photo_id, tag_id));
It's possible to make username a primary key, but it may be slow when string is too long. Better practice to still include the id
PRIMARY KEY(user_id, photo_id) in likes prevent from a user to make a multiple likes
PRIMARY KEY(follower_id, followee_id)
to prevent following oneself- THIS actually does not prevent! Try using trigger to prevent this.
PRIMARY KEY(photo_id, tag_id) in photo_tags table prevents multiple instances of photo tag
Hash tag implementation
1. Creating tag column in photos
+----+----------+--------------+---------------------------------+| id | url | caption | tags |+----+----------+--------------+---------------------------------+| 1 | /dociv32 | Madison | #capitol#summer#omg#daily || 2 | /39dksje | My dog | #dog#frese#joy#happy || 3 | /kfeiw2 | selfie today | #selfietoday#happy#sad#handsome |+----+----------+--------------+---------------------------------+
Easy to implement. However, this design..
- Limits # tags that can be stored
- Each tag can't have additional information stored
- Searching for tags can be a hassle
2. Using 2 tables
photos:+----+----------+--------------+| id | url | caption |+----+----------+--------------+| 1 | /dociv32 | Madison || 2 | /39dksje | My dog || 3 | /kfeiw2 | selfie today |+----+----------+--------------+
tags:+-----------+----------+| tag_name | photo_id |+-----------+----------+| #happy | 2 || #happy | 3 || #TGIF | 2 || #TBT | 1 || #TBT | 3 |+-----------+----------+
Unlimited # of tags, but this is slower than first solution because it has duplicated strings over and over.
3. Using 3 tables (preferred)
photos:+----+----------+--------------+| id | url | caption |+----+----------+--------------+| 1 | /dociv32 | Madison || 2 | /39dksje | My dog || 3 | /kfeiw2 | selfie today |+----+----------+--------------+
tags:+----+-----------+| id | tag_name |+----+-----------+| 1 | #happy || 2 | #handsome || 3 | #TGIF || 4 | #TBT || 5 | #Daily || 6 | #FTW |+----+-----------+
photo_tags:+----------+--------+| photo_id | tag_id |+----------+--------+| 1 | 2 || 1 | 1 || 3 | 2 || 2 | 3 || 2 | 6 || 3 | 1 || 3 | 4 || 3 | 5 |+----------+--------+
Unlimited # of tags and can add additional information
However, this requires more work when inserting/updating and have to worry about orphans; when deleting photos also need to remove all associated information
Read more about schema performance test
Work with data
Insert data from file.sql (currently not available)
Find the 5 oldest users
SELECT username, created_atFROM usersORDER BY created_at LIMIT 5;
+------------------+---------------------+| username | created_at |+------------------+---------------------+| Darby_Herzog | 2016-05-06 00:14:21 || Emilio_Bernier52 | 2016-05-06 13:04:30 || Elenor88 | 2016-05-08 01:30:41 || Nicole71 | 2016-05-09 17:30:22 || Jordyn.Jacobson2 | 2016-05-14 07:56:26 |+------------------+---------------------+
What day do most users sign up on
- Get day name from created_at
SELECT username, created_at, DAYNAME(created_at) AS dayFROM users;
- Get DAYNAME & COUNT with GROUP BY
SELECT DAYNAME(created_at) AS day, COUNT(DAYNAME(created_at)) AS cntFROM usersGROUP BY day;
+-----------+-----+| day | cnt |+-----------+-----+| Friday | 15 || Monday | 14 || Saturday | 12 || Sunday | 16 || Thursday | 16 || Tuesday | 14 || Wednesday | 13 |+-----------+-----+
- LIMIT 1 (or 2)
SELECT DAYNAME(created_at) AS dayFROM usersGROUP BY dayORDER BY COUNT(DAYNAME(created_at)) DESCLIMIT 1;
+----------+| day |+----------+| Thursday |+----------+
Find users who's never posted
- users Left Join photos
SELECT username, image_urlFROM users LEFT JOIN photos ON users.id = photos.user_id;
- Condition statement where image_url = NULL
SELECT username, image_urlFROM usersLEFT JOIN photos ON users.id = photos.user_idWHERE photos.id IS NULL;
+---------------------+-----------+| username | image_url |+---------------------+-----------+| Aniya_Hackett | NULL || Bartholome.Bernhard | NULL || Bethany20 | NULL || Darby_Herzog | NULL || David.Osinski47 | NULL || Duane60 | NULL || Esmeralda.Mraz57 | NULL || Esther.Zulauf61 | NULL || Franco_Keebler64 | NULL || Hulda.Macejkovic | NULL || Jaclyn81 | NULL || Janelle.Nikolaus81 | NULL || Jessyca_West | NULL || Julien_Schmidt | NULL || Kasandra_Homenick | NULL || Leslie67 | NULL || Linnea59 | NULL || Maxwell.Halvorson | NULL || Mckenna17 | NULL || Mike.Auer39 | NULL || Morgan.Kassulke | NULL || Nia_Haag | NULL || Ollie_Ledner37 | NULL || Pearl7 | NULL || Rocio33 | NULL || Tierra.Trantow | NULL |+---------------------+-----------+
Find who got the most likes on a photo
- Get # likes for each photo
SELECT photo_id, COUNT(likes.photo_id)FROM likesGROUP BY photo_id;
- ORDER BY & LIMIT
SELECT photo_id, COUNT(likes.photo_id) as likesFROM likesGROUP BY photo_idORDER BY likes DESC LIMIT 1;
+----------+-------+| photo_id | likes |+----------+-------+| 145 | 48 |+----------+-------+
- Get username who posted photo_id=145
SELECT username, likes.photo_id, COUNT(likes.photo_id) AS likesFROM usersJOIN photos ON photos.user_id = users.idJOIN likes ON likes.photo_id = photos.idGROUP BY likes.photo_idORDER BY likes DESC LIMIT 1;
+---------------+----------+-------+| username | photo_id | likes |+---------------+----------+-------+| Zack_Kemmer93 | 145 | 48 |+---------------+----------+-------+
How many times does average user post
- Total # photos, Total # user
SELECT COUNT(photo.id) FROM photos;SELECT COUNT(users.id) FROM users;
+-----------+| COUNT(id) |+-----------+| 257 |+-----------++-----------------+| COUNT(users.id) |+-----------------+| 100 |+-----------------+
- Divide
SELECT(SELECT COUNT(photos.id) FROM photos) /(SELECT COUNT(users.id) FROM users) AS res;
+--------+| res |+--------+| 2.5700 |+--------+
Use sub-query without JOIN
Get 5 most commonly used hash tags
- Get number of tags
SELECT COUNT(photo_tags.tag_id) num_tagsFROM photo_tagsGROUP BY photo_tags.tag_idORDER BY num_tags DESC LIMIT 5;
+----------+| num_tags |+----------+| 59 || 42 || 39 || 38 || 24 |+----------+
- Join to get name of tag
SELECT tag_name, COUNT(photo_tags.tag_id) num_tagsFROM photo_tags JOIN tags on tags.id = photo_tags.tag_idGROUP BY photo_tags.tag_idORDER BY num_tags DESC LIMIT 5;
+----------+----------+| tag_name | num_tags |+----------+----------+| smile | 59 || beach | 42 || party | 39 || fun | 38 || concert | 24 |+----------+----------+
Get users who have liked every single photo
- Take a look at the likes table
$ DESC likes;+------------+-----------+------+-----+-------------------+-------+| Field | Type | Null | Key | Default | Extra |+------------+-----------+------+-----+-------------------+-------+| user_id | int(11) | NO | PRI | NULL | || photo_id | int(11) | NO | PRI | NULL | || created_at | timestamp | NO | | CURRENT_TIMESTAMP | |+------------+-----------+------+-----+-------------------+-------+
- Get number of likes for each user
SELECT username, COUNT(*) as num_likesFROM users JOIN likes ON users.id = likes.user_idGROUP BY users.id;
- Get #photos
SELECT COUNT(photos.id) AS num_photos FROM photos;
+------------+| num_photos |+------------+| 257 |+------------+
- Select users with HAVING #likes = #photos
SELECT username, COUNT(*) as num_likesFROM users JOIN likes ON users.id = likes.user_idGROUP BY users.idHAVING num_likes = ( SELECT COUNT(photos.id) FROM photos );
- Solution w/o using sub-query
SELECT username, COUNT(*) as num_likesFROM users JOIN likes ON users.id = likes.user_idGROUP BY users.idHAVING num_likes = MAX(likes.photo_id);