In this second part of the migration of my Ghost blog to Hugo, I’m going to show you how I automatically migrated my 38 CTF writeups.
I started by exporting my Ghost blog to JSON. I then studied the JSON structure to find out where the posts were located. I determined the following structure:
"db": [
{
"data": {
[...]
"posts": [
{
"id": "620c06d22ddec50001a0d647",
"uuid": "30679c97-4b80-41ac-aae0-91ea74ee85ca",
"title": "Writeup - Pandora (HTB)",
"slug": "writeup-pandora-htb",
"mobiledoc": "...",
"html": "...",
"comment_id": "620c06d22ddec50001a0d647",
"plaintext": "...",
"feature_image": "__GHOST_URL__/content/images/2022/03/Pandora.png",
"featured": 0,
"type": "post",
"status": "published",
[...]
},
[...]
]
[...]
}
}
]
In order to iterate over the different articles, all I need to do is make the following loop:
import json
with open("export.json") as file:
export = json.load(file)
for i in export["db"][0]["data"]["posts"]:
if "writeup" in i["slug"] and i["status"] == "published":
# [...]
Note that I added a condition to keep only articles containing writeup in the slug and with a published status.
Now that I have the list of different items, I can convert them for Hugo.
Directory structure creation #
The first step in converting articles is to create the directory structure. Hugo uses the following folder structure for articles:
folder/
article_1/
img/
featured.png
index.md
article_2/
[...]
[...]
In the for loop created earlier, I use the following line to create the folder for each item:
import os
for i in export["db"][0]["data"]["posts"]:
[...]
os.makedirs(os.path.join("output/", i["slug"], "img"), exist_ok=True)
[...]
Article conversion #
I’m now going to create and fill in the index.md file. To do this, I’ll use the html field in the Ghost export and markdownify (“Convert HTML to Markdown”), a Python library that converts HTML to Markdown:
Still in the same for loop, I open the index.md
file in the folder corresponding to the article. I write the header and then use the md()
function to convert HTML to Markdown:
from markdownify import MarkdownConverter
for i in export["db"][0]["data"]["posts"]:
[...]
header = f"""---
title: "{i["title"]}"
date: {i["published_at"].split("T")[0]}
slug: "{i["slug"]}"
type: "writeup-ctf"
--- \n\n"""
with open(os.path.join("output/", i["slug"], "index.md"), "w") as index:
index.write(header)
content = md(
i["html"], heading_style="ATX", code_language="bash", bullets="-"
)
index.write(content)
[...]
The markdownify library has a large number of options, and I’ve used the following:
- heading_style - Defines how headings should be converted.
- code_language - Defines the language that should be assumed for all <pre> sections.
- bullets - An iterable (string, list, or tuple) of bullet styles to be used.
Download images #
To complete the import, one important element is missing: the images. Since I’ll be downloading files from several places in the code, I’ve created a function that I’ll be able to reuse:
from pathlib import Path
import requests
def download_file(url: str, dst: Path) -> None:
    """Download ``url`` and write the response body to ``dst``.

    Args:
        url: Address of the file to fetch.
        dst: Destination of the downloaded file. It is opened in binary
            write mode, so an existing file is overwritten.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeout.
    """
    # requests has no default timeout; set one so a stalled connection
    # cannot hang the whole migration.
    response = requests.get(url, timeout=30)
    # Fail loudly instead of saving an HTML error page as an "image".
    response.raise_for_status()
    with open(dst, "wb") as img:
        img.write(response.content)
This function takes a URL and a destination for the final file.
Featured image #
To download the featured image, I first create a variable containing either None
if there is no feature_image for this item, or the image address. Since the export contains __GHOST_URL__
instead of my blog link, I use the replace()
function to make the change. If there’s a feature image, I use the previously created download_file()
function to download it:
import os
for i in export["db"][0]["data"]["posts"]:
    [...]
    # The export stores image links with a __GHOST_URL__ placeholder
    # instead of the blog address, so swap it for the real URL.
    # feature_link is None when the post has no feature image.
    feature_link = (
        i["feature_image"].replace("__GHOST_URL__", "https://blog.d3vyce.fr")
        if i["feature_image"]
        else None
    )
    if feature_link:
        download_file(
            feature_link,
            os.path.join("output/", i["slug"], "featured.png"),
        )
    [...]
Article image #
For images in articles, it’s a little more complex. I have to find the images, download them and rename them with an incrementing number. Finally I can integrate the image tags into the final Markdown.
To do so, I use a custom class which is called in the md() function. As with feature_image, this class lets me replace the __GHOST_URL__ placeholder, then download the images into a temporary folder. In the download_img() function, I assign a name to the image according to the files already in the folder, so that the names increment (image-1.png, image-2.png, …).
Finally, after processing all the images in the article, I move all the files from the temporary folder to the final folder with shutil.move().
import os
import shutil
import re
from markdownify import MarkdownConverter
def download_img(url: str) -> str:
    """Download an article image into ``tmp/`` under the next free name.

    Images are named ``image-1.png``, ``image-2.png``, ... The next number
    is one more than the highest number already present in ``tmp/``.

    Args:
        url: Address of the image to download.

    Returns:
        The relative link to put in the Markdown, ``img/image-N.webp``.
    """
    # Only files named image-<number>.<ext> take part in the numbering;
    # anything else in the folder is ignored instead of crashing the run.
    pattern = re.compile(r"image-(\d+)\.")
    indexes = []
    for name in os.listdir("tmp/"):
        match = pattern.match(name)
        if match:
            indexes.append(int(match.group(1)))
    file_name = "image-" + str(max(indexes, default=0) + 1) + ".png"
    download_file(url, os.path.join("tmp/", file_name))
    # NOTE(review): the file is stored as .png but the returned link uses
    # .webp - presumably the images are converted afterwards; confirm.
    return os.path.join("img/", file_name.replace(".png", ".webp"))
class ImageBlockConverter(MarkdownConverter):
    """Markdown converter that localizes images hosted on the Ghost blog."""

    def convert_img(self, el, text, convert_as_inline):
        """Convert an ``<img>`` tag and download it when hosted on the blog.

        The ``__GHOST_URL__`` placeholder of the export is replaced with
        ``GHOST_URL``. When the resulting Markdown points to the blog, the
        image is fetched with ``download_img()`` and the link is rewritten
        to the relative path it returns.

        Returns:
            The Markdown for the image followed by a blank line. Images
            that are not hosted on the blog keep their original link.
        """
        data = (
            super()
            .convert_img(el, text, convert_as_inline)
            .replace("__GHOST_URL__", GHOST_URL)
            + "\n\n"
        )
        if GHOST_URL not in data:
            # External image: keep it untouched. Falling through would
            # return None, which cannot be joined into the output text.
            return data
        match = re.search(r"\((.*?)\)", data)
        if match is None:
            # No "(url)" part to rewrite (e.g. only the alt text was kept).
            return data
        img_url = match.group(1)
        return data.replace(img_url, download_img(img_url))
def md(html, **options):
    """Convert ``html`` to Markdown with ``ImageBlockConverter``.

    Args:
        html: HTML source to convert.
        **options: Options forwarded to the converter's constructor.

    Returns:
        The result of the converter's ``convert()`` call.
    """
    converter = ImageBlockConverter(**options)
    return converter.convert(html)
for i in export["db"][0]["data"]["posts"]:
[...]
for file in os.listdir("tmp/"):
shutil.move(
os.path.join("tmp/", file),
os.path.join("output/", i["slug"], "img", file),
)
[...]
Conclusion #
Thanks to this script, 90% of the work was done, even if I went back over the various articles to correct two or three errors and check that no element was missing. For those interested, here’s the full code. It’s highly customized and requires a few modifications if you want to use it.
Final code #
import json
import os
import shutil
import re
from pathlib import Path
import requests
from markdownify import MarkdownConverter
GHOST_URL = "https://blog.d3vyce.fr"
EXPORT_FILE = "export.json"
OUTPUT_DIR = "output/"
TMP_DIR = "tmp/"
def download_file(url: str, dst: Path) -> None:
    """Download ``url`` and write the response body to ``dst``.

    Args:
        url: Address of the file to fetch.
        dst: Destination of the downloaded file. It is opened in binary
            write mode, so an existing file is overwritten.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeout.
    """
    # requests has no default timeout; set one so a stalled connection
    # cannot hang the whole migration.
    response = requests.get(url, timeout=30)
    # Fail loudly instead of saving an HTML error page as an "image".
    response.raise_for_status()
    with open(dst, "wb") as img:
        img.write(response.content)
def download_img(url: str) -> str:
    """Download an article image into ``TMP_DIR`` under the next free name.

    Images are named ``image-1.png``, ``image-2.png``, ... The next number
    is one more than the highest number already present in ``TMP_DIR``.

    Args:
        url: Address of the image to download.

    Returns:
        The relative link to put in the Markdown, ``img/image-N.webp``.
    """
    # Only files named image-<number>.<ext> take part in the numbering;
    # anything else in the folder is ignored instead of crashing the run.
    pattern = re.compile(r"image-(\d+)\.")
    indexes = []
    for name in os.listdir(TMP_DIR):
        match = pattern.match(name)
        if match:
            indexes.append(int(match.group(1)))
    file_name = "image-" + str(max(indexes, default=0) + 1) + ".png"
    download_file(url, os.path.join(TMP_DIR, file_name))
    # NOTE(review): the file is stored as .png but the returned link uses
    # .webp - presumably the images are converted afterwards; confirm.
    return os.path.join("img/", file_name.replace(".png", ".webp"))
class ImageBlockConverter(MarkdownConverter):
    """Markdown converter that localizes images hosted on the Ghost blog."""

    def convert_img(self, el, text, convert_as_inline):
        """Convert an ``<img>`` tag and download it when hosted on the blog.

        The ``__GHOST_URL__`` placeholder of the export is replaced with
        ``GHOST_URL``. When the resulting Markdown points to the blog, the
        image is fetched with ``download_img()`` and the link is rewritten
        to the relative path it returns.

        Returns:
            The Markdown for the image followed by a blank line. Images
            that are not hosted on the blog keep their original link.
        """
        data = (
            super()
            .convert_img(el, text, convert_as_inline)
            .replace("__GHOST_URL__", GHOST_URL)
            + "\n\n"
        )
        if GHOST_URL not in data:
            # External image: keep it untouched. Falling through would
            # return None, which cannot be joined into the output text.
            return data
        match = re.search(r"\((.*?)\)", data)
        if match is None:
            # No "(url)" part to rewrite (e.g. only the alt text was kept).
            return data
        img_url = match.group(1)
        return data.replace(img_url, download_img(img_url))
def md(html, **options):
    """Convert ``html`` to Markdown with ``ImageBlockConverter``.

    Args:
        html: HTML source to convert.
        **options: Options forwarded to the converter's constructor.

    Returns:
        The result of the converter's ``convert()`` call.
    """
    converter = ImageBlockConverter(**options)
    return converter.convert(html)
def main():
    """Convert the published write-up posts of the Ghost export for Hugo.

    Reads ``EXPORT_FILE`` and, for every post whose slug contains
    ``writeup`` and whose status is ``published``:

    * creates ``OUTPUT_DIR/<slug>/img``;
    * writes ``OUTPUT_DIR/<slug>/index.md`` (front matter followed by the
      post HTML converted to Markdown with ``md()``);
    * moves the files found in ``TMP_DIR`` into the post's ``img`` folder;
    * downloads the feature image, if any, as ``featured.png``.
    """
    # JSON is exchanged as UTF-8; do not depend on the locale's default
    # encoding, which breaks on non-ASCII characters on some platforms.
    with open(EXPORT_FILE, encoding="utf-8") as file:
        export = json.load(file)

    for post in export["db"][0]["data"]["posts"]:
        if "writeup" not in post["slug"] or post["status"] != "published":
            continue

        post_dir = os.path.join(OUTPUT_DIR, post["slug"])
        img_dir = os.path.join(post_dir, "img")

        # The export uses a __GHOST_URL__ placeholder for the blog address.
        feature_link = (
            post["feature_image"].replace("__GHOST_URL__", GHOST_URL)
            if post["feature_image"]
            else None
        )

        # A JSON string is a valid YAML double-quoted scalar, so titles
        # containing quotes or backslashes no longer break the front matter.
        title = json.dumps(post["title"], ensure_ascii=False)
        header = (
            "---\n"
            f"title: {title}\n"
            f"date: {post['published_at'].split('T')[0]}\n"
            f'slug: "{post["slug"]}"\n'
            'type: "writeup-ctf"\n'
            "--- \n\n"
        )

        # Create directories
        os.makedirs(img_dir, exist_ok=True)
        os.makedirs(TMP_DIR, exist_ok=True)

        # Create index.md; md() downloads the article images into TMP_DIR
        # as a side effect of the conversion.
        with open(
            os.path.join(post_dir, "index.md"), "w", encoding="utf-8"
        ) as index:
            index.write(header)
            content = md(
                post["html"],
                heading_style="ATX",
                code_language="bash",
                bullets="-",
            )
            index.write(content)

        # Move the images downloaded during the conversion next to the post.
        for file_name in os.listdir(TMP_DIR):
            shutil.move(
                os.path.join(TMP_DIR, file_name),
                os.path.join(img_dir, file_name),
            )

        # Download featured img
        if feature_link:
            download_file(feature_link, os.path.join(post_dir, "featured.png"))