Skip to content

Commit

Permalink
feat: add download markdown support (#38)
Browse files Browse the repository at this point in the history
  • Loading branch information
nicoxiang authored May 31, 2022
1 parent 26d3af5 commit 8c5ef03
Show file tree
Hide file tree
Showing 14 changed files with 514 additions and 201 deletions.
23 changes: 13 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# geektime-downloader

geektime-downloader 支持下载专栏为 PDF 文档和下载视频课。
geektime-downloader 支持下载专栏为 PDF/Markdown 文档和下载视频课。

[![go report card](https://goreportcard.com/badge/github.com/nicoxiang/geektime-downloader "go report card")](https://goreportcard.com/report/github.com/nicoxiang/geektime-downloader)
[![MIT license](https://img.shields.io/badge/license-MIT-brightgreen.svg)](https://opensource.org/licenses/MIT)
Expand Down Expand Up @@ -49,13 +49,14 @@ Usage:
geektime-downloader [flags]

Flags:
--comments 是否需要专栏的第一页评论 (default true)
-f, --folder string 专栏和视频课的下载目标位置 (default "")
--gcess string 极客时间 cookie 值 gcess
--gcid string 极客时间 cookie 值 gcid
-h, --help help for geektime-downloader
-u, --phone string 你的极客时间账号(手机号)
-q, --quality string 下载视频清晰度(ld标清,sd高清,hd超清) (default "sd")
--columnOutputType int8 下载专栏的输出格式(1pdf,2markdown,3all) (default 1)
--comments 是否需要专栏的第一页评论 (default true)
-f, --folder string 专栏和视频课的下载目标位置 (default "")
--gcess string 极客时间 cookie 值 gcess
--gcid string 极客时间 cookie 值 gcid
-h, --help help for geektime-downloader
-u, --phone string 你的极客时间账号(手机号)
-q, --quality string 下载视频清晰度(ld标清,sd高清,hd超清) (default "sd")
```

## Note
Expand All @@ -64,9 +65,11 @@ Flags:

2. Ctrl + C 退出程序

3. 如果选择下载所有后中断程序,可重新进入程序继续下载
3. 默认情况下,专栏的输出格式只有 PDF,可以通过 --columnOutputType 参数按需选择是否同时下载 Markdown 格式。Markdown 格式虽然在显示效果上不及 PDF,但优势在于可以显示完整的代码块(PDF 中的代码块在水平方向太长时会有缺失),并保留了原文中的超链接。

4. 通过密码登录的情况下,为了避免多次登录账户,会在目录 [UserConfigDir](https://pkg.go.dev/os#UserConfigDir)/geektime-downloader 下存放用户的登录 cookie,如果不是在自己的电脑上执行,请在使用完毕程序后手动删除
4. 如果选择下载所有后中断程序,可重新进入程序继续下载

5. 通过密码登录的情况下,为了避免多次登录账户,会在目录 [UserConfigDir](https://pkg.go.dev/os#UserConfigDir)/geektime-downloader 下存放用户的登录 cookie,如果不是在自己的电脑上执行,请在使用完毕程序后手动删除

## Inspired by

Expand Down
182 changes: 120 additions & 62 deletions cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"errors"
"fmt"
"io/ioutil"
"math"
"math/rand"
"net/http"
Expand All @@ -18,20 +19,16 @@ import (
"github.com/briandowns/spinner"
"github.com/chromedp/chromedp"
"github.com/manifoldco/promptui"
"github.com/nicoxiang/geektime-downloader/internal/config"
"github.com/nicoxiang/geektime-downloader/internal/geektime"
"github.com/nicoxiang/geektime-downloader/internal/markdown"
"github.com/nicoxiang/geektime-downloader/internal/pdf"
"github.com/nicoxiang/geektime-downloader/internal/pkg/file"
"github.com/nicoxiang/geektime-downloader/internal/pkg/filenamify"
pgt "github.com/nicoxiang/geektime-downloader/internal/pkg/geektime"
"github.com/nicoxiang/geektime-downloader/internal/video"
"github.com/spf13/cobra"
)

// File extension
const (
PDFExtension = ".pdf"
TSExtension = ".ts"
)

var (
phone string
gcid string
Expand All @@ -43,21 +40,21 @@ var (
currentProductIndex int
quality string
downloadComments bool
columnOutputType int8
)

func init() {
rand.Seed(time.Now().UnixNano())

userHomeDir, _ := os.UserHomeDir()
concurrency = int(math.Ceil(float64(runtime.NumCPU()) / 2.0))
defaultDownloadFolder := filepath.Join(userHomeDir, file.GeektimeDownloaderFolder)
defaultDownloadFolder := filepath.Join(userHomeDir, config.GeektimeDownloaderFolder)

rootCmd.Flags().StringVarP(&phone, "phone", "u", "", "你的极客时间账号(手机号)")
rootCmd.Flags().StringVar(&gcid, "gcid", "", "极客时间 cookie 值 gcid")
rootCmd.Flags().StringVar(&gcess, "gcess", "", "极客时间 cookie 值 gcess")
rootCmd.Flags().StringVarP(&downloadFolder, "folder", "f", defaultDownloadFolder, "专栏和视频课的下载目标位置")
rootCmd.Flags().StringVarP(&quality, "quality", "q", "sd", "下载视频清晰度(ld标清,sd高清,hd超清)")
rootCmd.Flags().BoolVar(&downloadComments, "comments", true, "是否需要专栏的第一页评论")
rootCmd.Flags().Int8Var(&columnOutputType, "columnOutputType", 1, "下载专栏的输出格式(1pdf,2markdown,3all)")

sp = spinner.New(spinner.CharSets[4], 100*time.Millisecond)
}
Expand All @@ -69,9 +66,12 @@ var rootCmd = &cobra.Command{
if quality != "ld" && quality != "sd" && quality != "hd" {
exitWithMsg("argument 'quality' is not valid")
}
if columnOutputType <= 0 || columnOutputType >= 4 {
exitWithMsg("argument 'columnOutputType' is not valid")
}
var readCookies []*http.Cookie
if phone != "" {
rc, err := file.ReadCookieFromConfigFile(phone)
rc, err := config.ReadCookieFromConfigFile(phone)
if err != nil {
exitWithError(err)
}
Expand Down Expand Up @@ -101,7 +101,7 @@ var rootCmd = &cobra.Command{
sp.Stop()
checkGeekTimeError(err)
}
if err := file.WriteCookieToConfigFile(phone, readCookies); err != nil {
if err := config.WriteCookieToConfigFile(phone, readCookies); err != nil {
exitWithError(err)
}
sp.Stop()
Expand Down Expand Up @@ -208,7 +208,7 @@ func handleSelectArticle(ctx context.Context, articles []geektime.Article, index
}
a := articles[index-1]

projectDir, err := file.MkDownloadProjectFolder(downloadFolder, phone, gcid, products[currentProductIndex].Title)
projectDir, err := mkDownloadProjectDir(downloadFolder, phone, gcid, products[currentProductIndex].Title)
if err != nil {
exitWithError(err)
}
Expand All @@ -222,52 +222,83 @@ func handleDownloadAll(ctx context.Context) {
cTitle := products[currentProductIndex].Title
articles := loadArticles()

folder, err := file.MkDownloadProjectFolder(downloadFolder, phone, gcid, cTitle)
projectDir, err := mkDownloadProjectDir(downloadFolder, phone, gcid, cTitle)
if err != nil {
exitWithError(err)
}
downloaded, err := file.FindDownloadedArticleFileNames(folder)
downloaded, err := findDownloadedArticleFileNames(projectDir)
if err != nil {
exitWithError(err)
}
if isColumn() {
rand.Seed(time.Now().UnixNano())
fmt.Printf("正在下载专栏 《%s》 中的所有文章\n", cTitle)
total := len(articles)
var i int

chromedpCtx, cancel := chromedp.NewContext(ctx)
// start the browser
err := chromedp.Run(chromedpCtx)
if err != nil {
exitWithError(err)
var chromedpCtx context.Context
var cancel context.CancelFunc

if columnOutputType == 3 || columnOutputType == 1 {
chromedpCtx, cancel = chromedp.NewContext(ctx)
// start the browser
err := chromedp.Run(chromedpCtx)
if err != nil {
exitWithError(err)
}
defer cancel()
}
defer cancel()

for _, a := range articles {
fileName := getDownloadFileName(a)
if _, ok := downloaded[fileName]; ok {
fileName := filenamify.Filenamify(a.Title)
var b int8
_, pdfExists := downloaded[fileName+pdf.PDFExtension]
if pdfExists {
b = 1
}
_, mdExists := downloaded[fileName+markdown.MDExtension]
if mdExists {
b |= (1 << 1)
}

if b == columnOutputType {
increasePDFCount(total, &i)
continue
}
fileFullPath := filepath.Join(folder, fileName)
if err := pdf.PrintArticlePageToPDF(chromedpCtx, a.AID, fileFullPath, geektime.SiteCookies, downloadComments); err != nil {
// ensure chrome killed before os exit
cancel()

if (columnOutputType&1 == 1) && !pdfExists {
if err := pdf.PrintArticlePageToPDF(chromedpCtx,
a.AID,
projectDir,
a.Title,
geektime.SiteCookies,
downloadComments,
); err != nil {
// ensure chrome killed before os exit
cancel()
checkGeekTimeError(err)
}
}
if ((columnOutputType>>1)&1 == 1) && !mdExists {
html, err := geektime.GetColumnContent(a.AID)
checkGeekTimeError(err)
err = markdown.Download(ctx, html, a.Title, projectDir, a.AID, concurrency)
checkGeekTimeError(err)
}

increasePDFCount(total, &i)
r := rand.Intn(2000)
time.Sleep(time.Duration(r) * time.Millisecond)
}
} else if isVideo() {
for _, a := range articles {
fileName := getDownloadFileName(a)
fileName := filenamify.Filenamify(a.Title) + video.TSExtension
if _, ok := downloaded[fileName]; ok {
continue
}
videoInfo, err := geektime.GetVideoInfo(a.AID, quality)
checkGeekTimeError(err)
err = video.DownloadVideo(ctx, videoInfo.M3U8URL, fileName, folder, int64(videoInfo.Size), concurrency)
err = video.DownloadVideo(ctx, videoInfo.M3U8URL, a.Title, projectDir, int64(videoInfo.Size), concurrency)
checkGeekTimeError(err)
}
}
Expand Down Expand Up @@ -313,49 +344,48 @@ func loadArticles() []geektime.Article {
}

func downloadArticle(ctx context.Context, article geektime.Article, projectDir string) {
fileName := getDownloadFileName(article)
fileFullPath := filepath.Join(projectDir, fileName)

if isColumn() {
sp.Prefix = fmt.Sprintf("[ 正在下载 《%s》... ]", article.Title)
sp.Start()
chromedpCtx, cancel := chromedp.NewContext(ctx)
// start the browser
err := chromedp.Run(chromedpCtx)
if err != nil {
exitWithError(err)

if columnOutputType&1 == 1 {
chromedpCtx, cancel := chromedp.NewContext(ctx)
// start the browser
err := chromedp.Run(chromedpCtx)
if err != nil {
exitWithError(err)
}
defer cancel()
err = pdf.PrintArticlePageToPDF(chromedpCtx,
article.AID,
projectDir,
article.Title,
geektime.SiteCookies,
downloadComments,
)
if err != nil {
sp.Stop()
// ensure chrome killed before os exit
cancel()
checkGeekTimeError(err)
}
}
defer cancel()
err = pdf.PrintArticlePageToPDF(chromedpCtx,
article.AID,
fileFullPath,
geektime.SiteCookies,
downloadComments,
)
sp.Stop()
if err != nil {
// ensure chrome killed before os exit
cancel()

if (columnOutputType>>1)&1 == 1 {
html, err := geektime.GetColumnContent(article.AID)
checkGeekTimeError(err)
err = markdown.Download(ctx, html, article.Title, projectDir, article.AID, concurrency)
checkGeekTimeError(err)
}
sp.Stop()
} else if isVideo() {
videoInfo, err := geektime.GetVideoInfo(article.AID, quality)
checkGeekTimeError(err)
err = video.DownloadVideo(ctx, videoInfo.M3U8URL, fileName, projectDir, int64(videoInfo.Size), concurrency)
err = video.DownloadVideo(ctx, videoInfo.M3U8URL, article.Title, projectDir, int64(videoInfo.Size), concurrency)
checkGeekTimeError(err)
}
}

// getDownloadFileName builds the file name used when saving article: the
// article title passed through file.Filenamify, followed by PDFExtension when
// isColumn() reports true, or TSExtension when isVideo() reports true. If
// neither holds, no extension is appended.
func getDownloadFileName(article geektime.Article) string {
	suffix := ""
	switch {
	case isColumn():
		suffix = PDFExtension
	case isVideo():
		suffix = TSExtension
	}
	return file.Filenamify(article.Title) + suffix
}

// isColumn reports whether the currently selected product, i.e. the element
// of products at currentProductIndex, has the type "c1".
func isColumn() bool {
	current := products[currentProductIndex]
	return current.Type == "c1"
}
Expand Down Expand Up @@ -384,6 +414,34 @@ func readCookiesFromInput() []*http.Cookie {
return cookies
}

// findDownloadedArticleFileNames reads projectDir and returns the names of
// its entries as a set, so that callers can check by file name whether an
// article has already been downloaded.
//
// If the directory cannot be read, the error is returned together with an
// empty (non-nil) set.
func findDownloadedArticleFileNames(projectDir string) (map[string]struct{}, error) {
	entries, err := ioutil.ReadDir(projectDir)
	if err != nil {
		return map[string]struct{}{}, err
	}

	// An empty directory simply yields an empty set; no special case needed.
	names := make(map[string]struct{}, len(entries))
	for i := range entries {
		names[entries[i].Name()] = struct{}{}
	}
	return names, nil
}

// mkDownloadProjectDir ensures the download directory for a product exists
// and returns its path.
//
// The path is downloadFolder/<user>/<name>, where <user> is gcid when gcid is
// non-empty and phone otherwise, and <name> is projectName passed through
// filenamify.Filenamify. Missing parent directories are created as well. If
// the directory cannot be created, an empty path and the error are returned.
func mkDownloadProjectDir(downloadFolder, phone, gcid, projectName string) (string, error) {
	user := gcid
	if user == "" {
		user = phone
	}

	projectDir := filepath.Join(downloadFolder, user, filenamify.Filenamify(projectName))
	if err := os.MkdirAll(projectDir, os.ModePerm); err != nil {
		return "", err
	}
	return projectDir, nil
}

func checkGeekTimeError(err error) {
if err != nil {
if errors.Is(err, context.Canceled) {
Expand All @@ -400,7 +458,7 @@ func checkGeekTimeError(err error) {
}

fmt.Fprintln(os.Stderr, err.Error())
if err := file.RemoveConfig(phone); err != nil {
if err := config.RemoveConfig(phone); err != nil {
fmt.Fprintln(os.Stderr, err.Error())
}
os.Exit(1)
Expand All @@ -409,7 +467,7 @@ func checkGeekTimeError(err error) {
} else if _, ok := err.(*geektime.ErrGeekTimeAPIBadCode); ok {
exitWithMsg(err.Error())
} else {
// Client error, others
// others
exitWithError(err)
}
}
Expand All @@ -425,7 +483,7 @@ func checkPromptError(err error) {
}

func exitWhenClientTimeout() {
exitWithMsg("Request Timeout")
exitWithMsg("\n请求超时")
}

// Unexpected error
Expand Down
3 changes: 3 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@ require (
)

require (
github.com/JohannesKaufmann/html-to-markdown v1.3.4 // indirect
github.com/PuerkitoBio/goquery v1.5.1 // indirect
github.com/VividCortex/ewma v1.1.1 // indirect
github.com/andybalholm/cascadia v1.1.0 // indirect
github.com/chromedp/sysutil v1.0.0 // indirect
github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e // indirect
github.com/fatih/color v1.13.0 // indirect
Expand Down
Loading

0 comments on commit 8c5ef03

Please sign in to comment.