From 4459d4d31665419863ef14402cee149c34077306 Mon Sep 17 00:00:00 2001 From: Fred Boniface Date: Thu, 11 Apr 2024 20:50:36 +0100 Subject: [PATCH] Update `package corpus` to stream data to the parse function. --- corpus/parse.go | 82 ++++++++++++++++++++++++++---------------------- corpus/update.go | 2 +- nrod/download.go | 74 ------------------------------------------- 3 files changed, 45 insertions(+), 113 deletions(-) delete mode 100644 nrod/download.go diff --git a/corpus/parse.go b/corpus/parse.go index f60d8a6..f52ea9f 100644 --- a/corpus/parse.go +++ b/corpus/parse.go @@ -3,6 +3,7 @@ package corpus import ( "encoding/json" "errors" + "io" "git.fjla.uk/owlboard/go-types/pkg/database" "git.fjla.uk/owlboard/timetable-mgr/log" @@ -10,50 +11,55 @@ import ( ) // Accepts CORPUS data as a byte array and formats it ready for database insertion -func parseCorpusData(jsonData *[]byte) (*[]database.CorpusEntry, error) { +func parseCorpusData(stream io.ReadCloser) (*[]database.CorpusEntry, error) { + defer stream.Close() + log.Msg.Debug("Starting CORPUS Data parsing") - // Initialise data structure - var dataMap map[string]interface{} - - // Create JSON - err := json.Unmarshal(*jsonData, &dataMap) - if err != nil { - log.Msg.Error("Unable to unmarshal CORPUS data", zap.Error(err)) - } - - corpusDataArrayInterface, ok := dataMap["TIPLOCDATA"] - if !ok { - err := errors.New("corpus Data not in expected format") - log.Msg.Error("Error parsing CORPUS Data", zap.Error(err)) - return nil, err - } - - corpusDataArray, ok := corpusDataArrayInterface.([]interface{}) - if !ok { - err := errors.New("corpus data missing the data array") - log.Msg.Error("Error parsing CORPUS Data", zap.Error(err)) - return nil, err - } - var corpusEntries []database.CorpusEntry - for _, item := range corpusDataArray { - jsonItem, err := json.Marshal(item) - if err != nil { - log.Msg.Error("Error parsing CORPUS Data", zap.Error(err)) - return nil, err - } + decoder := json.NewDecoder(stream) - var corpusEntry database.CorpusEntry - err = json.Unmarshal(jsonItem, &corpusEntry) - if err != nil { - log.Msg.Error("Error parsing CORPUS Data", zap.Error(err)) - return nil, err - } - - corpusEntries = append(corpusEntries, corpusEntry) + // Expect an object at the root of the JSON stream + if _, err := decoder.Token(); err != nil { + log.Msg.Error("Error parsing CORPUS Data", zap.Error(err)) + return nil, err } + // Search for the "TIPLOCDATA" key + for decoder.More() { + // Decode the next JSON token + if tok, err := decoder.Token(); err != nil { + log.Msg.Error("Error parsing CORPUS Data", zap.Error(err)) + return nil, err + } else if tok == "TIPLOCDATA" { + // Found the "TIPLOCDATA" key, expect the associated array + if !decoder.More() { + err := errors.New("missing array after TIPLOCDATA key") + log.Msg.Error("Error parsing CORPUS Data", zap.Error(err)) + return nil, err + } + + // Start reading the array associated with the "TIPLOCDATA" key + if _, err := decoder.Token(); err != nil { + log.Msg.Error("Error parsing CORPUS Data", zap.Error(err)) + return nil, err + } + + // Iterate over the JSON array + for decoder.More() { + var corpusEntry database.CorpusEntry + if err := decoder.Decode(&corpusEntry); err != nil { + log.Msg.Error("Error parsing CORPUS Data", zap.Error(err)) + return nil, err + } + corpusEntries = append(corpusEntries, corpusEntry) + } + break // Exit loop after processing "TIPLOCDATA" array + } + } + + log.Msg.Debug("CORPUS parsing complete") + return &corpusEntries, nil } diff --git a/corpus/update.go b/corpus/update.go index 0b2b9eb..92db28f 100644 --- a/corpus/update.go +++ b/corpus/update.go @@ -10,7 +10,7 @@ import ( // Runs all stages of the CORPUS Update process func RunCorpusUpdate(cfg *helpers.Configuration) error { - resp, err := nrod.NrodDownload(url, cfg) + resp, err := nrod.NrodStream(url, cfg) if err != nil { log.Msg.Error("Failed to fetch CORPUS data", zap.Error(err)) return err diff --git a/nrod/download.go b/nrod/download.go deleted file mode 100644 index 4a8d697..0000000 --- a/nrod/download.go +++ /dev/null @@ -1,74 +0,0 @@ -package nrod - -import ( - "compress/gzip" - "fmt" - "io" - "net/http" - "time" - - "git.fjla.uk/owlboard/timetable-mgr/helpers" - "git.fjla.uk/owlboard/timetable-mgr/log" - "go.uber.org/zap" -) - -// Downloads NROD Data over HTTP from the given URL, extracted data is returned -func NrodDownload(url string, cfg *helpers.Configuration) (*[]byte, error) { - log.Msg.Debug("Fetching NROD data", zap.String("Request URL", url)) - client := http.Client{ - Timeout: time.Second * 10, - } - - req, err := http.NewRequest("GET", url, nil) - if err != nil { - log.Msg.Error("Error creating HTTP Request", zap.String("Request URL", url), zap.Error(err)) - return nil, err - } - req.Header.Add("Authorization", "Basic "+helpers.BasicAuth(cfg.NrodUser, cfg.NrodPass)) - - resp, err := client.Do(req) - if err != nil { - log.Msg.Error("Error carrying out HTTP Request", zap.String("Request URL", url), zap.Error(err)) - return nil, err - } - - if resp.StatusCode != http.StatusOK { - err := fmt.Errorf("unexpected status code: %d", resp.StatusCode) - log.Msg.Error("Non-Successful status code from http response", zap.String("Request URL", url), zap.Error(err)) - return nil, err - } - - // Yes, I know `readedData` is not proper English. But readData reads more like a verb action. - readedData, err := nrodExtract(resp) - if err != nil { - log.Msg.Error("Unable to read response data") - return nil, err - } - - return readedData, nil -} - -// Extracts GZIP Data from an HTTP Response and returns the decompresses data as a byte array -func nrodExtract(resp *http.Response) (*[]byte, error) { - log.Msg.Debug("Extracting HTTP Response Data") - gzReader, err := gzip.NewReader(resp.Body) - if err != nil { - log.Msg.Warn("Unable to create GZIP Reader, data probably not GZIPPED") - data, err := io.ReadAll(resp.Body) - if err != nil { - log.Msg.Error("Unable to read response body") - return nil, err - } - return &data, nil - } - - defer gzReader.Close() - - log.Msg.Debug("GZIP Reader Opened") - extractedData, err := io.ReadAll(gzReader) - if err != nil { - log.Msg.Error("Failed to read GZIPped data", zap.Error(err)) - } - - return &extractedData, nil -}