Refactor EML file parsing and header extraction

We can now parse simple mails (multipart is not working yet). The existing implementation was made more efficient by refactoring the EML file parsing and header extraction mechanism. Added 'strings' and 'bytes' packages to facilitate these changes. Previously, headers and body were parsed separately, which was unnecessarily complex and increased the chance of errors. Now, with the new function 'readEML' and the helper function 'parseEMLBodyParts', we are able to parse headers and body together, which not only simplifies the code but also increases its reliability. Specifically, 'bytes.Buffer' now helps us capture the body while parsing, which removes the need for separate handling. Additionally, certain headers like 'charset' and body types are also accounted for in the new implementation, enhancing the completeness of information extracted from EML files.
This commit is contained in:
Winni Neessen 2023-10-13 15:06:28 +02:00
parent 3d50370a4c
commit f60b689b03
Signed by: wneessen
GPG key ID: 5F3AF39B820C119D

60
eml.go
View file

@ -1,11 +1,13 @@
package mail package mail
import ( import (
"bytes"
"errors" "errors"
"fmt" "fmt"
"mime" "mime"
nm "net/mail" nm "net/mail"
"os" "os"
"strings"
) )
// EMLToMsg will open and parse a .eml file at a provided file path and return a // EMLToMsg will open and parse a .eml file at a provided file path and return a
@ -18,44 +20,40 @@ func EMLToMsg(fp string) (*Msg, error) {
mimever: Mime10, mimever: Mime10,
} }
pm, err := readEML(fp) pm, mbbuf, err := readEML(fp)
if err != nil || pm == nil { if err != nil || pm == nil {
return m, fmt.Errorf("failed to parse EML file: %w", err) return m, fmt.Errorf("failed to parse EML file: %w", err)
} }
// Parse the header
if err := parseEMLHeaders(&pm.Header, m); err != nil { if err := parseEMLHeaders(&pm.Header, m); err != nil {
return m, fmt.Errorf("failed to parse EML headers: %w", err) return m, fmt.Errorf("failed to parse EML headers: %w", err)
} }
if err := parseEMLBodyParts(pm, mbbuf, m); err != nil {
// Extract the transfer encoding of the body return m, fmt.Errorf("failed to parse EML body parts: %w", err)
mi, ar, err := mime.ParseMediaType(pm.Header.Get(HeaderContentType.String()))
if err != nil {
return m, fmt.Errorf("failed to extract content type: %w", err)
} }
if v, ok := ar["charset"]; ok {
m.SetCharset(Charset(v))
}
fmt.Printf("Encoding: %s\n", mi)
fmt.Printf("Params: %+v\n", ar)
return m, nil return m, nil
} }
// readEML opens an EML file and uses net/mail to parse the header and body // readEML opens an EML file and uses net/mail to parse the header and body
func readEML(fp string) (*nm.Message, error) { func readEML(fp string) (*nm.Message, *bytes.Buffer, error) {
fh, err := os.Open(fp) fh, err := os.Open(fp)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to open EML file: %w", err) return nil, nil, fmt.Errorf("failed to open EML file: %w", err)
} }
defer func() { defer func() {
_ = fh.Close() _ = fh.Close()
}() }()
pm, err := nm.ReadMessage(fh) pm, err := nm.ReadMessage(fh)
if err != nil { if err != nil {
return pm, fmt.Errorf("failed to parse EML: %w", err) return pm, nil, fmt.Errorf("failed to parse EML: %w", err)
} }
return pm, nil
buf := bytes.Buffer{}
if _, err = buf.ReadFrom(pm.Body); err != nil {
return nil, nil, err
}
return pm, &buf, nil
} }
// parseEMLHeaders will check the EML headers for the most common headers and set the // parseEMLHeaders will check the EML headers for the most common headers and set the
@ -64,8 +62,8 @@ func parseEMLHeaders(mh *nm.Header, m *Msg) error {
commonHeaders := []Header{ commonHeaders := []Header{
HeaderContentType, HeaderImportance, HeaderInReplyTo, HeaderListUnsubscribe, HeaderContentType, HeaderImportance, HeaderInReplyTo, HeaderListUnsubscribe,
HeaderListUnsubscribePost, HeaderMessageID, HeaderMIMEVersion, HeaderOrganization, HeaderListUnsubscribePost, HeaderMessageID, HeaderMIMEVersion, HeaderOrganization,
HeaderPrecedence, HeaderPriority, HeaderSubject, HeaderUserAgent, HeaderXMailer, HeaderPrecedence, HeaderPriority, HeaderReferences, HeaderSubject, HeaderUserAgent,
HeaderXMSMailPriority, HeaderXPriority, HeaderXMailer, HeaderXMSMailPriority, HeaderXPriority,
} }
// Extract address headers // Extract address headers
@ -118,3 +116,29 @@ func parseEMLHeaders(mh *nm.Header, m *Msg) error {
return nil return nil
} }
// parseEMLBodyParts inspects the Content-Type and Content-Transfer-Encoding headers of the
// parsed message pm and applies the derived charset, encoding and body to m. mbbuf is the
// buffer holding the message body.
//
// A message without a Content-Type header is treated as text/plain, which is the default
// media type defined in RFC 2045, section 5.2. Only text/plain bodies are assigned to m so
// far; any other media type (e.g. multipart) is skipped without an error. An error is
// returned only when a Content-Type header is present but cannot be parsed.
func parseEMLBodyParts(pm *nm.Message, mbbuf *bytes.Buffer, m *Msg) error {
	// mime.ParseMediaType rejects an empty value, so fall back to the RFC 2045 default
	// instead of failing on an otherwise valid mail that omits the header.
	ct := pm.Header.Get(HeaderContentType.String())
	if strings.TrimSpace(ct) == "" {
		ct = TypeTextPlain.String()
	}

	// Extract the media type and its parameters (e.g. charset) of the body
	mt, par, err := mime.ParseMediaType(ct)
	if err != nil {
		return fmt.Errorf("failed to extract content type: %w", err)
	}
	if v, ok := par["charset"]; ok {
		m.SetCharset(Charset(v))
	}

	// Extract the transfer encoding of the body. Only NoEncoding is recognized so far;
	// any other value leaves the encoding of m untouched.
	cte := pm.Header.Get(HeaderContentTransferEnc.String())
	if cte != "" && strings.EqualFold(cte, NoEncoding.String()) {
		m.SetEncoding(NoEncoding)
	}

	// Media types are case-insensitive, hence the case-folded comparison
	if strings.EqualFold(mt, TypeTextPlain.String()) {
		m.SetBodyString(TypeTextPlain, mbbuf.String())
	}
	return nil
}